<!DOCTYPE HTML>
<html lang="en" class="light" dir="ltr">
<head>
<!-- Book generated using mdBook -->
<meta charset="UTF-8">
<title>Mozilla Data Documentation</title>
<meta name="robots" content="noindex">
<!-- Custom HTML head -->
<meta name="description" content="">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#ffffff">
<link rel="icon" href="favicon.svg">
<link rel="shortcut icon" href="favicon.png">
<link rel="stylesheet" href="css/variables.css">
<link rel="stylesheet" href="css/general.css">
<link rel="stylesheet" href="css/chrome.css">
<link rel="stylesheet" href="css/print.css" media="print">
<!-- Fonts -->
<link rel="stylesheet" href="FontAwesome/css/font-awesome.css">
<link rel="stylesheet" href="fonts/fonts.css">
<!-- Highlight.js Stylesheets -->
<link rel="stylesheet" href="highlight.css">
<link rel="stylesheet" href="tomorrow-night.css">
<link rel="stylesheet" href="ayu-highlight.css">
<!-- Custom theme stylesheets -->
<link rel="stylesheet" href="dtmo.css">
<link rel="stylesheet" href="mermaid.css">
<!-- MathJax -->
<script async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
</head>
<body class="sidebar-visible no-js">
<div id="body-container">
<!-- Provide site root to javascript -->
<script>
var path_to_root = "";
var default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches ? "navy" : "light";
</script>
<!-- Work around some values being stored in localStorage wrapped in quotes -->
<script>
try {
var theme = localStorage.getItem('mdbook-theme');
var sidebar = localStorage.getItem('mdbook-sidebar');
if (theme.startsWith('"') && theme.endsWith('"')) {
localStorage.setItem('mdbook-theme', theme.slice(1, theme.length - 1));
}
if (sidebar.startsWith('"') && sidebar.endsWith('"')) {
localStorage.setItem('mdbook-sidebar', sidebar.slice(1, sidebar.length - 1));
}
} catch (e) { }
</script>
<!-- Set the theme before any content is loaded, prevents flash -->
<script>
var theme;
try { theme = localStorage.getItem('mdbook-theme'); } catch(e) { }
if (theme === null || theme === undefined) { theme = default_theme; }
var html = document.querySelector('html');
html.classList.remove('light')
html.classList.add(theme);
var body = document.querySelector('body');
body.classList.remove('no-js')
body.classList.add('js');
</script>
<input type="checkbox" id="sidebar-toggle-anchor" class="hidden">
<!-- Hide / unhide sidebar before it is displayed -->
<script>
var body = document.querySelector('body');
var sidebar = null;
var sidebar_toggle = document.getElementById("sidebar-toggle-anchor");
if (document.body.clientWidth >= 1080) {
try { sidebar = localStorage.getItem('mdbook-sidebar'); } catch(e) { }
sidebar = sidebar || 'visible';
} else {
sidebar = 'hidden';
}
sidebar_toggle.checked = sidebar === 'visible';
body.classList.remove('sidebar-visible');
body.classList.add("sidebar-" + sidebar);
</script>
<nav id="sidebar" class="sidebar" aria-label="Table of contents">
<div class="sidebar-scrollbox">
<ol class="chapter"><li class="chapter-item expanded affix "><a href="index.html">Mozilla Data Documentation</a></li><li class="chapter-item expanded "><a href="introduction/index.html"><strong aria-hidden="true">1.</strong> Introduction</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="introduction/what_data.html"><strong aria-hidden="true">1.1.</strong> What Data does Mozilla Collect?</a></li><li class="chapter-item expanded "><a href="introduction/tools.html"><strong aria-hidden="true">1.2.</strong> Tools for Data Analysis</a></li><li class="chapter-item expanded "><a href="concepts/terminology.html"><strong aria-hidden="true">1.3.</strong> Terminology</a></li></ol></li><li class="chapter-item expanded "><a href="cookbooks/index.html"><strong aria-hidden="true">2.</strong> Tutorials & Cookbooks</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="cookbooks/getting_started/index.html"><strong aria-hidden="true">2.1.</strong> Getting Started</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="concepts/gaining_access.html"><strong aria-hidden="true">2.1.1.</strong> Gaining Access</a></li><li class="chapter-item expanded "><a href="concepts/getting_help.html"><strong aria-hidden="true">2.1.2.</strong> Getting Help</a></li><li class="chapter-item expanded "><a href="concepts/reporting_a_problem.html"><strong aria-hidden="true">2.1.3.</strong> Reporting a problem</a></li></ol></li><li class="chapter-item expanded "><a href="cookbooks/analysis/index.html"><strong aria-hidden="true">2.2.</strong> Analysis</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="cookbooks/analysis/data_discovery_tools.html"><strong aria-hidden="true">2.2.1.</strong> Data Discovery Tools</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="cookbooks/analysis/data_catalog.html"><strong aria-hidden="true">2.2.1.1.</strong> Using the Data Catalog</a></li><li class="chapter-item expanded "><a href="cookbooks/analysis/glean_dictionary.html"><strong aria-hidden="true">2.2.1.2.</strong> Using the Glean Dictionary</a></li><li class="chapter-item expanded "><a href="cookbooks/analysis/probe_dictionary.html"><strong aria-hidden="true">2.2.1.3.</strong> Using the Probe Dictionary</a></li></ol></li><li class="chapter-item expanded "><a href="cookbooks/data_monitoring/intro.html"><strong aria-hidden="true">2.2.2.</strong> Data Monitoring - Intro to Bigeye</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="cookbooks/data_monitoring/interface.html"><strong aria-hidden="true">2.2.2.1.</strong> Interface</a></li><li class="chapter-item expanded "><a href="cookbooks/data_monitoring/deploying_metrics.html"><strong aria-hidden="true">2.2.2.2.</strong> Deploying Metrics</a></li><li class="chapter-item expanded "><a href="cookbooks/data_monitoring/collections.html"><strong aria-hidden="true">2.2.2.3.</strong> Collections</a></li><li class="chapter-item expanded "><a href="cookbooks/data_monitoring/issues_management.html"><strong aria-hidden="true">2.2.2.4.</strong> Issues Management</a></li><li class="chapter-item expanded "><a href="cookbooks/data_monitoring/cost_considerations.html"><strong aria-hidden="true">2.2.2.5.</strong> Cost Considerations</a></li><li class="chapter-item expanded "><a href="cookbooks/data_monitoring/further_reading.html"><strong aria-hidden="true">2.2.2.6.</strong> Further Reading</a></li></ol></li><li class="chapter-item expanded "><a 
href="cookbooks/data_modeling/index.html"><strong aria-hidden="true">2.2.3.</strong> Data Modeling</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="cookbooks/data_modeling/where_to_store.html"><strong aria-hidden="true">2.2.3.1.</strong> Where to store the data model assets</a></li><li class="chapter-item expanded "><a href="cookbooks/data_modeling/using_aggregates.html"><strong aria-hidden="true">2.2.3.2.</strong> Using aggregates in BigQuery and Looker</a></li></ol></li><li class="chapter-item expanded "><a href="cookbooks/looker/index.html"><strong aria-hidden="true">2.2.4.</strong> Working with Looker</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="cookbooks/looker/intro.html"><strong aria-hidden="true">2.2.4.1.</strong> Introduction to Looker</a></li><li class="chapter-item expanded "><a href="cookbooks/looker/countries.html"><strong aria-hidden="true">2.2.4.2.</strong> Normalizing Country Data</a></li><li class="chapter-item expanded "><a href="cookbooks/looker/browser_versions.html"><strong aria-hidden="true">2.2.4.3.</strong> Normalizing Browser Version Data</a></li><li class="chapter-item expanded "><a href="cookbooks/looker/growth_usage_dashboards.html"><strong aria-hidden="true">2.2.4.4.</strong> Using Growth and Usage Dashboards</a></li><li class="chapter-item expanded "><a href="cookbooks/looker/event_counts_explore.html"><strong aria-hidden="true">2.2.4.5.</strong> Using the Event Counts Explore</a></li><li class="chapter-item expanded "><a href="cookbooks/looker/funnel_analysis_explore.html"><strong aria-hidden="true">2.2.4.6.</strong> Using the Funnel Analysis Explore</a></li><li class="chapter-item expanded "><a href="cookbooks/looker/performance_caching.html"><strong aria-hidden="true">2.2.4.7.</strong> Looker Performance - Caching</a></li></ol></li><li class="chapter-item expanded "><a href="cookbooks/analysis/tools.html"><strong aria-hidden="true">2.2.5.</strong> Other Data Analysis Tools</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="cookbooks/glam.html"><strong aria-hidden="true">2.2.5.1.</strong> Introduction to GLAM</a></li><li class="chapter-item expanded "><a href="cookbooks/operational_monitoring.html"><strong aria-hidden="true">2.2.5.2.</strong> Introduction to Operational Monitoring</a></li><li class="chapter-item expanded "><a href="tools/stmo.html"><strong aria-hidden="true">2.2.5.3.</strong> Introduction to STMO</a></li></ol></li><li class="chapter-item expanded "><a href="cookbooks/public_data.html"><strong aria-hidden="true">2.2.6.</strong> Accessing Public Data</a></li><li class="chapter-item expanded "><a href="cookbooks/bigquery.html"><strong aria-hidden="true">2.2.7.</strong> Accessing and working with BigQuery</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="cookbooks/bigquery/access.html"><strong aria-hidden="true">2.2.7.1.</strong> Access</a></li><li class="chapter-item expanded "><a href="cookbooks/bigquery/querying.html"><strong aria-hidden="true">2.2.7.2.</strong> Writing Queries</a></li><li class="chapter-item expanded "><a href="cookbooks/bigquery/optimization.html"><strong aria-hidden="true">2.2.7.3.</strong> Optimization</a></li><li class="chapter-item expanded "><a href="cookbooks/bigquery/accessing_desktop_data.html"><strong aria-hidden="true">2.2.7.4.</strong> Accessing Desktop Data</a></li><li class="chapter-item expanded "><a href="cookbooks/accessing_glean_data.html"><strong aria-hidden="true">2.2.7.5.</strong> Accessing Glean 
Data</a></li><li class="chapter-item expanded "><a href="cookbooks/additional_props.html"><strong aria-hidden="true">2.2.7.6.</strong> Accessing Additional Properties</a></li><li class="chapter-item expanded "><a href="tools/spark.html"><strong aria-hidden="true">2.2.7.7.</strong> Custom analysis with Spark</a></li></ol></li><li class="chapter-item expanded "><a href="cookbooks/dataset_specific.html"><strong aria-hidden="true">2.2.8.</strong> Dataset-Specific</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="cookbooks/normandy_events.html"><strong aria-hidden="true">2.2.8.1.</strong> Working with Normandy events</a></li><li class="chapter-item expanded "><a href="cookbooks/crash_pings.html"><strong aria-hidden="true">2.2.8.2.</strong> Working with Crash Pings</a></li><li class="chapter-item expanded "><a href="cookbooks/clients_last_seen_bits.html"><strong aria-hidden="true">2.2.8.3.</strong> Working with Bit Patterns in Clients Last Seen</a></li><li class="chapter-item expanded "><a href="cookbooks/main_ping_exponential_histograms.html"><strong aria-hidden="true">2.2.8.4.</strong> Visualizing Percentiles of a Main Ping Exponential Histogram</a></li></ol></li><li class="chapter-item expanded "><a href="cookbooks/realtime.html"><strong aria-hidden="true">2.2.9.</strong> Real-time</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="cookbooks/live_data.html"><strong aria-hidden="true">2.2.9.1.</strong> Working with Live Data</a></li><li class="chapter-item expanded "><a href="cookbooks/view_pings_cep.html"><strong aria-hidden="true">2.2.9.2.</strong> Seeing Your Own Pings</a></li><li class="chapter-item expanded "><a href="cookbooks/real_time_search.html"><strong aria-hidden="true">2.2.9.3.</strong> See Real-time search metrics</a></li></ol></li><li class="chapter-item expanded "><a href="cookbooks/metrics.html"><strong aria-hidden="true">2.2.10.</strong> Metrics</a></li></ol></li><li class="chapter-item expanded "><a href="cookbooks/operational/index.html"><strong aria-hidden="true">2.3.</strong> Operational</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="cookbooks/gcp-projects.html"><strong aria-hidden="true">2.3.1.</strong> Creating a Prototype Data Project on Google Cloud Platform</a></li><li class="chapter-item expanded "><a href="cookbooks/operational/protosaur.html"><strong aria-hidden="true">2.3.2.</strong> Creating Static Dashboards with Protosaur</a></li><li class="chapter-item expanded "><a href="cookbooks/scheduling_queries.html"><strong aria-hidden="true">2.3.3.</strong> Scheduling Queries</a></li><li class="chapter-item expanded "><a href="cookbooks/deploying-containers.html"><strong aria-hidden="true">2.3.4.</strong> Building and Deploying Containers to GCR with CircleCI</a></li><li class="chapter-item expanded "><a href="cookbooks/publishing_datasets.html"><strong aria-hidden="true">2.3.5.</strong> Publishing Datasets</a></li><li class="chapter-item expanded "><a href="cookbooks/operational/connecting_external_data_bigquery.html"><strong aria-hidden="true">2.3.6.</strong> Connecting Sheets and External Data to BigQuery</a></li></ol></li><li class="chapter-item expanded "><a href="datasets/new_data.html"><strong aria-hidden="true">2.4.</strong> Sending telemetry</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="cookbooks/client_guidelines.html"><strong aria-hidden="true">2.4.1.</strong> Implementing Experiments</a></li><li class="chapter-item expanded "><a 
href="cookbooks/events_best_practices.html"><strong aria-hidden="true">2.4.2.</strong> Sending Events</a></li><li class="chapter-item expanded "><a href="cookbooks/new_ping.html"><strong aria-hidden="true">2.4.3.</strong> Sending a Custom Ping</a></li></ol></li></ol></li><li class="chapter-item expanded "><a href="reference/index.html"><strong aria-hidden="true">3.</strong> Data Platform Reference</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="tools/data_stack_overview.html"><strong aria-hidden="true">3.1.</strong> Data Stack Overview</a></li><li class="chapter-item expanded "><a href="tools/guiding_principles.html"><strong aria-hidden="true">3.2.</strong> Guiding Principles for Data Infrastructure</a></li><li class="chapter-item expanded "><a href="concepts/glean/glean.html"><strong aria-hidden="true">3.3.</strong> Glean overview</a></li><li class="chapter-item expanded "><a href="concepts/pipeline/gcp_data_pipeline.html"><strong aria-hidden="true">3.4.</strong> Overview of Mozilla's Data Pipeline</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="concepts/pipeline/http_edge_spec.html"><strong aria-hidden="true">3.4.1.</strong> HTTP Edge Server Specification</a></li><li class="chapter-item expanded "><a href="concepts/pipeline/event_pipeline.html"><strong aria-hidden="true">3.4.2.</strong> Event Pipeline Detail</a></li><li class="chapter-item expanded "><a href="concepts/pipeline/schemas.html"><strong aria-hidden="true">3.4.3.</strong> Schemas</a></li><li class="chapter-item expanded "><a href="concepts/pipeline/glean_data.html"><strong aria-hidden="true">3.4.4.</strong> Glean Data</a></li><li class="chapter-item expanded "><a href="concepts/channels/channel_normalization.html"><strong aria-hidden="true">3.4.5.</strong> Channel Normalization</a></li><li class="chapter-item expanded "><a href="concepts/sample_id.html"><strong aria-hidden="true">3.4.6.</strong> Sampling</a></li><li class="chapter-item expanded "><a href="concepts/pipeline/filtering.html"><strong aria-hidden="true">3.4.7.</strong> Filtering</a></li><li class="chapter-item expanded "><a href="concepts/pipeline/artifact_deployment.html"><strong aria-hidden="true">3.4.8.</strong> BigQuery Artifact Deployment</a></li></ol></li><li class="chapter-item expanded "><a href="concepts/analysis_gotchas.html"><strong aria-hidden="true">3.5.</strong> Common Analysis Gotchas</a></li><li class="chapter-item expanded "><a href="concepts/sql_style.html"><strong aria-hidden="true">3.6.</strong> SQL Style Guide</a></li><li class="chapter-item expanded "><a href="concepts/airflow_gotchas.html"><strong aria-hidden="true">3.7.</strong> Airflow Gotcha's</a></li><li class="chapter-item expanded "><a href="concepts/index.html"><strong aria-hidden="true">3.8.</strong> Telemetry Behavior Reference</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="concepts/history.html"><strong aria-hidden="true">3.8.1.</strong> History of Telemetry</a></li><li class="chapter-item expanded "><a href="concepts/profile/index.html"><strong aria-hidden="true">3.8.2.</strong> Profile Behavior</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="concepts/profile/profile_creation.html"><strong aria-hidden="true">3.8.2.1.</strong> Profile Creation</a></li><li class="chapter-item expanded "><a href="concepts/profile/realworldusage.html"><strong aria-hidden="true">3.8.2.2.</strong> Real World Usage</a></li><li class="chapter-item expanded "><a 
href="concepts/profile/profilehistory.html"><strong aria-hidden="true">3.8.2.3.</strong> Profile History</a></li></ol></li><li class="chapter-item expanded "><a href="concepts/engagement.html"><strong aria-hidden="true">3.8.3.</strong> Engagement metrics</a></li><li class="chapter-item expanded "><a href="concepts/segments.html"><strong aria-hidden="true">3.8.4.</strong> User states/Segments</a></li></ol></li><li class="chapter-item expanded "><a href="concepts/experiments.html"><strong aria-hidden="true">3.9.</strong> Experimentation</a></li><li class="chapter-item expanded "><a href="concepts/metric_hub.html"><strong aria-hidden="true">3.10.</strong> Metric Hub</a></li><li class="chapter-item expanded "><a href="concepts/external_data_integration_using_fivetran.html"><strong aria-hidden="true">3.11.</strong> External data integration using Fivetran</a></li><li class="chapter-item expanded "><a href="tools/projects.html"><strong aria-hidden="true">3.12.</strong> Project Glossary</a></li></ol></li><li class="chapter-item expanded "><a href="datasets/reference.html"><strong aria-hidden="true">4.</strong> Dataset Reference</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="datasets/pings.html"><strong aria-hidden="true">4.1.</strong> Pings</a></li><li class="chapter-item expanded "><a href="datasets/derived.html"><strong aria-hidden="true">4.2.</strong> Derived Datasets</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="datasets/active_profiles.html"><strong aria-hidden="true">4.2.1.</strong> Active Profiles</a></li><li class="chapter-item expanded "><a href="datasets/bigquery/active_users_aggregates/reference.html"><strong aria-hidden="true">4.2.2.</strong> Active Users</a></li><li class="chapter-item expanded "><a href="datasets/batch_view/addons/reference.html"><strong aria-hidden="true">4.2.3.</strong> Addons</a></li><li class="chapter-item expanded "><a href="datasets/other/addons_daily/reference.html"><strong aria-hidden="true">4.2.4.</strong> Addons Daily</a></li><li class="chapter-item expanded "><a href="datasets/other/asn_aggregates/reference.html"><strong aria-hidden="true">4.2.5.</strong> Autonomous System Aggregates</a></li><li class="chapter-item expanded "><a href="datasets/batch_view/clients_daily/reference.html"><strong aria-hidden="true">4.2.6.</strong> Clients Daily</a></li><li class="chapter-item expanded "><a href="datasets/bigquery/clients_last_seen/reference.html"><strong aria-hidden="true">4.2.7.</strong> Clients Last Seen</a></li><li class="chapter-item expanded "><a href="datasets/batch_view/events/reference.html"><strong aria-hidden="true">4.2.8.</strong> Events</a></li><li class="chapter-item expanded "><a href="datasets/bigquery/events_daily/reference.html"><strong aria-hidden="true">4.2.9.</strong> Events Daily</a></li><li class="chapter-item expanded "><a href="datasets/bigquery/firefox_android_clients/reference.html"><strong aria-hidden="true">4.2.10.</strong> Firefox Android Clients</a></li><li class="chapter-item expanded "><a href="datasets/main_ping_tables.html"><strong aria-hidden="true">4.2.11.</strong> Main Ping Tables</a></li><li class="chapter-item expanded "><a href="datasets/batch_view/main_summary/reference.html"><strong aria-hidden="true">4.2.12.</strong> Main Summary</a></li><li class="chapter-item expanded "><a href="datasets/other/socorro_crash/reference.html"><strong aria-hidden="true">4.2.13.</strong> Socorro Crash Reports</a></li><li class="chapter-item expanded "><a 
href="datasets/other/ssl/reference.html"><strong aria-hidden="true">4.2.14.</strong> SSL Ratios (public)</a></li><li class="chapter-item expanded "><a href="datasets/batch_view/telemetry_aggregates/reference.html"><strong aria-hidden="true">4.2.15.</strong> Telemetry Aggregates</a></li><li class="chapter-item expanded "><a href="datasets/glam.html"><strong aria-hidden="true">4.2.16.</strong> GLAM Aggregates</a></li></ol></li><li class="chapter-item expanded "><a href="datasets/experiments.html"><strong aria-hidden="true">4.3.</strong> Experiment Datasets</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="datasets/jetstream.html"><strong aria-hidden="true">4.3.1.</strong> Jetstream</a></li><li class="chapter-item expanded "><a href="datasets/experiment_telemetry.html"><strong aria-hidden="true">4.3.2.</strong> Accessing experiment data</a></li><li class="chapter-item expanded "><a href="datasets/heartbeat.html"><strong aria-hidden="true">4.3.3.</strong> Accessing Heartbeat data</a></li><li class="chapter-item expanded "><a href="datasets/dynamic_telemetry.html"><strong aria-hidden="true">4.3.4.</strong> Dynamic telemetry</a></li><li class="chapter-item expanded "><a href="datasets/experiment_monitoring.html"><strong aria-hidden="true">4.3.5.</strong> Experiment monitoring</a></li></ol></li><li class="chapter-item expanded "><a href="datasets/search.html"><strong aria-hidden="true">4.4.</strong> Search Datasets</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="datasets/search/search_aggregates/reference.html"><strong aria-hidden="true">4.4.1.</strong> Search Aggregates</a></li><li class="chapter-item expanded "><a href="datasets/search/search_clients_engines_sources_daily/reference.html"><strong aria-hidden="true">4.4.2.</strong> Search Clients Engines Sources Daily</a></li><li class="chapter-item expanded "><a href="datasets/search/search_clients_last_seen/reference.html"><strong aria-hidden="true">4.4.3.</strong> Search Clients Last Seen</a></li><li class="chapter-item expanded "><a href="datasets/search/client_ltv/reference.html"><strong aria-hidden="true">4.4.4.</strong> Client LTV</a></li><li class="chapter-item expanded "><a href="datasets/search/mobile_search_clients_sources_daily/intro.html"><strong aria-hidden="true">4.4.5.</strong> Mobile Search Clients Sources Daily</a></li></ol></li><li class="chapter-item expanded "><a href="datasets/non_desktop.html"><strong aria-hidden="true">4.5.</strong> Non-Desktop Datasets</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="datasets/non_desktop/day_2_7_activation/reference.html"><strong aria-hidden="true">4.5.1.</strong> Day 2-7 Activation</a></li><li class="chapter-item expanded "><a href="datasets/non_desktop/google_play_store/reference.html"><strong aria-hidden="true">4.5.2.</strong> Google Play Store</a></li><li class="chapter-item expanded "><a href="datasets/non_desktop/apple_app_store/reference.html"><strong aria-hidden="true">4.5.3.</strong> Apple App Store</a></li></ol></li><li class="chapter-item expanded "><a href="datasets/other.html"><strong aria-hidden="true">4.6.</strong> Other Datasets</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="datasets/other/hgpush/reference.html"><strong aria-hidden="true">4.6.1.</strong> hgpush</a></li><li class="chapter-item expanded "><a href="datasets/other/stub_installer/reference.html"><strong aria-hidden="true">4.6.2.</strong> Stub installer ping</a></li><li class="chapter-item expanded "><a 
href="datasets/other/bmobugs/reference.html"><strong aria-hidden="true">4.6.3.</strong> bmobugs</a></li><li class="chapter-item expanded "><a href="datasets/buildhub.html"><strong aria-hidden="true">4.6.4.</strong> Build metadata</a></li><li class="chapter-item expanded "><a href="datasets/releases.html"><strong aria-hidden="true">4.6.5.</strong> Release information</a></li><li class="chapter-item expanded "><a href="datasets/other/suggest/suggest.html"><strong aria-hidden="true">4.6.6.</strong> Suggest</a></li><li class="chapter-item expanded "><a href="datasets/other/sponsored_tiles/sponsored_tiles.html"><strong aria-hidden="true">4.6.7.</strong> Sponsored Tiles</a></li><li class="chapter-item expanded "><a href="datasets/other/newtab_interactions/reference.html"><strong aria-hidden="true">4.6.8.</strong> Newtab_Interactions</a></li><li class="chapter-item expanded "><a href="datasets/other/urlbar_events/reference.html"><strong aria-hidden="true">4.6.9.</strong> Urlbar Events</a></li><li class="chapter-item expanded "><a href="datasets/other/urlbar_events_daily/reference.html"><strong aria-hidden="true">4.6.10.</strong> Urlbar Events Daily</a></li><li class="chapter-item expanded "><a href="datasets/other/serp_events/reference.html"><strong aria-hidden="true">4.6.11.</strong> SERP Events</a></li></ol></li><li class="chapter-item expanded "><a href="datasets/fxa.html"><strong aria-hidden="true">4.7.</strong> Mozilla Accounts Datasets</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="datasets/fxa_metrics/attribution.html"><strong aria-hidden="true">4.7.1.</strong> Mozilla Account Attribution</a></li><li class="chapter-item expanded "><a href="datasets/fxa_metrics/funnels.html"><strong aria-hidden="true">4.7.2.</strong> Mozilla Account Funnel Metrics</a></li><li class="chapter-item expanded "><a href="datasets/fxa_metrics/emails.html"><strong aria-hidden="true">4.7.3.</strong> Mozilla Account Email Metrics</a></li></ol></li><li class="chapter-item expanded "><a href="datasets/static.html"><strong aria-hidden="true">4.8.</strong> Static Datasets</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="datasets/static/normalized_os.html"><strong aria-hidden="true">4.8.1.</strong> Normalized OS Names And Versions</a></li></ol></li></ol></li><li class="chapter-item expanded "><a href="historical/index.html"><strong aria-hidden="true">5.</strong> Historical Reference</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="concepts/pipeline/data_pipeline.html"><strong aria-hidden="true">5.1.</strong> Previous AWS Pipeline Overview</a></li><li class="chapter-item expanded "><a href="concepts/pipeline/data_pipeline_detail.html"><strong aria-hidden="true">5.2.</strong> In-depth AWS Data Pipeline Detail</a></li><li class="chapter-item expanded "><a href="metrics/index.html"><strong aria-hidden="true">5.3.</strong> Metrics</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="metrics/definitions.html"><strong aria-hidden="true">5.3.1.</strong> Definitions</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="metrics/metrics.html"><strong aria-hidden="true">5.3.1.1.</strong> Metrics</a></li><li class="chapter-item expanded "><a href="metrics/usage.html"><strong aria-hidden="true">5.3.1.2.</strong> Usage Criteria</a></li><li class="chapter-item expanded "><a href="metrics/dimensions.html"><strong aria-hidden="true">5.3.1.3.</strong> Slicing Dimensions</a></li></ol></li><li class="chapter-item 
expanded "><a href="metrics/policy.html"><strong aria-hidden="true">5.3.2.</strong> Metrics Standardization and Policy</a></li></ol></li><li class="chapter-item expanded "><a href="concepts/censuses.html"><strong aria-hidden="true">5.4.</strong> Legacy Census Metrics</a></li><li class="chapter-item expanded "><a href="datasets/obsolete.html"><strong aria-hidden="true">5.5.</strong> Obsolete Datasets</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="datasets/obsolete/activity-stream/reference.html"><strong aria-hidden="true">5.5.1.</strong> Activity Stream</a></li><li class="chapter-item expanded "><a href="datasets/obsolete/attitudes_daily/reference.html"><strong aria-hidden="true">5.5.2.</strong> Attitudes Daily</a></li><li class="chapter-item expanded "><a href="datasets/obsolete/churn/reference.html"><strong aria-hidden="true">5.5.3.</strong> Churn</a></li><li class="chapter-item expanded "><a href="datasets/obsolete/client_count_daily/reference.html"><strong aria-hidden="true">5.5.4.</strong> Client Count Daily</a></li><li class="chapter-item expanded "><a href="datasets/obsolete/client_count/reference.html"><strong aria-hidden="true">5.5.5.</strong> Client Count</a></li><li class="chapter-item expanded "><a href="datasets/obsolete/crash_aggregates/reference.html"><strong aria-hidden="true">5.5.6.</strong> Crash Aggregates</a></li><li class="chapter-item expanded "><a href="datasets/obsolete/crash_summary/reference.html"><strong aria-hidden="true">5.5.7.</strong> Crash Summary</a></li><li class="chapter-item expanded "><a href="datasets/obsolete/error_aggregates/reference.html"><strong aria-hidden="true">5.5.8.</strong> Error Aggregates</a></li><li class="chapter-item expanded "><a href="datasets/obsolete/first_shutdown_summary/reference.html"><strong aria-hidden="true">5.5.9.</strong> First Shutdown Summary</a></li><li class="chapter-item expanded "><a href="datasets/obsolete/heavy_users/reference.html"><strong aria-hidden="true">5.5.10.</strong> Heavy Users</a></li><li class="chapter-item expanded "><a href="datasets/obsolete/legacy_mobile/reference.html"><strong aria-hidden="true">5.5.11.</strong> Legacy Mobile Datasets</a></li><li class="chapter-item expanded "><a href="datasets/obsolete/longitudinal/reference.html"><strong aria-hidden="true">5.5.12.</strong> Longitudinal</a></li><li class="chapter-item expanded "><a href="datasets/obsolete/retention/reference.html"><strong aria-hidden="true">5.5.13.</strong> Retention</a></li><li class="chapter-item expanded "><a href="datasets/obsolete/sync_summary/reference.html"><strong aria-hidden="true">5.5.14.</strong> Sync Summary</a></li></ol></li></ol></li><li class="chapter-item expanded "><a href="contributing/index.html"><strong aria-hidden="true">6.</strong> Contributing</a></li><li><ol class="section"><li class="chapter-item expanded "><a href="contributing/style_guide.html"><strong aria-hidden="true">6.1.</strong> Style Guide</a></li><li class="chapter-item expanded "><a href="contributing/structure.html"><strong aria-hidden="true">6.2.</strong> Structure</a></li></ol></li></ol>
</div>
<div id="sidebar-resize-handle" class="sidebar-resize-handle">
<div class="sidebar-resize-indicator"></div>
</div>
</nav>
<!-- Track and set sidebar scroll position -->
<script>
var sidebarScrollbox = document.querySelector('#sidebar .sidebar-scrollbox');
sidebarScrollbox.addEventListener('click', function(e) {
if (e.target.tagName === 'A') {
sessionStorage.setItem('sidebar-scroll', sidebarScrollbox.scrollTop);
}
}, { passive: true });
var sidebarScrollTop = sessionStorage.getItem('sidebar-scroll');
sessionStorage.removeItem('sidebar-scroll');
if (sidebarScrollTop) {
// preserve sidebar scroll position when navigating via links within sidebar
sidebarScrollbox.scrollTop = sidebarScrollTop;
} else {
// scroll sidebar to current active section when navigating via "next/previous chapter" buttons
var activeSection = document.querySelector('#sidebar .active');
if (activeSection) {
activeSection.scrollIntoView({ block: 'center' });
}
}
</script>
<div id="page-wrapper" class="page-wrapper">
<div class="page">
<div id="menu-bar-hover-placeholder"></div>
<div id="menu-bar" class="menu-bar sticky">
<div class="left-buttons">
<label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
<i class="fa fa-bars"></i>
</label>
<button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
<i class="fa fa-paint-brush"></i>
</button>
<ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
<li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
<li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
<li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
<li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
<li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
</ul>
<button id="search-toggle" class="icon-button" type="button" title="Search. (Shortkey: s)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="S" aria-controls="searchbar">
<i class="fa fa-search"></i>
</button>
</div>
<h1 class="menu-title">Mozilla Data Documentation</h1>
<div class="right-buttons">
<a href="print.html" title="Print this book" aria-label="Print this book">
<i id="print-button" class="fa fa-print"></i>
</a>
<a href="https://github.com/mozilla/data-docs" title="Git repository" aria-label="Git repository">
<i id="git-repository-button" class="fa fa-github"></i>
</a>
</div>
</div>
<div id="search-wrapper" class="hidden">
<form id="searchbar-outer" class="searchbar-outer">
<input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="searchresults-outer" aria-describedby="searchresults-header">
</form>
<div id="searchresults-outer" class="searchresults-outer hidden">
<div id="searchresults-header" class="searchresults-header"></div>
<ul id="searchresults">
</ul>
</div>
</div>
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
<script>
document.getElementById('sidebar-toggle').setAttribute('aria-expanded', sidebar === 'visible');
document.getElementById('sidebar').setAttribute('aria-hidden', sidebar !== 'visible');
Array.from(document.querySelectorAll('#sidebar a')).forEach(function(link) {
link.setAttribute('tabIndex', sidebar === 'visible' ? 0 : -1);
});
</script>
<div id="content" class="content">
<main>
<h1 id="mozilla-data-documentation"><a class="header" href="#mozilla-data-documentation">Mozilla Data Documentation</a></h1>
<p>This documentation was written to help Mozillians analyze and interpret data collected by our products, such as
<a href="https://www.mozilla.org/firefox">Firefox</a> and <a href="https://www.mozilla.org/products/vpn/">Mozilla VPN</a>. Mozilla refers
to the systems that collect and process this data as <a href="./concepts/terminology.html#telemetry">Telemetry</a>.</p>
<p>At <a href="https://www.mozilla.org">Mozilla</a>, our data-gathering and data-handling practices are anchored in our
<a href="https://www.mozilla.org/en-US/privacy/principles/">Data Privacy Principles</a> and elaborated in the
<a href="https://www.mozilla.org/en-US/privacy/">Mozilla Privacy Policy</a>. You can learn more about what data Firefox
collects and the choices you can make as a Firefox user in the
<a href="https://www.mozilla.org/en-US/privacy/firefox/">Firefox Privacy Notice</a>.</p>
<p>If there's information missing from these docs, or if you'd like to contribute, see <a href="contributing/index.html">this article on contributing</a>,
and feel free to <a href="https://bugzilla.mozilla.org/enter_bug.cgi?assigned_to=nobody%40mozilla.org&amp;bug_file_loc=http%3A%2F%2F&amp;bug_ignored=0&amp;bug_severity=normal&amp;bug_status=NEW&amp;cf_fx_iteration=---&amp;cf_fx_points=---&amp;component=Documentation%20and%20Knowledge%20Repo%20%28RTMO%29&amp;contenttypemethod=autodetect&amp;contenttypeselection=text%2Fplain&amp;defined_groups=1&amp;flag_type-4=X&amp;flag_type-607=X&amp;flag_type-800=X&amp;flag_type-803=X&amp;flag_type-916=X&amp;form_name=enter_bug&amp;maketemplate=Remember%20values%20as%20bookmarkable%20template&amp;op_sys=Linux&amp;priority=--&amp;product=Data%20Platform%20and%20Tools&amp;rep_platform=x86_64&amp;target_milestone=---&amp;version=unspecified">file a bug here</a>.</p>
<p>You can locate the source for this documentation in the <a href="https://github.com/mozilla/data-docs">data-docs repository</a> on GitHub.</p>
<h2 id="using-this-document"><a class="header" href="#using-this-document">Using this document</a></h2>
<p>This documentation is divided into the following sections:</p>
<h3 id="introduction"><a class="header" href="#introduction"><a href="introduction/index.html">Introduction</a></a></h3>
<p>This section provides a <strong>quick introduction</strong> to Mozilla's Telemetry data: it should help you understand how this data is collected and how to begin analyzing it.</p>
<h3 id="cookbooks--tutorials"><a class="header" href="#cookbooks--tutorials"><a href="cookbooks/index.html">Cookbooks &amp; Tutorials</a></a></h3>
<p>This section contains tutorials presented in a simple problem/solution format, organized by topic.</p>
<h3 id="data-platform-reference"><a class="header" href="#data-platform-reference"><a href="reference/index.html">Data Platform Reference</a></a></h3>
<p>This section contains detailed reference material on the Mozilla data platform, including links to other resources where appropriate.</p>
<h3 id="dataset-reference"><a class="header" href="#dataset-reference"><a href="datasets/reference.html">Dataset Reference</a></a></h3>
<p>In-depth references for some of the major datasets we maintain for our
products.</p>
<p>For each dataset, we include a description of the dataset's purpose,
what data is included, how the data is collected,
and how you can change or augment the dataset.
You do not need to read this section end-to-end.</p>
<h3 id="historical-reference"><a class="header" href="#historical-reference"><a href="historical/index.html">Historical Reference</a></a></h3>
<p>This section documents things that used to be part of the Mozilla Data Platform but no longer are.
You can generally ignore this section; it is intended only to answer questions like &quot;what happened to X?&quot;.</p>
<p>You can find the <a href="https://docs.telemetry.mozilla.org">fully-rendered documentation here</a>;
it is built with <a href="https://github.com/rust-lang/mdBook">mdBook</a> and hosted on GitHub Pages.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/index.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="introduction-1"><a class="header" href="#introduction-1">Introduction</a></h1>
<p>This section is an introductory guide to analyzing Telemetry data:
it should give you enough knowledge and understanding to begin exploring our systems.
After reading through this section, you can look through the <a href="introduction/../cookbooks/index.html">tutorials</a>,
which contain more specific guides on performing particular tasks.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/introduction/index.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="what-data-does-mozilla-collect"><a class="header" href="#what-data-does-mozilla-collect">What Data does Mozilla Collect?</a></h1>
<p>Mozilla, like many other organizations, relies on data to make product decisions.
However, <em>unlike</em> many other organizations, Mozilla balances its goal of collecting useful, high-quality data with giving its users meaningful choice and control over their own data.
Our approach to data is most succinctly described by the <a href="https://www.mozilla.org/privacy/principles/">Mozilla Privacy Principles</a>.
If you want to know what Mozilla thinks about data, the Principles will tell you that.</p>
<p>From those principles come <a href="https://www.mozilla.org/privacy/">Mozilla's Privacy Notices</a>.
They differ from product to product because the data each product deals with is different.
If you want to know what kinds of data each Mozilla product collects and what we do with it, the Privacy Notices will tell you that.</p>
<p>From the Principles and the Notices Mozilla derives operational processes to allow it to make decisions about what data it can collect, store, and publish.
Here are a few of them:</p>
<ul>
<li><a href="https://wiki.mozilla.org/Data_Collection">Data Collection</a>: Mozilla's policies around data collection</li>
<li><a href="https://wiki.mozilla.org/Data_Publishing">Data Publishing</a>: How Mozilla publishes (a subset of) of the data it collects for the public benefit</li>
</ul>
<p>If you want to know how we ensure that the data Mozilla collects, stores, and publishes abides by the Privacy Notices and the Principles, these processes will tell you that.</p>
<p>The data Mozilla collects falls roughly into three categories: product telemetry, usage logs, and website telemetry.</p>
<h2 id="product-telemetry"><a class="header" href="#product-telemetry">Product Telemetry</a></h2>
<p>Most of our products, including Firefox, are instrumented to send small JSON packets called &quot;pings&quot; when telemetry is enabled.
Pings include some combination of environment data (e.g., information about operating system and hardware), measurements (e.g., for Firefox, information about the maximum number of open tabs and time spent running in JavaScript garbage collections), and events (indications that something has happened).</p>
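<p>To make the shape of a ping more concrete, the following is a heavily simplified sketch written as a Python dictionary.
The field names and nesting here are illustrative only; the authoritative formats are described in the Firefox Source Docs linked below.</p>
<pre><code class="language-python"># Illustrative sketch only: real ping layouts vary by product and ping type.
example_ping = {
    "environment": {  # environment data
        "os": {"name": "Windows_NT", "version": "10.0"},
        "system": {"memory_mb": 16384},
    },
    "payload": {
        "histograms": {  # measurements
            "max_concurrent_tab_count": {"sum": 12},
        },
        "events": [  # events: indications that something happened
            ["navigation", "search", "urlbar"],
        ],
    },
}
</code></pre>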
<p>Inside Firefox, most Telemetry is collected via a module called &quot;Telemetry&quot;.
The details of our ping formats are extensively documented in the Firefox Source Docs under <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/index.html">Toolkit/Telemetry</a>.</p>
<p>In newer products like Firefox for Android, instrumentation is handled by the <a href="introduction/../concepts/glean/glean.html">Glean SDK</a>, whose design was informed by what Mozilla learned from implementing the Telemetry module and has many benefits.
At some point in the near future, Mozilla plans to replace the Telemetry module with the Glean SDK.
For more information, see <a href="https://firefox-source-docs.mozilla.org/toolkit/components/glean/index.html">Firefox on Glean (FOG)</a>.</p>
<p>When ping submissions from our clients reach our endpoints, they are aggregated and stored in ping-level datasets.
On a daily basis, the information in these ping datasets is summarized and transformed into derived datasets, which are easier to reason about and faster to query.
You can learn more about this in <a href="introduction/../tools/guiding_principles.html">Guiding Principles for Data Infrastructure</a>.</p>
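<p>As a minimal sketch of what querying a derived dataset can look like in practice, here is an example using the BigQuery Python client.
The project, table, and column names below follow conventions used elsewhere in this documentation and should be treated as illustrative.</p>
<pre><code class="language-python"># Count active clients for one day from a derived dataset.
# Assumes you have BigQuery access set up as described in the
# "Accessing and working with BigQuery" chapter.
from google.cloud import bigquery

client = bigquery.Client(project="moz-fx-data-shared-prod")

sql = """
SELECT
  submission_date,
  COUNT(DISTINCT client_id) AS clients
FROM
  `moz-fx-data-shared-prod.telemetry.clients_daily`
WHERE
  submission_date = '2021-03-01'
GROUP BY
  submission_date
"""

for row in client.query(sql).result():
    print(row.submission_date, row.clients)
</code></pre>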
<p>Both the ping and derived datasets are viewable using tools like <a href="introduction/./tools.html#glean-aggregated-metrics-dashboard-glam">GLAM</a> and <a href="introduction/./tools.html#looker">Looker</a>.
For more information, see <a href="introduction/./tools.html">Tools for Data Analysis</a>.</p>
<h2 id="usage-logs"><a class="header" href="#usage-logs">Usage Logs</a></h2>
<p>Some of our products, like <a href="https://www.mozilla.org/firefox/sync/">Firefox Sync</a>, produce logs on the server when they are used.
For analysis purposes, we take this log data, strip it of user identifiers, and summarize it into derived datasets that can be queried with either <a href="introduction/../cookbooks/bigquery.html">BigQuery</a> or <a href="introduction/./tools.html#looker">Looker</a>.
As with product telemetry, this data can be helpful for understanding how our products are used.
For example, it can tell us how many people from a particular locale are engaging with a particular service.</p>
<h2 id="website-telemetry"><a class="header" href="#website-telemetry">Website Telemetry</a></h2>
<p>Mozilla uses tools like Google Analytics to measure interactions on our websites, such as <a href="https://mozilla.org">mozilla.org</a>.
To facilitate comparative analysis with product and usage telemetry, we export much of this data into our Data Warehouse, so that it can be viewed with <a href="introduction/./tools.html#looker">Looker</a> and other tools.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/introduction/what_data.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="tools-for-data-analysis"><a class="header" href="#tools-for-data-analysis">Tools for Data Analysis</a></h1>
<p>This is a starting point for making sense of the tools used for analyzing Mozilla data. There are different tools available, each with their own strengths, tailored to a variety of use cases and skill sets.</p>
<ul>
<li><a href="introduction/tools.html#high-level-tools">High-level tools</a>
<ul>
<li><a href="introduction/tools.html#looker">Looker</a></li>
<li><a href="introduction/tools.html#glean-aggregated-metrics-dashboard-glam">Glean Aggregated Metrics Dashboard (GLAM)</a></li>
</ul>
</li>
<li><a href="introduction/tools.html#lower-level-tools">Lower-level tools</a>
<ul>
<li><a href="introduction/tools.html#sqltelemetrymozillaorg-stmo">sql.telemetry.mozilla.org (STMO)</a></li>
</ul>
</li>
<li><a href="introduction/tools.html#deprecated-tools">Deprecated tools</a>
<ul>
<li><a href="introduction/tools.html#telemetry-measurement-dashboard">Telemetry Measurement Dashboard</a></li>
</ul>
</li>
</ul>
<h2 id="high-level-tools"><a class="header" href="#high-level-tools">High-level tools</a></h2>
<p>These web-based tools do not require specialized technical knowledge (e.g. how to write an SQL query, deep knowledge of BigQuery). This is where you should start.</p>
<h3 id="looker"><a class="header" href="#looker">Looker</a></h3>
<p>In 2020, Mozilla chose <a href="https://looker.com/">Looker</a> as its primary tool for analyzing data.
It allows data exploration and visualization by experts and non-experts alike.</p>
<p>For a brief introduction to Looker, see <a href="introduction/../cookbooks/looker/intro.html">Introduction to Looker</a>.</p>
<h3 id="glean-aggregated-metrics-dashboard-glam"><a class="header" href="#glean-aggregated-metrics-dashboard-glam">Glean Aggregated Metrics Dashboard (GLAM)</a></h3>
<p>The <a href="https://glam.telemetry.mozilla.org/">Glean Aggregated Metrics Dashboard</a> (GLAM) is an interactive dashboard that is Mozillas primary self-service tool for examining the distributions of values of specific individual telemetry metrics, over time and across different user populations. It is similar to GUD in that it is meant to be usable by everyone; no specific data analysis or coding skills are needed. But while GUD is focused on a relatively small number of high level, derived product metrics about user engagement (e.g. MAU, DAU, retention, etc) GLAM is focused on a diverse and plentiful set of probes and data points that engineers capture in code and transmit back from Firefox and other Mozilla products.</p>
<p>For more information on how to use GLAM, see <a href="introduction/../cookbooks/glam.html">Introduction to GLAM</a>.</p>
<h2 id="lower-level-tools"><a class="header" href="#lower-level-tools">Lower-level tools</a></h2>
<p>These tools require more specialized knowledge to use.</p>
<h3 id="sqltelemetrymozillaorg-stmo"><a class="header" href="#sqltelemetrymozillaorg-stmo">sql.telemetry.mozilla.org (STMO)</a></h3>
<p>The <a href="https://sql.telemetry.mozilla.org"><code>sql.telemetry.mozilla.org</code></a> (STMO) site
is an instance of the very fine <a href="https://redash.io/">Redash</a> software, allowing
for SQL-based exploratory analysis and visualization / dashboard
construction.
Using it requires (surprise!) familiarity with SQL, and your data must be explicitly exposed as an STMO data source.
You can learn more about how to use it in <a href="introduction/../tools/stmo.html">Introduction to STMO</a>.</p>
<p>Note that while STMO is not yet considered deprecated, Looker is the preferred solution for producing data visualizations and dashboards at Mozilla (where possible).</p>
<h2 id="deprecated-tools"><a class="header" href="#deprecated-tools">Deprecated tools</a></h2>
<p>These tools are still available, but are generally not recommended.</p>
<h3 id="telemetry-measurement-dashboard"><a class="header" href="#telemetry-measurement-dashboard">Telemetry Measurement Dashboard</a></h3>
<p>The <a href="https://telemetry.mozilla.org/new-pipeline/dist.html">Telemetry Measurement Dashboard</a> (TMO) site is the 'venerable standby' of Firefox telemetry analysis tools.
It is the predecessor to GLAM (see above) and is still lightly maintained until we are sure that GLAM covers all of its use cases.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/introduction/tools.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="terminology"><a class="header" href="#terminology">Terminology</a></h1>
<h2 id="table-of-contents"><a class="header" href="#table-of-contents">Table of Contents</a></h2>
<p>This glossary provides definitions for some common terms used in the Mozilla data universe.
If you're new to Mozilla, you may also find the <a href="https://wiki.mozilla.org/Glossary">general glossary</a> on <code>wiki.mozilla.org</code> helpful.</p>
<ul>
<li><a href="concepts/terminology.html#aet">AET</a></li>
<li><a href="concepts/terminology.html#analyst">Analyst</a></li>
<li><a href="concepts/terminology.html#amplitude">Amplitude</a></li>
<li><a href="concepts/terminology.html#bigquery">BigQuery</a></li>
<li><a href="concepts/terminology.html#build-id">Build ID</a></li>
<li><a href="concepts/terminology.html#client-id">Client ID</a></li>
<li><a href="concepts/terminology.html#data-analyst">Data Analyst</a></li>
<li><a href="concepts/terminology.html#data-engineer">Data Engineer</a></li>
<li><a href="concepts/terminology.html#data-practitioner">Data Practitioner</a></li>
<li><a href="concepts/terminology.html#data-scientist">Data Scientist</a></li>
<li><a href="concepts/terminology.html#dataset">Dataset</a></li>
<li><a href="concepts/terminology.html#dau">DAU</a></li>
<li><a href="concepts/terminology.html#derived-dataset">Derived Dataset</a></li>
<li><a href="concepts/terminology.html#glean">Glean</a></li>
<li><a href="concepts/terminology.html#gcp">GCP</a></li>
<li><a href="concepts/terminology.html#geoip">GeoIP</a></li>
<li><a href="concepts/terminology.html#ingestion">Ingestion</a></li>
<li><a href="concepts/terminology.html#intensity">Intensity</a></li>
<li><a href="concepts/terminology.html#kpi">KPI</a></li>
<li><a href="concepts/terminology.html#metric">Metric</a></li>
<li><a href="concepts/terminology.html#mau">MAU</a></li>
<li><a href="concepts/terminology.html#ping">Ping</a></li>
<li><a href="concepts/terminology.html#ping-table">Ping Table</a></li>
<li><a href="concepts/terminology.html#pipeline">Pipeline</a></li>
<li><a href="concepts/terminology.html#probe">Probe</a></li>
<li><a href="concepts/terminology.html#profile">Profile</a></li>
<li><a href="concepts/terminology.html#query">Query</a></li>
<li><a href="concepts/terminology.html#retention">Retention</a></li>
<li><a href="concepts/terminology.html#schema">Schema</a></li>
<li><a href="concepts/terminology.html#session">Session</a></li>
<li><a href="concepts/terminology.html#subsession">Subsession</a></li>
<li><a href="concepts/terminology.html#stmo-sqltelemetrymozillaorg">STMO (sql.telemetry.mozilla.org)</a></li>
<li><a href="concepts/terminology.html#telemetry">Telemetry</a></li>
<li><a href="concepts/terminology.html#uri">URI</a></li>
<li><a href="concepts/terminology.html#url">URL</a></li>
<li><a href="concepts/terminology.html#wau">WAU</a></li>
</ul>
<h2 id="aet"><a class="header" href="#aet">AET</a></h2>
<p>Account Ecosystem Telemetry (never fully launched); see the <a href="https://docs.google.com/document/d/1yRLiD8JuaZIIaKhs6DhXEa7aH8jwOau5yW0kHaldFQU/edit#">PRD</a></p>
<h2 id="analyst"><a class="header" href="#analyst">Analyst</a></h2>
<p>See <a href="concepts/terminology.html#data-analyst">Data Analyst</a>.</p>
<h2 id="amplitude"><a class="header" href="#amplitude">Amplitude</a></h2>
<p>A third-party product formerly used by several teams within Mozilla for analysis of user events.</p>
<h2 id="bigquery"><a class="header" href="#bigquery">BigQuery</a></h2>
<p><a href="https://cloud.google.com/bigquery">BigQuery</a> is Google Cloud's managed data warehouse. Most of the data described on this site is stored and queried using BigQuery. See <a href="concepts/../cookbooks/bigquery.html">Accessing and working with BigQuery</a> for more details.</p>
<h2 id="build-id"><a class="header" href="#build-id">Build ID</a></h2>
<p>A unique identifier for a build like <code>20210317095331</code>.
Often used to identify and aggregate telemetry submitted by specific versions of our software.
Note that <a href="concepts/./analysis_gotchas.html#build-ids">the format may differ across product lines</a>.</p>
<h2 id="client-id"><a class="header" href="#client-id">Client ID</a></h2>
<p>A unique ID identifying the client that sent a <a href="concepts/terminology.html#ping">ping</a>.</p>
<h2 id="data-analyst"><a class="header" href="#data-analyst">Data Analyst</a></h2>
<p>This is a common job title for someone who spends a large amount of their time analyzing data. At Mozilla, we tend not to use this term or title, favoring <a href="concepts/terminology.html#data-practitioner">Data Practitioner</a> or <a href="concepts/terminology.html#data-scientist">Data Scientist</a> instead.</p>
<h2 id="data-engineer"><a class="header" href="#data-engineer">Data Engineer</a></h2>
<p>A &quot;Data Engineer&quot; at Mozilla generally refers to someone on the Data Engineering team. They implement and maintain the data platform and tools described in this document. They may also assist data scientists or other data practitioners, as needed.</p>
<h2 id="data-practitioner"><a class="header" href="#data-practitioner">Data Practitioner</a></h2>
<p>A data practitioner is someone who looks at data, identifies trends and other qualitative measurements in them, and creates charts and dashboards. It could be anyone: an engineer, product manager, data engineer, or data scientist.</p>
<h2 id="data-scientist"><a class="header" href="#data-scientist">Data Scientist</a></h2>
<p>A &quot;Data Scientist&quot; at Mozilla generally refers to someone on the Data Science team. They have a broad array of technical backgrounds and a core set of common professional skills:</p>
<ul>
<li>applying statistical methods to noisy data to answer questions about what, how, or why something is happening</li>
<li>transforming unstructured data into usable metrics and models</li>
<li>augmenting strategic product and decision-making with empirical evidence created and curated by the team</li>
</ul>
<h2 id="dataset"><a class="header" href="#dataset">Dataset</a></h2>
<p>A set of data, which includes ping data, derived datasets, etc.; sometimes it is used synonymously with “table”; sometimes it is used technically to refer to a <a href="concepts/terminology.html#bigquery">BigQuery</a> dataset, which represents a container for one or more tables.</p>
<h2 id="dau"><a class="header" href="#dau">DAU</a></h2>
<p>Daily Active Users - the number of unique <a href="concepts/terminology.html#client-id">client IDs</a> that are active each day.</p>
<p>For more details, see the <a href="https://mozilla-hub.atlassian.net/wiki/spaces/DATA/pages/314704478/Daily+Active+Users+DAU+Metric">DAU Metric</a> page on Confluence.</p>
<h2 id="derived-dataset"><a class="header" href="#derived-dataset">Derived Dataset</a></h2>
<p>A processed dataset, such as <a href="concepts/../datasets/batch_view/clients_daily/reference.html">Clients Daily</a>. At Mozilla, this is in contrast to a raw ping table which represents (more or less) the raw data submitted by our users.</p>
<h2 id="glean"><a class="header" href="#glean">Glean</a></h2>
<p>Glean is Mozilla's product analytics &amp; telemetry solution that provides a consistent experience and behavior across all of our products. Most of Mozilla's mobile apps, including Fenix, have been adapted to use the Glean SDK. For more information, see the <a href="concepts/./glean/glean.html">Glean Overview</a>.</p>
<h2 id="gcp"><a class="header" href="#gcp">GCP</a></h2>
<p>Google Cloud Platform (GCP) is a suite of cloud-computing services that runs on the same infrastructure that Google uses internally for its end-user products.</p>
<h2 id="geoip"><a class="header" href="#geoip">GeoIP</a></h2>
<p>IP geolocation involves attempting to discover the real-world location of an IP address. IP addresses are assigned to organizations, and because these associations are ever-changing, it can be difficult to determine exactly where in the world an IP address is located. Mozilla's ingestion infrastructure attempts to perform GeoIP lookup during the data decoding process and subsequently discards the IP address before the message arrives in long-term storage.</p>
<h2 id="ingestion"><a class="header" href="#ingestion">Ingestion</a></h2>
<p>Mozilla's core data platform has been built to support structured ingestion of arbitrary JSON payloads, whether they come from browser products on client devices or from server-side applications that have nothing to do with Firefox; any team at Mozilla can hook into structured ingestion by defining a schema and registering it with the pipeline. Once a schema is registered, everything else is automatically provisioned, from an HTTPS endpoint for accepting payloads to a set of tables in BigQuery for holding the processed data.</p>
<h2 id="intensity"><a class="header" href="#intensity">Intensity</a></h2>
<p>Intuitively: how many days per week do users use the product? Among profiles active at least once in the week ending on the specified date, intensity is the average number of days they were active during that one-week window.</p>
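<p>For illustration, intensity for a given date could be computed roughly as follows. This is a minimal sketch, assuming a <a href="concepts/../datasets/batch_view/clients_daily/reference.html">Clients Daily</a>-style table with <code>client_id</code> and <code>submission_date</code> columns; it is not the canonical definition used in our pipelines.</p>
<pre><code class="language-sql">-- Sketch: average number of active days per profile over the one-week
-- window ending on 2021-08-01, among profiles active at least once.
WITH week AS (
  SELECT
    client_id,
    COUNT(DISTINCT submission_date) AS days_active
  FROM telemetry.clients_daily
  WHERE submission_date BETWEEN DATE_SUB(DATE '2021-08-01', INTERVAL 6 DAY) AND DATE '2021-08-01'
  GROUP BY client_id
)
SELECT AVG(days_active) AS intensity
FROM week;
</code></pre>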
<h2 id="kpi"><a class="header" href="#kpi">KPI</a></h2>
<p>Key Performance Indicator - a <a href="concepts/terminology.html#metric">metric</a> that is used to measure performance across an organization, product, or project.</p>
<h2 id="metric"><a class="header" href="#metric">Metric</a></h2>
<p>In general: a metric is anything that you want to (and can) measure. This differs from a dimension, which is a qualitative attribute of data.</p>
<p>In the context of <a href="concepts/terminology.html#glean">Glean</a>, a metric refers to an instrumented measure for a specific aspect of the product (similar to a <a href="concepts/terminology.html#probe">probe</a> in Firefox Telemetry).</p>
<h2 id="mau"><a class="header" href="#mau">MAU</a></h2>
<p>Monthly Active Users - the number of unique profiles active at least once during the 28-day window
ending on the specified day.</p>
<h2 id="ping"><a class="header" href="#ping">Ping</a></h2>
<p>A ping represents a message that is sent from the Firefox browser to Mozilla's Telemetry servers. It typically includes information about the browser's state, user actions, etc.
For more information, see <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/common-ping.html">Common ping format</a>.</p>
<h2 id="ping-table"><a class="header" href="#ping-table">Ping Table</a></h2>
<p>A set of pings that is stored in a BigQuery table. See article on <a href="concepts/../datasets/pings.html">raw ping datasets</a>.</p>
<h2 id="pipeline"><a class="header" href="#pipeline">Pipeline</a></h2>
<p>Mozilla's data pipeline, which is used to collect Telemetry data from Mozilla's products and logs from various services.
The bulk of the data handled by this pipeline is Firefox Telemetry data, but the same tool-chain is used to collect, store, and analyze data from many other sources.</p>
<p>For more information, see <a href="concepts/./pipeline/gcp_data_pipeline.html">An overview of Mozilla's Data Pipeline</a>.</p>
<h2 id="probe"><a class="header" href="#probe">Probe</a></h2>
<p>Measurements for a specific aspect of Firefox are called probes. A single telemetry ping sends many different probes. Probes are either Histograms (recording distributions of data points) or Scalars (recording a single value).</p>
<p>You can search for details about probes by using the <a href="https://probes.telemetry.mozilla.org/">Probe Dictionary</a>. For each probe, the probe dictionary provides:</p>
<ul>
<li>A description of the probe</li>
<li>When a probe started being collected</li>
<li>Whether data from this probe is collected in the release channel</li>
</ul>
<p>Newer measurements implemented using <a href="concepts/terminology.html#glean">Glean</a> are referred to as <a href="concepts/terminology.html#metric">metrics</a> instead of probes, but the basic outline is the same. Details about Glean Metrics are collected inside the <a href="https://dictionary.telemetry.mozilla.org">Glean Dictionary</a>.</p>
<h2 id="profile"><a class="header" href="#profile">Profile</a></h2>
<p>All of the changes a user makes in Firefox, such as the home page, toolbar configuration, installed add-ons, saved passwords, and bookmarks, are stored in a special folder called a profile. Telemetry stores archived and pending pings in the profile directory, as well as metadata like the <a href="concepts/terminology.html#client-id">client id</a>. See also <a href="concepts/./profile/profile_creation.html">Profile Creation</a>.</p>
<h2 id="query"><a class="header" href="#query">Query</a></h2>
<p>Typically refers to a query written in SQL and run on (for example) <a href="concepts/terminology.html#stmo-sqltelemetrymozillaorg">STMO</a>.</p>
<h2 id="retention"><a class="header" href="#retention">Retention</a></h2>
<ul>
<li>
<p>As in “Data retention” - how long is data stored before it is automatically deleted or archived?</p>
</li>
<li>
<p>As in “User retention” - how likely is a user to continue using a product?</p>
</li>
</ul>
<h2 id="schema"><a class="header" href="#schema">Schema</a></h2>
<p>A schema is the organization or structure of our data. We use schemas at many levels (in data ingestion and storage) to make sure the data we submit is valid and can be processed efficiently.</p>
<h2 id="session"><a class="header" href="#session">Session</a></h2>
<p>The period of time between Firefox starting up and shutting down. See also <a href="concepts/terminology.html#subsession">subsession</a>.</p>
<h2 id="subsession"><a class="header" href="#subsession">Subsession</a></h2>
<p>In Firefox, <a href="concepts/terminology.html#session">sessions</a> are split into subsessions every 24 hours, or whenever the environment changes. See <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/concepts/sessions.html?highlight=subsession">here for more details</a>.</p>
<h2 id="stmo-sqltelemetrymozillaorg"><a class="header" href="#stmo-sqltelemetrymozillaorg">STMO (sql.telemetry.mozilla.org)</a></h2>
<p>A service for creating queries and dashboards. See <a href="concepts/../introduction/tools.html#sqltelemetrymozillaorg-stmo">STMO under analysis tools</a>.</p>
<h2 id="telemetry"><a class="header" href="#telemetry">Telemetry</a></h2>
<p>As you use Firefox, Telemetry measures and collects non-personal information, such as performance, hardware, usage and customizations. It then sends this information to Mozilla on a daily basis and we use it to improve Firefox.</p>
<h2 id="uri"><a class="header" href="#uri">URI</a></h2>
<p>Uniform Resource Identifier - a string that refers to a resource. The most common are <a href="concepts/terminology.html#url">URLs</a>, which identify the resource by giving its location on the Web (<a href="https://developer.mozilla.org/en-US/docs/Glossary/URI">source</a>).</p>
<h2 id="url"><a class="header" href="#url">URL</a></h2>
<p>Uniform Resource Locator - a text string that specifies where a resource (such as a web page, image, or video) can be found on the Internet (<a href="https://developer.mozilla.org/en-US/docs/Glossary/URL">source</a>). For example, <code>https://www.mozilla.org</code> is a URL.</p>
<h2 id="wau"><a class="header" href="#wau">WAU</a></h2>
<p>Weekly Active Users - The number of unique profiles active at least once during the 7-day window
ending on the specified day.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/terminology.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="tutorials--cookbooks"><a class="header" href="#tutorials--cookbooks">Tutorials &amp; Cookbooks</a></h1>
<p>This section contains documentation describing how to perform specific tasks. It includes the following sections:</p>
<ul>
<li><a href="cookbooks/getting_started/index.html">Getting Started</a>: How to get started.</li>
<li><a href="cookbooks/analysis/index.html">Analysis Cookbooks</a>: Tutorials on analyzing data.</li>
<li><a href="cookbooks/operational/index.html">Operational Cookbooks</a>: Tutorials describing how to perform various operational tasks.</li>
<li><a href="cookbooks//datasets/new_data.html">Sending Telemetry</a>: Tutorials on adding new Telemetry.</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/index.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="getting-started"><a class="header" href="#getting-started">Getting started</a></h1>
<p>This section contains some basic tutorials on how to get up and running with Mozilla's data.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/getting_started/index.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="accessing-telemetry-data"><a class="header" href="#accessing-telemetry-data">Accessing Telemetry data</a></h1>
<h2 id="public-data"><a class="header" href="#public-data">Public Data</a></h2>
<p>Aggregated information on the Firefox user population (including hardware, operating system, and other usage characteristics) is available at the <a href="https://data.firefox.com/">Firefox Public Data Report</a> portal.</p>
<p>In addition, a set of curated datasets are available to the public for research purposes. See the <a href="concepts/../cookbooks/public_data.html">public data cookbook</a> for more information.</p>
<h2 id="non-public-data"><a class="header" href="#non-public-data">Non-public Data</a></h2>
<p>Access to other Telemetry data is limited to two groups:</p>
<ol>
<li>Mozilla employees and contractors</li>
<li>Contributors who have signed a <a href="https://wiki.mozilla.org/NDA">non-disclosure agreement</a>, have a sustained track record of contribution, and have a demonstrated need to access this data</li>
</ol>
<p>If you are an employee or contractor, you should already have the necessary permissions.</p>
<p>If you are a contributor and want to request access to Mozilla Telemetry data, <a href="https://bugzilla.mozilla.org/enter_bug.cgi?product=Data%20Platform%20and%20Tools&amp;component=Operations">file a bug in the operations component</a>
and ask an established Mozilla contributor or employee to vouch for you.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/gaining_access.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="getting-help"><a class="header" href="#getting-help">Getting Help</a></h1>
<h2 id="mailing-lists"><a class="header" href="#mailing-lists">Mailing lists</a></h2>
<p>Telemetry-related announcements that include new datasets, outages, feature
releases, etc. are sent to <a href="https://groups.google.com/a/mozilla.org/g/fx-data-dev"><code>fx-data-dev@mozilla.org</code></a>, a public
mailing list. Follow the link for archives and information on how to subscribe.</p>
<h2 id="matrix"><a class="header" href="#matrix">Matrix</a></h2>
<p>You can locate us in the <a href="https://chat.mozilla.org/#/room/#telemetry:mozilla.org">#telemetry:mozilla.org</a> channel on <a href="https://wiki.mozilla.org/Matrix">Mozilla's instance of matrix</a>.</p>
<h2 id="slack"><a class="header" href="#slack">Slack</a></h2>
<p>You can ask questions (and get answers!) in <a href="https://mozilla.slack.com/messages/data-help"><code>#data-help</code></a> on Mozilla Internal's
Slack. See also <a href="https://mozilla.slack.com/messages/data"><code>#data</code></a> for general data-related discussion.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/getting_help.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="reporting-a-problem"><a class="header" href="#reporting-a-problem">Reporting a problem</a></h1>
<p>If you see a problem with data tools, datasets, or other pieces of infrastructure,
report it!</p>
<p>Defects in the data platform and tools are tracked in Bugzilla in the <a href="https://bugzilla.mozilla.org/enter_bug.cgi?product=Data%20Platform%20and%20Tools">Data Platform and Tools</a> product.</p>
<p>Bugs need to be filed in the closest-matching component in the Data Platform and Tools
product. If you are not able to locate an appropriate component for the item in question, file an issue
in the <a href="https://bugzilla.mozilla.org/enter_bug.cgi?product=Data%20Platform%20and%20Tools&amp;component=General">General component</a>.</p>
<p>Components are triaged at least weekly by the component owner(s). For any issues that need
urgent attention, it is recommended that you use the <code>needinfo</code> flag to attract attention
from a specific person. If an issue does not receive the appropriate attention in a
week (or it is urgent), see <a href="concepts/getting_help.html">getting help</a>.</p>
<p>When a bug is triaged, it is assigned a <strong>priority</strong> and <strong>points</strong>. <strong>Priorities</strong> are processed as follows:</p>
<ul>
<li><strong><code>P1</code></strong>: in active development in the current sprint</li>
<li><strong><code>P2</code></strong>: planned to be worked on in the current quarter</li>
<li><strong><code>P3</code></strong>: planned to be worked on next quarter</li>
<li><strong><code>P4</code></strong> and beyond: nice to have, would accept a patch, but not actively being worked on.</li>
</ul>
<p><strong>Points</strong> reflect the amount of effort that is required for a bug. They are assigned as follows:</p>
<ul>
<li><strong>1 point</strong>: one day or less of effort</li>
<li><strong>2 points</strong>: two days of effort</li>
<li><strong>3 points</strong>: three days to a week of effort</li>
<li><strong>5 points</strong> or more: SO MUCH EFFORT, major project.</li>
</ul>
<h3 id="problems-with-the-data"><a class="header" href="#problems-with-the-data">Problems with the data</a></h3>
<p>There are Bugzilla components for several core datasets, as
described in this documentation. If at all possible, assign a specific component to the issue.</p>
<p>If there is an issue with a dataset that does not have its own component,
file an issue in the <a href="https://bugzilla.mozilla.org/enter_bug.cgi?product=Data%20Platform%20and%20Tools&amp;component=Datasets%3A%20General">Datasets: General component</a>.</p>
<h3 id="problems-with-tools"><a class="header" href="#problems-with-tools">Problems with tools</a></h3>
<p>There are Bugzilla components for several of the <a href="concepts/../introduction/tools.html">tools</a> that
comprise the <a href="https://bugzilla.mozilla.org/enter_bug.cgi?product=Data%20Platform%20and%20Tools">Data Platform</a>.
File a bug in the specific component that most closely matches the tool in question.</p>
<p>Operational issues, such as services being unavailable, need to be filed in the <a href="https://mozilla-hub.atlassian.net/secure/CreateIssue.jspa?pid=10058">Data SRE Jira Project</a>.</p>
<ul>
<li>The ticket should contain the following information:
<ul>
<li>Service details</li>
<li>Steps to reproduce</li>
<li>Impact to users</li>
</ul>
</li>
</ul>
<h3 id="other-issues"><a class="header" href="#other-issues">Other issues</a></h3>
<p>When in doubt, file issues in the <a href="https://bugzilla.mozilla.org/enter_bug.cgi?product=Data%20Platform%20and%20Tools&amp;component=General">General component</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/reporting_a_problem.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="analysis"><a class="header" href="#analysis">Analysis</a></h1>
<p>This section contains tutorials on how to analyze Telemetry data.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/analysis/index.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="data-analysis-tools"><a class="header" href="#data-analysis-tools">Data Analysis Tools</a></h1>
<p>This section covers data tools that you can use for discovering what data is available about our products.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/analysis/data_discovery_tools.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="the-data-catalog"><a class="header" href="#the-data-catalog">The Data Catalog</a></h1>
<p>Reference material for data assets (tables, dashboards, pings, etc.) can primarily be found in the Data Catalog: https://mozilla.acryl.io.
It provides an automatically updated &quot;map&quot; of data assets, including lineage and descriptions, without the need for manual curation.</p>
<h2 id="what-do-i-use-it-for"><a class="header" href="#what-do-i-use-it-for">What do I use it for?</a></h2>
<p>The primary use case for the catalog is finding out (at a glance) which data assets exist and how they relate to one another. A few examples:</p>
<ul>
<li>Finding the source ping or table from a Looker dashboard.</li>
<li>Finding out whether a source ping or table has any downstream dependencies.</li>
<li>Getting a high-level overview of how tables are transformed before data shows up in a dashboard.</li>
<li>Tracing a column through various BigQuery tables.</li>
<li>Finding the source query or DAG that powers a particular BigQuery table.</li>
</ul>
<h2 id="how-do-i-use-it"><a class="header" href="#how-do-i-use-it">How do I use it?</a></h2>
<p>Navigate to https://mozilla.acryl.io and log in via SSO. Once logged in, you should be able to explore assets via the
search bar or by clicking on a platform (e.g. BigQuery or Glean).</p>
<h2 id="when-was-this-implemented"><a class="header" href="#when-was-this-implemented">When was this implemented?</a></h2>
<p>We tested a number of tools in 2022 and finally settled on Acryl. Integration work proceeded from there and continues as
we add more tools and assets to our data platform.</p>
<h2 id="is-the-data-catalog-a-replacement-for-tools-like-the-glean-dictionary-or-the-looker-data-dictionary"><a class="header" href="#is-the-data-catalog-a-replacement-for-tools-like-the-glean-dictionary-or-the-looker-data-dictionary">Is the Data Catalog a replacement for tools like the Glean Dictionary or the Looker Data Dictionary?</a></h2>
<p>No. While the features between the Data Catalog and tools such as the Glean Dictionary and Looker Data Dictionary overlap,
the Data Catalog is meant to be less focused on any single tool and more on assets from all the tools in our data platform,
providing lineage and reference material that links them together.</p>
<h2 id="what-software-does-it-use"><a class="header" href="#what-software-does-it-use">What software does it use?</a></h2>
<p>The catalog is a managed version of open source <a href="https://datahubproject.io">DataHub</a>, a metadata platform built and
maintained by the company <a href="https://www.acryldata.io">Acryl</a>.</p>
<h2 id="how-is-the-metadata-populated"><a class="header" href="#how-is-the-metadata-populated">How is the metadata populated?</a></h2>
<p>Metadata is pulled from each included platform. Depending on the source, metadata ingestion is either managed
in the Acryl UI or via our custom ingestion code:</p>
<ul>
<li>Glean - Pings are ingested from the Glean Dictionary API. This is scheduled nightly in CircleCI. The ingestion code is located in the <a href="https://github.com/mozilla/mozilla-datahub-ingestion">mozilla-datahub-ingestion repository</a>.</li>
<li>Legacy Telemetry - Pings are ingested from the <a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/">Mozilla Pipeline Schemas repository</a>. This is scheduled nightly in CircleCI. The ingestion code is located in the <a href="https://github.com/mozilla/mozilla-datahub-ingestion">mozilla-datahub-ingestion repository</a>.</li>
<li>BigQuery - Views, Tables, Datasets, and Projects are ingested from the BigQuery audit log and query jobs. This is scheduled nightly in the Acryl UI. The documentation can be found on the <a href="https://datahubproject.io/docs/generated/ingestion/sources/bigquery/">DataHub docs page</a>.</li>
<li>Looker - Views, Explores, and Dashboards are ingested from both our LookML source repositories (e.g. <a href="https://github.com/mozilla/looker-spoke-default/">spoke-default</a>) and the Looker API. This is scheduled nightly in the Acryl UI. The documentation can be found on the <a href="https://datahubproject.io/docs/generated/ingestion/sources/looker/">DataHub docs page</a>.</li>
<li>Metric-Hub - Metrics are ingested from the <a href="https://github.com/mozilla/metric-hub/">metric-hub repository</a> and loaded into the Business Glossary. This is scheduled nightly in CircleCI. The ingestion code is located in the <a href="https://github.com/mozilla/mozilla-datahub-ingestion">mozilla-datahub-ingestion repository</a> and the documentation can be found on the <a href="https://datahubproject.io/docs/generated/ingestion/sources/business-glossary/">DataHub docs page</a>.</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/analysis/data_catalog.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="using-the-glean-dictionary"><a class="header" href="#using-the-glean-dictionary">Using the Glean Dictionary</a></h1>
<p>The Glean Dictionary is a web-based tool that allows you to look up information on all the metrics<sup class="footnote-reference"><a href="#1">1</a></sup> defined in applications built using <a href="cookbooks/analysis/../../concepts/glean/glean.html">Glean</a>, Mozilla's next-generation Telemetry SDK.
Like Glean itself, it is built using lessons learned in the implementation of what came before (the <a href="cookbooks/analysis/./probe_dictionary.html">probe dictionary</a> in this case).
In particular, the Glean Dictionary is designed to be more accessible to those without deep knowledge of instrumentation and/or data platform internals.</p>
<h2 id="how-to-use"><a class="header" href="#how-to-use">How to use</a></h2>
<p>You can visit the Glean Dictionary at <a href="https://dictionary.telemetry.mozilla.org/"><code>dictionary.telemetry.mozilla.org</code></a>.
As its content is generated entirely from publicly available source code, there is no access control.</p>
<p>From the top level, you can select an application you want to view the metrics for.
After doing so, you can search for metrics by name (e.g.: <code>addons.enabled_addons</code>), type (e.g.: <code>string_list</code>), or tags (e.g. <code>WebExtensions</code>).</p>
<p>After selecting a metric, you can get more information on it including a reference to its definition in the source code as well as information on how to get the data submitted by this probe in some of our data tools like <a href="cookbooks/analysis/../../introduction/tools.html#sqltelemetrymozillaorg-stmo">STMO</a>, <a href="cookbooks/analysis/../../introduction/tools.html#looker">Looker</a>, and <a href="cookbooks/analysis/../../introduction/tools.html#glean-aggregated-metrics-dashboard-glam">GLAM</a>.</p>
<h2 id="common-questions"><a class="header" href="#common-questions">Common Questions</a></h2>
<h3 id="how-can-i-go-from-a-metric-to-querying-it-in-bigquery"><a class="header" href="#how-can-i-go-from-a-metric-to-querying-it-in-bigquery">How can I go from a metric to querying it in BigQuery?</a></h3>
<p>Underneath the metric definition, look for the section marked &quot;access&quot;. This should tell you the BigQuery table where the data for the metric is stored, along with the column name needed to access it.</p>
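<p>For illustration, such a query might look like the following. This is a minimal sketch only: the table (<code>org_mozilla_firefox.metrics</code>) and the column (<code>metrics.counter.some_metric_name</code>) are placeholders that should be replaced with the exact names shown in the metric's &quot;access&quot; section.</p>
<pre><code class="language-sql">-- Sketch: summing a hypothetical Glean counter metric per day.
-- Replace the table and column with the names from the "access" section.
SELECT
  DATE(submission_timestamp) AS submission_date,
  SUM(metrics.counter.some_metric_name) AS total
FROM org_mozilla_firefox.metrics
WHERE DATE(submission_timestamp) = '2021-08-01'
GROUP BY submission_date;
</code></pre>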
<p>For several examples of this along with a more complete explanation, see <a href="cookbooks/analysis/../accessing_glean_data.html">Accessing Glean Data</a>.</p>
<div class="footnote-definition" id="1"><sup class="footnote-definition-label">1</sup>
<p>Note that Glean refers to &quot;probes&quot; (in the old-school Firefox parlance) as &quot;metrics&quot;.</p>
</div>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/analysis/glean_dictionary.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="using-the-probe-dictionary"><a class="header" href="#using-the-probe-dictionary">Using the Probe Dictionary</a></h1>
<p>The Probe Dictionary is a web-based tool that allows you to look up information on all the probes defined in Firefox's source code.
Until <a href="https://firefox-source-docs.mozilla.org/toolkit/components/glean/index.html">Firefox on Glean</a> is finished, the Probe Dictionary is the best way to look up what data is submitted by Firefox.</p>
<p>Note that the Probe Dictionary has not kept pace with many changes that have been made to the Mozilla data platform in the last couple of years.
However, with some knowledge of how Firefox and the data platform work, you can still quickly find the data that you need.
If you have questions, don't hesitate to <a href="cookbooks/analysis/../../concepts/getting_help.html">ask for help</a>.</p>
<h2 id="how-to-use-1"><a class="header" href="#how-to-use-1">How to use</a></h2>
<p>You can visit the Probe Dictionary at <a href="https://probes.telemetry.mozilla.org/"><code>probes.telemetry.mozilla.org</code></a>.
As its content is generated entirely from publicly available source code in Firefox, there is no access control.</p>
<p>From the top level, you can search for a probe by name, description, or other category by entering the appropriate text in the search box.</p>
<p>If you click on a probe, you can get more information on it including a reference to its definition in the source code as well as information on how to get the data submitted by this probe in some of our data tools like STMO and the Telemetry Dashboard.</p>
<h2 id="common-questions-1"><a class="header" href="#common-questions-1">Common Questions</a></h2>
<h3 id="how-can-i-tell-if-a-probe-is-still-active"><a class="header" href="#how-can-i-tell-if-a-probe-is-still-active">How can I tell if a probe is still active?</a></h3>
<p>Look at the &quot;recorded (nightly)&quot; column after the probe definition in the summary.
If it gives a range and it ends before the current release, the probe is not active anymore.
For example, the <code>a11y.sitezoom</code> probe was only recorded in Nightly from Firefox 73 to 77.</p>
<p><img src="cookbooks/analysis/../../assets/PTMO_example_expired.png" alt="" /></p>
<h3 id="how-can-i-go-from-a-probe-to-querying-it-in-bigquery"><a class="header" href="#how-can-i-go-from-a-probe-to-querying-it-in-bigquery">How can I go from a probe to querying it in BigQuery?</a></h3>
<p>Look in the &quot;available in&quot; section underneath the probe.</p>
<p><img src="cookbooks/analysis/../../assets/PTMO_example.png" alt="" /></p>
<p>You can use this information to query the data submitted by the probe in BigQuery using <a href="cookbooks/analysis/../../tools/stmo.html">STMO</a> or other tools.
For example, this query gives you the counts of the distinct values for <code>a11y.hcm_background</code>:</p>
<pre><code class="language-sql">SELECT payload.processes.parent.scalars.a11y_hcm_background AS a11y_hcm_background,
count(*) AS COUNT
FROM telemetry.main_1pct
WHERE DATE(submission_timestamp)='2021-08-01' group by 1;
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/81415/source"><code>STMO#81415</code></a></p>
<p>Other measurement types may require more complicated queries.
For information on querying exponential histograms, see <a href="cookbooks/analysis/../main_ping_exponential_histograms.html">Visualizing Percentiles of a Main Ping Exponential Histogram</a>.</p>
<p>Note that the metric's information may also appear in derived datasets, not just the raw ping tables discussed above.
For more information on this (and how to explore data stored in derived data sets), see <a href="cookbooks/analysis/../bigquery/accessing_desktop_data.html">Accessing Desktop Data</a>.</p>
<h3 id="for-keyed-scalars-how-can-i-find-out-what-the-keys-mean"><a class="header" href="#for-keyed-scalars-how-can-i-find-out-what-the-keys-mean">For keyed scalars, how can I find out what the keys mean?</a></h3>
<p>First, check the probe description: basic documentation is often there. For example, in the <code>a11y.theme</code> probe it says:</p>
<pre><code>OS high contrast or other accessibility theme is enabled. The result is split into keys which represent the values of browser.display.document_color_use: &quot;default&quot;, &quot;always&quot;, or &quot;never&quot;.
</code></pre>
<p>If this is not given, your best option is probably to look at the Firefox source code using <a href="https://searchfox.org/">Searchfox</a> (a link to a sample query is provided by Probe Dictionary).
Again, feel free to <a href="cookbooks/analysis/../../concepts/getting_help.html">ask for help</a> if you need it.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/analysis/probe_dictionary.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="introduction-to-bigeye-data-observability-platform"><a class="header" href="#introduction-to-bigeye-data-observability-platform">Introduction to Bigeye (Data Observability Platform)</a></h1>
<p>Mozilla uses Bigeye as its data observability platform to ensure high data quality and reliability across its pipelines. Bigeye offers powerful features like automated anomaly detection, detailed data lineage tracking, and customizable monitoring. These capabilities allow teams to swiftly identify, diagnose, and resolve data issues, enhancing overall data integrity and operational efficiency.</p>
<h2 id="accessing-bigeye"><a class="header" href="#accessing-bigeye">Accessing Bigeye</a></h2>
<p>You can access Mozilla's instance of Bigeye at <a href="https://app.bigeye.com/">app.bigeye.com</a>.
If you do not have the necessary access or permissions, please <a href="https://mozilla-hub.atlassian.net/browse/DENG-4728">submit a Jira ticket</a>.</p>
<h2 id="getting-started-1"><a class="header" href="#getting-started-1">Getting Started</a></h2>
<p>Watch the <a href="https://www.youtube.com/watch?v=8DWyZuU-w1c&amp;t=9s">Bigeye tutorial</a> to get an overview of the platform.
Please refer to additional helpful videos available on the Bigeye channel.</p>
<h2 id="stay-updated-on-whats-happening"><a class="header" href="#stay-updated-on-whats-happening">Stay Updated on What's Happening</a></h2>
<p><a href="https://mozilla-hub.atlassian.net/browse/DENG-4563">Jira Epic Link</a></p>
<h2 id="have-an-issue-or-looking-for-a-new-feature-in-bigeye"><a class="header" href="#have-an-issue-or-looking-for-a-new-feature-in-bigeye">Have an issue or looking for a new feature in Bigeye</a></h2>
<p>If you're experiencing an issue while using Bigeye or are looking for a new feature, please add it to the issues tracker for <a href="https://docs.google.com/spreadsheets/d/1L7JrbgTaVsKKFIK3ilKKttUfuzHh91xSeBFC0VZTp68">Bigeye</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/data_monitoring/intro.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="bigeye-interface"><a class="header" href="#bigeye-interface">Bigeye Interface</a></h1>
<h2 id="catalog"><a class="header" href="#catalog">Catalog</a></h2>
<p>The Catalog tab in the left-hand menu offers a comprehensive view of all data sources connected to Bigeye, making it simple to navigate and manage your entire data ecosystem.</p>
<p>If you are an admin, you will have access to the &quot;Add Source&quot; button, allowing you to easily integrate new data sources and BI tools.</p>
<p><img src="cookbooks/data_monitoring/../../assets/Bigeye/Bigeye-Catalog.png" alt="" /></p>
<p>The Bigeye catalog refreshes automatically every 24 hours to detect new datasets and schema changes. You can also manually refresh the catalog anytime by clicking 'Rescan' on the schema changes tab.</p>
<p>For more detailed information about the Catalog, please refer to the <a href="https://docs.bigeye.com/docs/catalog">Catalog documentation page</a>.</p>
<p>Watch the <a href="https://www.youtube.com/watch?v=8DWyZuU-w1c&amp;t=9s">Bigeye tutorial</a> on how to navigate Bigeye Catalog.</p>
<h2 id="workspaces"><a class="header" href="#workspaces">Workspaces</a></h2>
<p>Workspaces in Bigeye allow multiple teams to collaborate simultaneously, with each team managing and monitoring their own data independently.
Each Bigeye workspace includes its own Catalog, BI and ETL tools, Metrics and issues, Templates and schedules, Collections, and Deltas.</p>
<p>We are in the process of setting up user workspaces that will align with our existing data access restrictions.</p>
<p>If you do not find a suitable workspace, please submit a <a href="https://mozilla-hub.atlassian.net/browse/DENG-4727">Jira ticket</a>.</p>
<p><img src="cookbooks/data_monitoring/../../assets/Bigeye/Bigeye-Workspace.png" alt="" /></p>
<h2 id="collections"><a class="header" href="#collections">Collections</a></h2>
<p>Collections in Bigeye allow you to group related metrics, making it easier to manage and monitor them together.</p>
<p>If you don't find a collection that fits your product or requirements, admins can create a new collection.
If you're not an admin, please submit a <a href="https://mozilla-hub.atlassian.net/browse/DENG-4726">Jira ticket</a> with the necessary details.</p>
<p>Watch the <a href="https://www.youtube.com/watch?v=4H5AM0a71bs&amp;list=PLUmsPWeo8j4U9SpGCnAe9syilD4_jBgSI&amp;index=8">Bigeye tutorial</a> on how to navigate Bigeye Catalog.</p>
<h2 id="issues"><a class="header" href="#issues">Issues</a></h2>
<p>Bigeye's Issues feature helps you track and manage data quality issues detected by the platform.
You can assign, prioritize, and resolve issues within the platform, ensuring that your data quality remains high. Issues can be categorized and filtered to streamline the resolution process across teams.</p>
<p>For more details, refer to the Bigeye documentation on the <a href="https://docs.bigeye.com/docs/issues">Issues page</a>.</p>
<h2 id="dashboard"><a class="header" href="#dashboard">Dashboard</a></h2>
<p>The Dashboard gives users a centralized view of data quality metrics and issues. It offers customizable widgets, real-time metric tracking, and the ability to visualize data health at a glance. Users can configure dashboards to focus on specific metrics or tables and receive immediate insights into their data pipelines' performance.</p>
<p>For additional guidance on using Bigeye Dashboard, please refer to the following documentation:</p>
<ul>
<li><a href="https://docs.bigeye.com/docs/dashboard#set-filters">How to set filters</a></li>
<li><a href="https://docs.bigeye.com/docs/dashboard#monitoring-coverage">Monitoring Coverage</a></li>
<li><a href="https://docs.bigeye.com/docs/dashboard#data-quality">Tracking Data Quality</a></li>
<li><a href="https://docs.bigeye.com/docs/dashboard#issue-response">Issue Response Metrics</a></li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/data_monitoring/interface.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="deploying-metrics"><a class="header" href="#deploying-metrics">Deploying Metrics</a></h1>
<p>To deploy metrics in Bigeye, navigate to a schema, table, or column and click &quot;Add Monitoring.&quot;
You can add metrics via one of four options, then select the metrics you wish to deploy:</p>
<ol>
<li>Freshness and Volume (Pipeline Reliability)</li>
<li>Data Quality</li>
<li>All metrics</li>
<li>Custom SQL</li>
</ol>
<p><img src="cookbooks/data_monitoring/../../assets/Bigeye/Bigeye-Deploymetrics.png" alt="" /></p>
<p><img src="cookbooks/data_monitoring/../../assets/Bigeye/Bigeye-Deploymetrics-2.png" alt="" /></p>
<p>Next, choose the columns to monitor, set schedules, thresholds, and filters, and confirm your selections.
You can assign the metrics to the relevant collection, grouping related metrics for easier management and monitoring.
Once deployed, the metrics appear under the Metrics tab on the relevant schema, table, or column page.
Metrics can be backfilled up to 28 days into the past.</p>
<p><img src="cookbooks/data_monitoring/../../assets/Bigeye/Bigeye-Deploymetrics-3.png" alt="" /></p>
<iframe width="560" height="315" src="https://www.youtube.com/embed/LRhbxFMPTwI?si=yJ98NhvlmGt02eq7" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
<p>For more details, please refer to the Bigeye documentation on <a href="https://docs.bigeye.com/docs/deploy-metrics">how to deploy metrics</a>.</p>
<p>Watch the Bigeye tutorial on <a href="https://www.youtube.com/watch?v=jNzSki59AWQ">how to use the metrics page</a></p>
<h2 id="freshness-and-volume-pipeline-reliability"><a class="header" href="#freshness-and-volume-pipeline-reliability">Freshness and Volume (Pipeline Reliability)</a></h2>
<p>Bigeye tracks data quality by monitoring the timeliness (freshness) and completeness (volume) of your data, checking them hourly.
Initially, it looks back 28 days; subsequent runs look back 2 days. For volume, it aggregates row counts into hourly buckets, using the same lookback periods. You can choose between <a href="https://docs.bigeye.com/docs/manual-thresholds"><code>manual</code></a> thresholds and <a href="https://docs.bigeye.com/docs/autothresholds"><code>Autothresholds</code></a>, which learn typical patterns and alert on anomalies.</p>
<p>Only one Freshness and one Volume metric can be deployed per table.
<strong>Cost Consideration:</strong> Freshness and Volume metrics are included by default for each table and are free of charge.</p>
<p><img src="cookbooks/data_monitoring/../../assets/Bigeye/Bigeye-Pipelinereliability.png" alt="" /></p>
<p><img src="cookbooks/data_monitoring/../../assets/Bigeye/Bigeye-Pipelinereliability-2.png" alt="" /></p>
<p>Please refer to Bigeye documentation for more details on <a href="https://docs.bigeye.com/docs/freshness-and-volume-pipeline-reliability-copy">Freshness and Volume metrics</a>.</p>
<h2 id="list-of-available-metrics"><a class="header" href="#list-of-available-metrics">List of available metrics</a></h2>
<p>Bigeye offers a range of available metrics to monitor data quality and reliability across your data pipelines.
These metrics cover areas such as data freshness, volume, distribution, schema changes, and anomalies. You can deploy these metrics to track key performance indicators and ensure your data meets expected standards.</p>
<p>Please refer to the Bigeye documentation for the <a href="https://docs.bigeye.com/docs/available-metrics">list of available metrics</a>.</p>
<p>Watch the Bigeye tutorial on <a href="https://www.youtube.com/watch?v=jNzSki59AWQ">metric types</a>.</p>
<h2 id="autometrics"><a class="header" href="#autometrics">Autometrics</a></h2>
<p>Autometrics are suggested metrics that monitor anomalies in column-level data, automatically generated for all new datasets in Bigeye. They can be found under the Autometrics tab in the Catalog when viewing a source, schema, table, or column page.</p>
<blockquote>
<p><strong>Try to avoid this option!</strong> On tables with many columns, a large number of monitors might get deployed. This increases noise and cost. Instead, it is recommended to <a href="cookbooks/data_monitoring/deploying_metrics.html#list-of-available-metrics">choose relevant metrics from the list of available metrics</a> manually.</p>
</blockquote>
<p><img src="cookbooks/data_monitoring/../../assets/Bigeye/Bigeye-Autometrics.png" alt="" /></p>
<h2 id="custom-sql"><a class="header" href="#custom-sql">Custom SQL</a></h2>
<p>Custom SQL rules are useful for addressing unique data quality requirements that standard metrics may not cover. Once set, these rules integrate into your monitoring workflow.</p>
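<p>As a rough illustration, a custom check is typically expressed as a SQL query returning a numeric value that is then tracked against a threshold. The table name below and the exact template format are assumptions; see the Bigeye documentation for the precise syntax it expects.</p>
<pre><code class="language-sql">-- Sketch of a custom data quality check: the fraction of rows with a
-- NULL client_id on the most recent day of data. Table name is illustrative.
SELECT
  COUNTIF(client_id IS NULL) / COUNT(*) AS null_client_id_ratio
FROM telemetry.clients_daily
WHERE submission_date = DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)
</code></pre>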
<h2 id="recommendations--best-practices-to-deploying-metrics"><a class="header" href="#recommendations--best-practices-to-deploying-metrics">Recommendations / Best Practices to deploying metrics</a></h2>
<ul>
<li>
<p>It's recommended to avoid deploying Autometrics extensively, as this could result in a low signal-to-noise ratio, leading to unnecessary alerts and potential distraction.</p>
</li>
<li>
<p>When deploying metrics on search tables, we observed that the <code>median</code> calculation using the BigQuery function does not work as expected. Due to this limitation, it is recommended to avoid using the median metric in these scenarios to ensure accurate results.</p>
</li>
<li>
<p>Autothresholds are recommended for freshness and volume metrics, as they automatically adjust based on typical patterns. For other metrics, it's advisable to manually set thresholds to ensure accuracy and relevance.</p>
</li>
<li>
<p>It is recommended to add metrics at the view level rather than directly on tables. This ensures that even if a table becomes obsolete or is upgraded, unnecessary checks on previous versions are avoided. The only exception to this rule is for freshness and volume metrics, which must be deployed directly on tables.</p>
</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/data_monitoring/deploying_metrics.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="collections-in-bigeye"><a class="header" href="#collections-in-bigeye">Collections in Bigeye</a></h1>
<p>Collections help you organize and focus on specific areas of interest, making it simpler to track and address data quality across different segments of your data landscape. This feature enhances efficiency by allowing users to monitor grouped entities in a cohesive manner.</p>
<p><img src="cookbooks/data_monitoring/../../assets/Bigeye/Bigeye-Collections.png" alt="" /></p>
<h2 id="creating-a-new-collection"><a class="header" href="#creating-a-new-collection">Creating a new collection</a></h2>
<p>If you don't find a collection that fits your product or requirements, admins can create a new collection.</p>
<iframe width="560" height="315" src="https://www.youtube.com/embed/YTOOTFw5MLw?si=X3VybaWasts-sdjw" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
<p>If you're not an admin, please <a href="https://mozilla-hub.atlassian.net/browse/DENG-4726">submit a Jira ticket</a> with the necessary details.</p>
<h2 id="adding-metrics-to-a-collection"><a class="header" href="#adding-metrics-to-a-collection">Adding metrics to a collection</a></h2>
<p>To add metrics to a collection in Bigeye, navigate to the collection you want to update and click &quot;Add Metrics.&quot; You can search or filter for specific metrics that align with your monitoring goals.</p>
<iframe width="560" height="315" src="https://www.youtube.com/embed/ZFRAaeX6z8w?si=ba3jYHTQNZPDi9ua" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
<h2 id="adding-notifications-to-a-collection"><a class="header" href="#adding-notifications-to-a-collection">Adding notifications to a collection</a></h2>
<p>One useful feature of collections is the ability to add notifications. To set this up, click the &quot;Edit&quot; button, then navigate to the &quot;Notifications&quot; tab in the modal that appears.</p>
<iframe width="560" height="315" src="https://www.youtube.com/embed/QH37mnkkuW8?si=7cMx8eJq_wgbSFX_" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/data_monitoring/collections.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="issues-management"><a class="header" href="#issues-management">Issues management</a></h1>
<p>The <code>Issues</code> tab allows filtering issues by parameters like severity or date, and reviewing details such as impacted metrics and tables.</p>
<p><img src="cookbooks/data_monitoring/../../assets/Bigeye/issues-tab.png" alt="The Issues overview on BigEye, allowing to filter issues" /></p>
<h2 id="view-issue-details"><a class="header" href="#view-issue-details">View issue details</a></h2>
<p>Once you click on an issue, you can view a metric chart that displays a time series visualization of the alerting metric.</p>
<p><img src="cookbooks/data_monitoring/../../assets/Bigeye/Issues-2.png" alt="" /></p>
<h2 id="status-of-issue"><a class="header" href="#status-of-issue">Status of Issue</a></h2>
<p>Users can change the status to &quot;Active,&quot; &quot;Resolved,&quot; &quot;Muted,&quot; or &quot;Dismissed,&quot; depending on the issue's current state. This allows for better tracking and management of issues across data pipelines. Status updates are reflected in the timeline, providing a clear history of the issue's progression.</p>
<p><img src="cookbooks/data_monitoring/../../assets/Bigeye/Issues-4.png" alt="" /></p>
<p>Use the Mute button above the timeline to mute an issue and stop being notified about subsequent alerts. The available mute options in Bigeye are Unmute, 1 hour, 4 hours, 12 hours, 24 hours, or 1 week.</p>
<p><img src="cookbooks/data_monitoring/../../assets/Bigeye/Issues-5.png" alt="" /></p>
<p>For more details, refer to the <a href="https://docs.bigeye.com/docs/change-the-issue-status">Bigeye documentation</a>.</p>
<h2 id="debug"><a class="header" href="#debug">Debug</a></h2>
<p>Use the queries in the Debug tab to troubleshoot your issue.</p>
<iframe width="560" height="315" src="https://www.youtube.com/embed/rSWAl7f8vcc?si=YA2HgEcRIyL0FKMC" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
<h2 id="jira-integration"><a class="header" href="#jira-integration">Jira Integration</a></h2>
<p>The Bigeye integration with Jira enables teams to track critical data quality issues seamlessly through Jira's flexible interface. The bi-directional integration ensures that updates made in either Jira or Bigeye are synced across both systems. Once a Jira ticket is created in Bigeye, any status changes or comments are automatically reflected in both platforms, keeping all team members informed.</p>
<iframe width="560" height="315" src="https://www.youtube.com/embed/5UtAkwvjt5U?si=wnpb6fqhQMPvS6wO" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
<h2 id="slack-integration-coming-soon"><a class="header" href="#slack-integration-coming-soon">Slack Integration [Coming Soon]</a></h2>
<p>Bigeye enables users to take direct action on issues from Slack messages without needing to navigate to the Bigeye web interface.
Users can resolve, mute, or dismiss alerts directly from Slack messages, ensuring efficient workflows and quick responses to data quality issues.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/data_monitoring/issues_management.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="cost-considerations"><a class="header" href="#cost-considerations">Cost considerations</a></h1>
<p>The Freshness and Volume metrics, which represent Pipeline Reliability, are included in the free tier. There is no charge when these metrics are added to a table.</p>
<p>However, if additional monitors are added to a table, it becomes a billable table, and charges will apply based on the number of billable tables.</p>
<p><strong>Try to avoid Autometrics!</strong> On tables with many columns, a large number of monitors might get deployed. This increases noise and cost. Instead, it is recommended to <a href="cookbooks/data_monitoring/deploying_metrics.html#list-of-available-metrics">choose relevant metrics from the list of available metrics</a> manually.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/data_monitoring/cost_considerations.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="bigeye---further-reading"><a class="header" href="#bigeye---further-reading">Bigeye - Further reading</a></h1>
<ul>
<li>
<p>Kinds of <a href="https://www.youtube.com/watch?v=8DWyZuU-w1c&amp;t=9s">alert thresholds</a> that Bigeye supports</p>
</li>
<li>
<p>Use <a href="https://www.youtube.com/watch?v=la8pq7rjT-M&amp;list=PLUmsPWeo8j4U9SpGCnAe9syilD4_jBgSI&amp;index=9">Deltas</a> to compare datasets and validate successful data replication, migration, or model code changes.</p>
</li>
<li>
<p><a href="https://www.youtube.com/watch?v=TmwyEfq1Xgo&amp;list=PLUmsPWeo8j4U9SpGCnAe9syilD4_jBgSI&amp;index=4">Row Creation</a> - Concept of row creation time and how helps save costs and optimize your data observability</p>
</li>
<li>
<p><a href="https://docs.bigeye.com/docs/schedules">Creating a schedule and assigning it to a metric</a></p>
</li>
<li>
<p>Creating a <a href="https://www.youtube.com/watch?v=8YVIQIbec6k&amp;list=PLUmsPWeo8j4U9SpGCnAe9syilD4_jBgSI&amp;index=14">Metric Template</a> in order to provide specific functionality.</p>
</li>
<li>
<p><a href="https://www.youtube.com/watch?v=mhPttWI15ro&amp;list=PLUmsPWeo8j4U9SpGCnAe9syilD4_jBgSI&amp;index=8">Grouped metrics</a> allow you to employ group by aggregation.</p>
</li>
<li>
<p><a href="https://docs.bigeye.com/docs/metric-window-types">Lookback window</a></p>
</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/data_monitoring/further_reading.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="data-modeling-guidelines"><a class="header" href="#data-modeling-guidelines">Data Modeling guidelines</a></h1>
<p>This section contains guidelines to consider when implementing new data models.</p>
<p>It's also a collection of best practices related to different parts of the Data Modeling process, including the review of Pull Requests and techniques for performance improvements and cost reduction.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/data_modeling/index.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="where-to-store-our-analytics-data-models"><a class="header" href="#where-to-store-our-analytics-data-models">Where to Store our analytics data models</a></h1>
<p>Mozilla's current setup allows data modeling and business logic to be implemented in different locations, e.g. BigQuery or Looker. The purpose of this document is to provide guidelines for deciding where to store a new asset and for reviewing pull requests that implement them.</p>
<p>For the purpose of this documentation, the analytics data model is defined as the set of data assets designed to collect meaningful data from our raw datasets and structure it for an efficient understanding and analysis of our products, business processes and events.</p>
<h2 id="what-to-store-in-bigquery-datasets-and-the-bigquery-etl-repository"><a class="header" href="#what-to-store-in-bigquery-datasets-and-the-bigquery-etl-repository">What to store in BigQuery datasets and the bigquery-etl repository</a></h2>
<p>bigquery-etl is the repository for tooling as well as for the transformations and business logic that are stored in BigQuery datasets (derived tables, aggregate tables, views, materialized views).</p>
<p>Some examples of logic expected in bigquery-etl:</p>
<ul>
<li>The calculation of <a href="https://docs.telemetry.mozilla.org/metrics/index.html">core metrics</a>: DAU, WAU, MAU, new profiles.</li>
<li>Calculation of <a href="https://docs.telemetry.mozilla.org/datasets/search.html?highlight=search#terminology">search metrics</a>. E.g. Ad clicks, search with ads, organic search.</li>
<li>Calculation of acquisition, retention and churn metrics.</li>
<li>Mapping from partner code to platform for Bing revenue.</li>
<li>Segmentation of clients that require the implementation of business logic, not just filtering on specific columns.</li>
</ul>
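<p>As a concrete illustration of the first item above, the business logic for a core metric such as DAU is implemented as a derived table in bigquery-etl rather than in Looker. The following is only a minimal sketch with an illustrative source table; the real definitions live in the bigquery-etl repository and are more involved.</p>
<pre><code class="language-sql">-- Sketch: daily active users per day, the kind of core-metric logic
-- that belongs in a bigquery-etl derived table. Table name is illustrative.
SELECT
  submission_date,
  COUNT(DISTINCT client_id) AS dau
FROM telemetry.clients_daily
WHERE submission_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 28 DAY)
GROUP BY submission_date;
</code></pre>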
<h2 id="what-to-store-in-looker"><a class="header" href="#what-to-store-in-looker">What to store in Looker</a></h2>
<p>Data aggregations or aggregate awareness to improve performance, preferably ones that don't implement or replicate business logic.</p>
<p>Some examples:</p>
<ul>
<li>Aggregates that summarize or create a subset of a BigQuery dataset and that don't include business logic. Some examples:
<ul>
<li>A subset of data for a specific year. See this <a href="https://github.com/mozilla/looker-spoke-default/blob/4ee892234963d3305f087b99a38caa501e45098f/activity_stream/explores/pocket_tile_impressions.explore.lkml#L6">aggregate for data after 2019 in Looker</a>.</li>
<li>A subset of data with the most used dimensions. See this <a href="https://github.com/mozilla/looker-spoke-default/blob/e1315853507fc1ac6e78d252d53dc8df5f5f322b/mozilla_vpn/explores/subscriptions.explore.lkml#L66">aggregate for specific dimensions and a time frame</a>.</li>
<li>An aggregate that covers a commonly used dashboard or view. See this aggregate to support the views that include a <a href="https://github.com/mozilla/looker-spoke-default/blob/c3e1dba99fe29364fdc8d46bf3a4ea53cfa87c56/combined_browser_metrics/combined_browser_metrics.model.lkml#L18">year over year analysis</a>.</li>
</ul>
</li>
<li>Percentages and click-through rates (e.g. in this view for <a href="https://mozilla.cloud.looker.com/looks/499">Focus Android DAU</a>). These calculations are highly dependent on the dimensions and filters used and cannot always be summed directly, so calculating them in BigQuery is not recommended.</li>
<li>Cumulative days of use. E.g. Implemented as a SUM in the <a href="https://github.com/mozilla/looker-spoke-default/blob/c09b5dd11f977a0c20cf04c872e997712cbe6418/kpi/views/browser_kpis.view.lkml#L40">Browsers KPIs view</a>.</li>
</ul>
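<p>To illustrate why such rates are better computed in the presentation layer, the short query below (a sketch with made-up values) shows that a click-through rate recomputed from summed numerators and denominators differs from the average of per-row rates, which is why pre-computing percentages in BigQuery tends to produce misleading roll-ups:</p>
<pre><code class="language-sql">-- Illustrative only: values are made up.
WITH daily AS (
  SELECT 'US' AS country, 10 AS clicks, 100 AS impressions UNION ALL
  SELECT 'DE', 1, 2
)
SELECT
  SAFE_DIVIDE(SUM(clicks), SUM(impressions)) AS ctr_from_sums,  -- 11 / 102 ≈ 0.108
  AVG(SAFE_DIVIDE(clicks, impressions)) AS avg_of_ctrs          -- (0.10 + 0.50) / 2 = 0.30
FROM daily
</code></pre>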
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/data_modeling/where_to_store.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="using-aggregates-for-cost-saving-and-performance-improvement"><a class="header" href="#using-aggregates-for-cost-saving-and-performance-improvement">Using aggregates for cost saving and performance improvement</a></h1>
<p>A good way to improve performance and reduce cost is to reduce the amount of data scanned by queries, which can be achieved by summarizing and pre-calculating data in aggregates.</p>
<p>This doc covers when to use the different options for aggregating data, along with their limitations, benefits, and examples.</p>
<!-- TOC -->
<ul>
<li><a href="cookbooks/data_modeling/using_aggregates.html#what-are-the-options-available">What are the options available?</a>
<ul>
<li><a href="cookbooks/data_modeling/using_aggregates.html#bigquery-aggregate-tables">BigQuery Aggregate Tables</a></li>
<li><a href="cookbooks/data_modeling/using_aggregates.html#bigquery-materialized-views">BigQuery Materialized Views</a></li>
<li><a href="cookbooks/data_modeling/using_aggregates.html#looker-pdts--aggregate-awareness">Looker PDTs &amp; aggregate awareness</a></li>
</ul>
</li>
<li><a href="cookbooks/data_modeling/using_aggregates.html#important-considerations">Important considerations:</a></li>
<li><a href="cookbooks/data_modeling/using_aggregates.html#when-to-use-each-of-these-aggregates">When to use each of these aggregates?</a>
<ul>
<li><a href="cookbooks/data_modeling/using_aggregates.html#a-bigquery-aggregate-table-is-suitable-when">A BigQuery aggregate table is suitable when:</a></li>
<li><a href="cookbooks/data_modeling/using_aggregates.html#a-materialized-view-is-suitable-when">A Materialized View is suitable when:</a></li>
<li><a href="cookbooks/data_modeling/using_aggregates.html#a-looker-pdt-is-suitable-when">A Looker PDT is suitable when:</a></li>
</ul>
</li>
<li><a href="cookbooks/data_modeling/using_aggregates.html#how-to-measure-the-benefit-and-savings">How to measure the benefit and savings?</a></li>
</ul>
<!-- TOC -->
<h2 id="what-are-the-options-available"><a class="header" href="#what-are-the-options-available">What are the options available?</a></h2>
<h4 id="bigquery-aggregate-tables"><a class="header" href="#bigquery-aggregate-tables">BigQuery Aggregate Tables</a></h4>
<p>Aggregate tables contain the pre-aggregated results of scheduled queries that run some business logic. These tables are created and maintained by the Data Team and are scheduled to be updated periodically via Airflow. A sketch of a typical aggregate query is shown after the link below.</p>
<ul>
<li><a href="https://mozilla.github.io/bigquery-etl/cookbooks/common_workflows/#adding-a-new-scheduled-query">Process to create an aggregate table in BigQuery</a>.</li>
</ul>
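<p>As an illustration, a scheduled aggregate query might look like the sketch below, which rolls client-level rows up to one row per day and country. Table and column names are assumptions for the example; the <code>@submission_date</code> parameter follows the bigquery-etl convention for scheduled queries.</p>
<pre><code class="language-sql">-- Illustrative sketch of an aggregate table query (names are assumptions).
SELECT
  submission_date,
  country,
  COUNT(DISTINCT client_id) AS clients,
  SUM(active_hours_sum) AS active_hours
FROM
  `mozdata.telemetry.clients_daily`
WHERE
  submission_date = @submission_date
GROUP BY
  submission_date,
  country
</code></pre>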
<h4 id="bigquery-materialized-views"><a class="header" href="#bigquery-materialized-views"><a href="https://cloud.google.com/bigquery/docs/materialized-views-intro">BigQuery Materialized Views</a></a></h4>
<p>These are views defined by the developer and then created, managed and incrementally updated by BigQuery, which reads <em>only</em> the changes in the base table to compute results. Materialized view definitions <a href="https://cloud.google.com/bigquery/docs/materialized-views-intro#limitations"><em>do not support</em> certain BigQuery features and expressions</a>, such as UDFs, certain aggregate functions, backfilling or nesting. There is also a limit of 20 materialized views per base table.</p>
<ul>
<li>
<p><a href="https://console.cloud.google.com/bigquery?ws=!1m7!1m6!12m5!1m3!1smozdata!2sus-central1!3s8403c62c-e243-4e57-8d91-5c1fcdf26828!2e1">Template to create a Materialized View</a>.</p>
</li>
<li>
<p>Example.</p>
<pre><code>CREATE MATERIALIZED VIEW `moz-fx-data-shared-prod.monitoring_derived.suggest_click_rate_live_v1`
OPTIONS
(enable_refresh = TRUE, refresh_interval_minutes = 5)
AS
SELECT
TIMESTAMP_TRUNC(submission_timestamp, minute) AS submission_minute,
COUNT(*) AS n,
COUNTIF(release_channel = &quot;release&quot;) AS n_release,
COUNTIF(release_channel = &quot;beta&quot;) AS n_beta,
COUNTIF(release_channel = &quot;nightly&quot;) AS n_nightly,
COUNT(request_id) AS n_merino,
COUNTIF(request_id IS NOT NULL AND release_channel = &quot;release&quot;) AS n_merino_release,
COUNTIF(request_id IS NOT NULL AND release_channel = &quot;beta&quot;) AS n_merino_beta,
COUNTIF(request_id IS NOT NULL AND release_channel = &quot;nightly&quot;) AS n_merino_nightly,
FROM
`moz-fx-data-shared-prod.contextual_services_live.quicksuggest_click_v1`
WHERE
DATE(submission_timestamp) &gt; '2010-01-01'
GROUP BY
1
</code></pre>
</li>
</ul>
<h4 id="looker-pdts--aggregate-awareness"><a class="header" href="#looker-pdts--aggregate-awareness"><a href="https://cloud.google.com/looker/docs/aggregate_awareness">Looker PDTs &amp; aggregate awareness</a></a></h4>
<p>These are aggregations that a developer defines in an Explore file (<code>explore.lkml</code>). From this definition, Looker creates a table in BigQuery's <code>mozdata.tmp</code> using the naming convention <code>scratch schema + table status code + hash value + view name</code> and runs the scheduled updates of the data.
Looker's PDTs and aggregate awareness tables are <em>only</em> used by Looker when at least one of their columns is referenced in a Looker object. These aggregates can be particularly beneficial to avoid having to rebuild dashboards after a schema change.</p>
<ul>
<li>
<p>Template to create aggregate awareness in a Looker Explore, replacing the text inside &lt;&gt; with the actual values:</p>
<pre><code>aggregate_table: &lt;aggregate_name: Descriptive name of this aggregation.&gt; {
query: {
dimensions: [&lt;table&gt;.&lt;columns&gt;]
measures: [&lt;table&gt;.&lt;columns&gt;]
filters: [&lt;table&gt;.&lt;partition_column&gt;: &quot;&lt;Period to materialize in the aggregate e.g. 2 years&gt;&quot;]
}
materialization: {
sql_trigger_value: SELECT CURRENT_DATE() ;;
increment_key: &lt;table&gt;.&lt;partition_column&gt;
increment_offset: &lt;INT: number of periods to update, recommended is 1.&gt; }
}
</code></pre>
</li>
<li>
<p><a href="https://mozilla.cloud.looker.com/projects/spoke-default/files/combined_browser_metrics/explores/active_users_aggregates.explore.lkml">Example of aggregate awareness in a Looker Explore</a></p>
</li>
</ul>
<h2 id="important-considerations"><a class="header" href="#important-considerations">Important considerations</a></h2>
<ul>
<li><a href="https://docs.telemetry.mozilla.org/cookbooks/data_modeling/where_to_store">Store the business logic in BigQuery</a>, preferably in a client_id level table to aggregate from.</li>
<li>All aggregates are version controlled in the git repositories <code>bigquery-etl</code> and <code>spoke-default</code>.</li>
<li>All aggregates require a backfill or update when the source data changes:
<ul>
<li>BigQuery aggregate tables are backfilled using the managed-backfill process.</li>
<li>Materialized views cannot be backfilled; instead, the materialized view needs to be re-created. Schema changes in base tables also invalidate the view and require it to be re-created. By default, materialized views scan all historical data of their referenced base tables, so <em>make sure to set a date filter to reduce the amount of data scanned</em>.
<ul>
<li>Add the date filter before the view is deployed for the first time; otherwise, the initial refresh will scan all of the data in the referenced base tables.</li>
</ul>
</li>
<li>Looker PDTs require following the backfill protocol described in the <a href="https://mozilla.udemy.com/course/looker-training-for-developers/learn/lecture/35440216#overview">Mozilla Looker Developers course</a>.</li>
</ul>
</li>
<li>Indexes, partitioning and clustering are allowed in all cases. Looker PDTs and aggregates require that these be defined in the base table.</li>
<li><strong>For cost savings</strong>: BigQuery retries the update of materialized views after failures, which increases costs because the data is queried multiple times.
<ul>
<li>Monitor for broken materialized views in the <a href="https://mozilla.cloud.looker.com/x/uTZhF7sqlOOvrV4o7It1Cc">BigQuery Usage Explore</a></li>
<li>Use the command <code>bq cancel</code> to stop unnecessary updates. E.g. <code>bq --project_id moz-fx-data-shared-prod cancel moz-fx-data-shared-prod:US.&lt;materialized view&gt;</code>. The permission to use this command is assigned to Data Engineering and Airflow.</li>
</ul>
</li>
</ul>
<h2 id="when-to-use-each-of-these-aggregates"><a class="header" href="#when-to-use-each-of-these-aggregates">When to use each of these aggregates?</a></h2>
<h4 id="a-bigquery-aggregate-table-is-suitable-when"><a class="header" href="#a-bigquery-aggregate-table-is-suitable-when">A BigQuery aggregate table is suitable when:</a></h4>
<ul>
<li>The query requires full flexibility to use DML, data types, aggregation functions and different types of JOIN.</li>
<li>The results should not be unexpectedly affected by Shredder.</li>
<li>The metric requires strict change control.</li>
<li>A scheduled alert is required in case of failure or data out of sync. Airflow sends emails and alerts on failure for BigQuery aggregate tables, which are addressed daily by the Data Engineering team during the Airflow Triage.</li>
<li>The table will be queried directly or used as a source for other analysis. Looker PDTs are not designed to be queried directly.</li>
</ul>
<h4 id="a-materialized-view-is-suitable-when"><a class="header" href="#a-materialized-view-is-suitable-when">A Materialized View is suitable when:</a></h4>
<ul>
<li>Your goal is to aggregate data in real-time (for example, for implementing real-time monitoring of certain metrics).</li>
<li>The results can be based on shredded data (tables with client_id).</li>
<li>The view will be queried directly or used as a source for other analysis.</li>
<li>Change control is <em>not</em> required or is already implemented in the base table. This can be verified by looking for the label <code>change_controlled: true</code> in the table's metadata.</li>
<li>A scheduled alert on failure is not required. Failures must be actively monitored in the <a href="https://mozilla.cloud.looker.com/x/uTZhF7sqlOOvrV4o7It1Cc">BigQuery Usage Explore</a>.</li>
<li>The metric does <em>not</em> require non-deterministic functions that are not supported: RAND(), CURRENT_TIMESTAMP, CURRENT_DATE(), or CURRENT_TIME().</li>
<li>The query does <em>not</em> require UDFs, UNNESTING arrays, COUNT DISTINCT, ORDER BY or any DML operation different from SELECT.</li>
<li>The query uses a WITH clause, COUNTIF, INNER JOIN or TIMESTAMP_ADD. These are all supported.</li>
<li>The data does not need to be backfilled.</li>
<li>When considering materialized views, a common practice is to combine an aggregate table that stores historical data (e.g. older than 2 days) with a materialized view that tracks data arriving in real time. This allows backfills to be run on the aggregate table; see the sketch after this list.</li>
</ul>
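<p>The hybrid pattern mentioned in the last bullet could look like the sketch below: a plain view unions a backfillable aggregate table holding historical data with a materialized view that aggregates live data. All names are assumptions for the example.</p>
<pre><code class="language-sql">-- Illustrative sketch (names are assumptions): serve history from a
-- backfillable aggregate table and the most recent days from a materialized view.
CREATE OR REPLACE VIEW `project.dataset.clicks_per_minute` AS
SELECT submission_minute, n
FROM `project.dataset_derived.clicks_per_minute_aggregates_v1`  -- historical, backfillable
WHERE DATE(submission_minute) &lt; DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY)
UNION ALL
SELECT submission_minute, n
FROM `project.dataset_derived.clicks_per_minute_live_v1`        -- materialized view, near real-time
WHERE DATE(submission_minute) &gt;= DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY)
</code></pre>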
<h4 id="a-looker-pdt-is-suitable-when"><a class="header" href="#a-looker-pdt-is-suitable-when">A Looker PDT is suitable when:</a></h4>
<ul>
<li>Your goal is to improve the performance and query response in dashboards by aggregating data using common query patterns, with the added benefit of not having to re-create the dashboard every time the base table changes.</li>
<li>The results can be based on shredded data (tables with client_id).</li>
<li>Change control is not required or is already implemented in the base table. This can be verified by looking for the label <code>change_controlled: true</code> in the metadata.</li>
<li>A scheduled alert on failure is <em>not</em> required. Failures must be monitored in the <a href="https://mozilla.cloud.looker.com/admin/pdts">PDT Admin Dashboard</a> or in the <a href="https://mozilla.cloud.looker.com/dashboards/system__activity::errors_and_broken_content">Errors and Broken Content Dashboard</a>.</li>
<li>The metrics defined in the Explore use only these aggregations: SUM, COUNT, COUNTIF, MIN, MAX or AVERAGE.</li>
<li>The metrics defined in the Explore use only these data types: NUMBER, DATE, STRING or YESNO.</li>
<li>The aggregate uses a DISTINCT COUNT <em>and</em> the query matches the Explore query exactly.</li>
<li>The base table for the Explore is expected to change (for example, with added columns), which would normally require modifying the Explore and re-creating the dashboards. When using aggregate awareness, this re-creation is <em>not</em> necessary.</li>
</ul>
<h2 id="how-to-measure-the-benefit-and-savings"><a class="header" href="#how-to-measure-the-benefit-and-savings">How to measure the benefit and savings?</a></h2>
<ul>
<li>
<p>Looker displays the amount of data that will be processed with and without using the aggregates, in the top right corner of a view or explore when in development mode.
<img src="cookbooks/data_modeling/looker_cost_saving.png" alt="Looker cost saving" /></p>
</li>
<li>
<p>BigQuery also displays, in the top right corner of the window, the amount of data that will be scanned by a query written in the console.</p>
</li>
<li>
<p>Alternatively, query the information schema to return the bytes processed and the cost. With this information it is possible to compare and calculate the savings that result from using an aggregate instead of querying the base table directly. Using sample data for the comparison will save costs.</p>
<pre><code>-- Bytes processed and approximate slot-based cost for jobs that wrote to a
-- given destination table. Replace the &lt;placeholders&gt; with actual values.
SELECT destination_table.project_id AS project_id,
  destination_table.dataset_id AS dataset,
  SUBSTR(destination_table.table_id, 0, INSTR(destination_table.table_id, '$') - 1) AS table_id,
  SUM(total_bytes_processed / (1024 * 1024 * 1024)) AS gb_processed,
  SUM((total_slot_ms * 0.06) / (60 * 60 * 1000)) AS cost
FROM `moz-fx-data-shared-prod`.`region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT
WHERE EXTRACT(DATE FROM creation_time) BETWEEN &lt;ini_date&gt; AND CURRENT_DATE
  AND destination_table.dataset_id = &lt;dataset_name&gt;
  AND user_email = &lt;user_email&gt;
  AND destination_table.table_id = &lt;table_name&gt;
GROUP BY ALL;
</code></pre>
</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/data_modeling/using_aggregates.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><p>In 2020, Mozilla chose <a href="https://looker.com/">Looker</a> as its primary tool for analyzing data.
It allows data exploration and visualization by experts and non-experts alike.</p>
<p>This section provides an introduction to Looker as well as some tutorials on how to use it.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/looker/index.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="introduction-to-looker"><a class="header" href="#introduction-to-looker">Introduction to Looker</a></h1>
<p>In 2020, Mozilla chose <a href="https://looker.com/">Looker</a> as its primary tool for analyzing data.
It allows data exploration and visualization by experts and non-experts alike.</p>
<p>Access to Looker is currently limited to Mozilla employees and designated contributors. For more information, see <a href="cookbooks/looker/../../concepts/gaining_access.html">gaining access</a>.</p>
<h2 id="table-of-contents-1"><a class="header" href="#table-of-contents-1">Table of Contents</a></h2>
<ul>
<li><a href="cookbooks/looker/intro.html#accessing-looker">Accessing Looker</a></li>
<li><a href="cookbooks/looker/intro.html#getting-started">Getting Started</a>
<ul>
<li><a href="cookbooks/looker/intro.html#front-page">Front page</a></li>
<li><a href="cookbooks/looker/intro.html#explores">Explores</a></li>
<li><a href="cookbooks/looker/intro.html#using-the-glean-dictionary-with-looker">Using the Glean Dictionary with Looker</a></li>
</ul>
</li>
<li><a href="cookbooks/looker/intro.html#going-deeper">Going Deeper</a></li>
</ul>
<h2 id="accessing-looker"><a class="header" href="#accessing-looker">Accessing Looker</a></h2>
<p>You can access Mozilla's instance of Looker at <a href="https://mozilla.cloud.looker.com"><code>mozilla.cloud.looker.com</code></a>.</p>
<h2 id="getting-started-2"><a class="header" href="#getting-started-2">Getting Started</a></h2>
<h3 id="front-page"><a class="header" href="#front-page">Front page</a></h3>
<p>By default, the front page shows a list of folders, which contain links to dashboards.
These are organized by project.
Of particular note is the <a href="https://mozilla.cloud.looker.com/folders/706">KPI Metrics Folder</a>, which includes several Data-produced and vetted dashboards like the Firefox Corporate KPI Dashboard.</p>
<p><img src="cookbooks/looker/../../assets/Looker_screenshots/front-page.png" alt="" /></p>
<h3 id="explores"><a class="header" href="#explores">Explores</a></h3>
<p>One of the core concepts in Looker is the <a href="https://docs.looker.com/exploring-data/exploring-data">Explore</a>.
Explores allow you to quickly explore datasets (both ping-level and derived datasets) within an intuitive user interface.</p>
<p><img src="cookbooks/looker/../../assets/Looker_screenshots/explores.png" alt="" /></p>
<p>You can access the full list of explores available in Looker.
From the main page on the left, select &quot;Explore&quot;.
From there, you can select an explore to view.
Most explores are grouped by application.
For example, there are a set of explores for both &quot;Firefox Desktop&quot; and &quot;Firefox for Android&quot;.</p>
<h3 id="using-the-glean-dictionary-with-looker"><a class="header" href="#using-the-glean-dictionary-with-looker">Using the Glean Dictionary with Looker</a></h3>
<p>The above list of explores can sometimes be overwhelming.
If your application uses <a href="cookbooks/looker/../../concepts/glean/glean.html">Glean</a> to collect data, one very viable workflow is to look up information on the metric(s) you're interested in using the <a href="cookbooks/looker/../analysis/glean_dictionary.html">Glean Dictionary</a>, then use the &quot;Access&quot; section at the bottom, which links directly out to the Looker explore(s) where you can access the data.</p>
<p><img src="cookbooks/looker/../../assets/Looker_screenshots/glean-dictionary-links.png" alt="" /></p>
<p>The following video demonstrates this workflow in detail:</p>
<center>
<iframe width="560" height="315" src="https://www.youtube.com/embed/B635wgZy7Iw" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
</center>
<h2 id="going-deeper"><a class="header" href="#going-deeper">Going Deeper</a></h2>
<p>If you want to explore Looker more deeply, you can check out:</p>
<ul>
<li><a href="https://www.cloudskillsboost.google/journeys/28">&quot;BI and Analytics with Looker&quot; training hub</a>: A collection of self-paced video training courses for new users. Full courses are free, but require registration, but course descriptions contain material that is useful on its own.</li>
<li><a href="https://docs.looker.com/">Looker Documentation</a>: Extensive text and video documentation, a “textbook” reference on how the product works.</li>
<li><a href="https://help.looker.com/">Looker Help Center</a>: Contains articles on common problems, specific use cases, error messages, and best practices.</li>
<li><a href="https://community.looker.com/">Looker Community</a> has customer-written material, broadcasts from Looker employees (primarily release notes), and topics written by Looker employees that are not officially supported by Looker.</li>
</ul>
<p>You can find additional Looker training resources on the <a href="https://mana.mozilla.org/wiki/display/DATA/Looker+Training+Resources">Looker Training Resources</a> mana page (LDAP access required).</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/looker/intro.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="normalizing-country-data"><a class="header" href="#normalizing-country-data">Normalizing Country Data</a></h1>
<p>This how-to guide is about getting standard country data in your Looker Explores, Looks and Dashboards:</p>
<p><img src="cookbooks/looker/../../assets/Looker_screenshots/countries_explore.png" alt="Country View in Explore Image" /></p>
<p>This guide has only two steps: Normalizing Aliases and Accessing Standard Country Data.</p>
<blockquote>
<p>⚠️ Some steps in this guide require knowledge of SQL and LookML - ask in #data-help for assistance if needed.</p>
</blockquote>
<p>We get country data from many sources: partners, telemetry, third-party tools etc.
In order to analyze these in a standard way, i.e. make different analyses comparable,
we can conform these sources to a set of standard country codes, names, regions,
sub-regions, etc.</p>
<h2 id="step-one---normalizing-aliases"><a class="header" href="#step-one---normalizing-aliases">Step One - Normalizing Aliases</a></h2>
<blockquote>
<p>⚠️ If your country data already consists of two-character ISO3166 codes, you can skip to Step Two!</p>
</blockquote>
<p>We refer to a different input name for the same country as an &quot;alias&quot;. For example, your data might contain
the country value &quot;US&quot;, another source might contain &quot;USA&quot;, and yet another &quot;United States&quot;.
This can be confusing when read in a table or seen on a graph.</p>
<p>To normalize this, we maintain a mapping of <a href="https://github.com/mozilla/bigquery-etl/blob/main/sql_generators/country_code_lookup/aliases.yaml">aliases</a>
from each country to its two-character <a href="https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes">ISO3166 code</a>.
This includes misspellings and alternate language encodings that we encounter in various datasets.
For example:</p>
<pre><code class="language-yaml">CI:
- &quot;Ivory Coast&quot;
- &quot;Côte dIvoire&quot;
- &quot;Côte d'Ivoire&quot;
- &quot;Cote d'Ivoire&quot;
- &quot;Côte d’Ivoire&quot;
- &quot;The Republic of Côte d'Ivoire&quot;
</code></pre>
<p>To map (normalize) your input alias to its country code, add a LEFT JOIN from your table or view to the alias lookup
table <code>mozdata.static.country_names_v1</code> (aliased as <code>country_lookup</code> below). For example:</p>
<pre><code class="language-sql">SELECT
...
your_table.country_field,
COALESCE(country_lookup.code, your_table.country_field, '??') as country_code
...
FROM
your_table
LEFT JOIN mozdata.static.country_names_v1 country_lookup ON your_table.country_field = country_lookup.name
</code></pre>
<p>Note: we use <code>??</code> as a country-code for empty country data from data sources. This will map to &quot;Unknown Country&quot;,
&quot;Unknown Region&quot;, etc.</p>
<p>At this point, you should check for cases where the resulting <code>country_code</code> matches <code>your_table.country_field</code> but does
not match any values in the <code>country_lookup</code> table - you may have discovered a new alias, in which case please add it to the list!
You can do this via a bigquery-etl pull request, <a href="https://github.com/mozilla/bigquery-etl/pull/2858">for example</a>.</p>
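<p>A minimal sketch of that check (assuming the same hypothetical <code>your_table</code> and <code>country_field</code> as above): list the input values that do not match any known name or alias, so they can be reviewed as candidates for new aliases.</p>
<pre><code class="language-sql">SELECT
  your_table.country_field,
  COUNT(*) AS n_rows
FROM
  your_table
LEFT JOIN
  mozdata.static.country_names_v1 AS country_lookup
ON
  your_table.country_field = country_lookup.name
WHERE
  country_lookup.code IS NULL
  AND your_table.country_field IS NOT NULL
GROUP BY
  your_table.country_field
ORDER BY
  n_rows DESC
</code></pre>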
<blockquote>
<p>⚠️ This list of aliases is public. If you are working with sensitive data, please do not add to the public list of
aliases, you should handle it in custom logic in code that interfaces with your sensitive data for example in
<a href="https://github.com/mozilla/private-bigquery-etl">private-bigquery-etl</a> or the
<a href="https://github.com/mozilla/looker-spoke-private">private Looker spoke</a>.</p>
</blockquote>
<p>If you are satisfied that the <code>country_code</code> field is appropriately normalized, move on to Step Two!</p>
<h2 id="step-two---accessing-standard-country-data"><a class="header" href="#step-two---accessing-standard-country-data">Step Two - Accessing Standard Country Data</a></h2>
<p>Standard country data is contained in the <code>mozdata.static.country_codes_v1</code> table and by extension the
<code>shared/views/countries</code> Looker View.</p>
<p>Add the following join to your Explore (either in the <code>.explore.lkml</code> or <code>.model.lkml</code> file):</p>
<pre><code class="language-lookml">include: &quot;/shared/views/*&quot;
...
join: countries {
type: left_outer
relationship: one_to_one
sql_on: ${your_table.country_code} = ${countries.code} ;;
}
</code></pre>
<p>Now, you should be able to see the Countries View in your Explore 🎉</p>
<p><img src="cookbooks/looker/../../assets/Looker_screenshots/countries_explore.png" alt="Country View in Explore Image" /></p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/looker/countries.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="normalizing-browser-version-data"><a class="header" href="#normalizing-browser-version-data">Normalizing Browser Version Data</a></h1>
<p>This how-to guide is about getting numerical browser version data in your Looker Explores, Looks and Dashboards.</p>
<p>This guide has only one step: Normalizing Version Strings.</p>
<blockquote>
<p>⚠️ Some steps in this guide require knowledge of SQL - ask in #data-help for assistance if needed.</p>
</blockquote>
<p>Many of our data sources (particularly browser telemetry) have a version identifier: a string that (most of the time)
looks like <code>&quot;99.1.0&quot;</code>, in the format <code>&quot;major.minor.patch&quot;</code>.</p>
<blockquote>
<p>⚠️ In SQL you might be tempted to compare these version identifiers directly. This might, however, return misleading results!
<code>&quot;99&quot; &gt; &quot;100&quot;</code> but <code>99 &lt; 100</code>. Note the string vs. number comparison.</p>
</blockquote>
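<p>To see the pitfall concretely, the comparison can be checked directly in BigQuery:</p>
<pre><code class="language-sql">SELECT
  '99' &gt; '100' AS string_compare,  -- TRUE: lexicographic, '9' sorts after '1'
  99 &gt; 100 AS numeric_compare      -- FALSE
</code></pre>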
<h2 id="step-one---normalizing-version-strings"><a class="header" href="#step-one---normalizing-version-strings">Step One - Normalizing Version Strings</a></h2>
<p>In your <code>view.sql</code> file, locate the browser version identifier. In many tables/views, this is called <code>app_version</code>.</p>
<p>To extract the numerical version data you have two options:</p>
<h3 id="1-the-truncate-version-udf---truncate_version"><a class="header" href="#1-the-truncate-version-udf---truncate_version">1. The <a href="https://mozilla.github.io/bigquery-etl/mozfun/norm/#truncate_version-udf">truncate version UDF</a> - <code>truncate_version</code></a></h3>
<p>This extracts the major or minor version from the version identifier. See the Mozfun Docs for a detailed description.</p>
<p>Modify your <code>view.sql</code>:</p>
<pre><code class="language-SQL">CREATE OR REPLACE VIEW
`project.dataset.view`
AS
SELECT
*,
`mozfun.norm.truncate_version`(app_version, &quot;major&quot;) as major_browser_version -- &lt;--- New Line
FROM
`project.dataset_derived.table`
</code></pre>
<p><code>major_browser_version</code> will be added as a new field containing the numerical major browser version.</p>
<h3 id="2-the-browser-version-info-udf---browser_version_info"><a class="header" href="#2-the-browser-version-info-udf---browser_version_info">2. The <a href="https://mozilla.github.io/bigquery-etl/mozfun/norm/#browser_version_info-udf">browser version info UDF</a> - <code>browser_version_info</code></a></h3>
<p>This extracts a number of useful fields from the version identifier. See the Mozfun Docs for a detailed description.</p>
<p>Modify your <code>view.sql</code>: </p>
<pre><code class="language-SQL">CREATE OR REPLACE VIEW
`project.dataset.view`
AS
SELECT
*,
`mozfun.norm.browser_version_info`(app_version) as browser_version_info -- &lt;--- New Line
FROM
`project.dataset_derived.table`
</code></pre>
<p><code>browser_version_info</code> will be added as a new struct field containing numerical version fields and other useful metadata. </p>
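<p>Before changing a view, it can be worth sanity-checking the UDF output with an ad-hoc query like the sketch below (the expected values are shown as comments; verify them in the BigQuery console):</p>
<pre><code class="language-sql">SELECT
  `mozfun.norm.truncate_version`('99.1.0', 'major') AS major_version,  -- expected: 99
  `mozfun.norm.browser_version_info`('99.1.0') AS version_info         -- expected: a struct of version fields
</code></pre>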
<p>After choosing an option, open a Pull Request (<a href="https://github.com/mozilla/bigquery-etl/pull/2898">for example</a>) and get a review.
Once your change is merged, the updated field will be available in Looker once the lookml-generator runs
(usually by the next calendar day, or <a href="https://github.com/mozilla/lookml-generator/#deploying-new-lookml-generator-changes">by manually running it on Airflow</a>).</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/looker/browser_versions.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="growth-and-usage-dashboards"><a class="header" href="#growth-and-usage-dashboards">Growth and Usage dashboards</a></h1>
<p>The Mozilla Growth &amp; Usage dashboards (GUD) visualize growth metrics in a standard way across Mozilla's products.</p>
<p>On the Looker home screen, go to <code>Shared &gt; Browsers</code> and find the consolidated Desktop and Mobile dashboards inside the corresponding folders.</p>
<blockquote>
<p>⚠️ Find Looker in the SSO Dashboard.</p>
</blockquote>
<p><img src="cookbooks/looker/../../assets/Looker_screenshots/browsers_desktop.jpg" alt="" /></p>
<h2 id="growth-dashboard"><a class="header" href="#growth-dashboard">Growth Dashboard</a></h2>
<p>Find in this dashboard:</p>
<ul>
<li>The visualizations of daily, weekly and monthly active users in comparison with the previous year.</li>
<li>The visualization of new profiles in comparison with the previous period.</li>
</ul>
<p><img src="cookbooks/looker/../../assets/Looker_screenshots/browsers_growth.jpg" alt="Growth dashboard" /></p>
<h2 id="usage-dashboard"><a class="header" href="#usage-dashboard">Usage Dashboard</a></h2>
<p>Find in this dashboard:</p>
<ul>
<li>The retention curve for cohorts over a period of 180 days from the first seen date.</li>
<li>The visualization of search, organic search and search with ads.</li>
<li>The visualization of ad click behaviour.</li>
</ul>
<p><img src="cookbooks/looker/../../assets/Looker_screenshots/browsers_usage.jpg" alt="Usage dashboard" /></p>
<h2 id="browser-looks"><a class="header" href="#browser-looks">Browser Looks</a></h2>
<p>If you want to explore each individual visualization in Looker for the browsers, find them in the <code>Usage</code> folder inside the relevant <code>browser name</code> folder:</p>
<pre><code>Shared &gt; Browsers &gt; Desktop &gt; Usage
Shared &gt; Browsers &gt; Mobile &gt; Firefox Focus for Android &gt; Usage
Shared &gt; Browsers &gt; Mobile &gt; Firefox Focus for iOS &gt; Usage
Shared &gt; Browsers &gt; Mobile &gt; Firefox for Android &gt; Usage
Shared &gt; Browsers &gt; Mobile &gt; Firefox for iOS &gt; Usage
</code></pre>
<p>The example below shows the location of the Looks for Firefox Focus for Android:</p>
<p><img src="cookbooks/looker/../../assets/Looker_screenshots/browsers_usage_views.jpg" alt="" /></p>
<h2 id="source"><a class="header" href="#source">Source</a></h2>
<p>The dashboards and views for growth and usage are based on the <code>moz-fx-data-shared-prod.telemetry.active_users_aggregates</code> table, which contains the dimensions and metrics for desktop and mobile, as calculated from the <code>clients_last_seen</code> and <code>mobile_search_clients_daily_v1</code> tables.</p>
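<p>For ad-hoc analysis against the same source, a query along these lines can be used (a sketch; column names such as <code>app_name</code> and <code>dau</code> are assumptions and should be verified against the table schema):</p>
<pre><code class="language-sql">-- Illustrative sketch: daily active users by app over the last 28 days.
SELECT
  submission_date,
  app_name,
  SUM(dau) AS dau
FROM
  `moz-fx-data-shared-prod.telemetry.active_users_aggregates`
WHERE
  submission_date &gt;= DATE_SUB(CURRENT_DATE(), INTERVAL 28 DAY)
GROUP BY
  submission_date,
  app_name
ORDER BY
  submission_date
</code></pre>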
<blockquote>
<p>Note: The dashboards have been migrated to Looker from the previous GUD dashboard at <a href="https://mozilla.github.io/gud/">https://mozilla.github.io/gud/</a> as part of the Looker onboarding, with the purpose of enhancing them with new dimensions of analysis and improving performance.</p>
</blockquote>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/looker/growth_usage_dashboards.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="using-the-event-counts-explore"><a class="header" href="#using-the-event-counts-explore">Using the Event Counts Explore</a></h1>
<p>If you want to answer product related questions using events, you can use the Event Counts explore in Looker.
This can help you understand how users interact with specific product features <em>in isolation</em> (for example, the number of users that created a bookmark in Firefox for Android).
You can see a quick demo of how to use this explore to answer a simple Firefox for Android product question in this video:</p>
<center>
<iframe width="560" height="315" src="https://www.youtube.com/embed/J0Hi5poV4D4" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
</center>
<p>This explore also exists for Mozilla VPN and other products (and you can use the same Glean Dictionary-based workflow to fill it out).
For Firefox Desktop, this explore currently uses legacy (non-Glean) data and you will need to use the <a href="cookbooks/looker/../analysis/probe_dictionary.html">probe dictionary</a> instead.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/looker/event_counts_explore.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="using-the-funnel-analysis-explore"><a class="header" href="#using-the-funnel-analysis-explore">Using the Funnel Analysis Explore</a></h1>
<p>If you want to answer product related questions using events, you can use the Funnel Analysis explore in Looker.
This can help you understand how users interact with specific product features <em>in sequence</em> (for example, what percentage of users completed a specific set of interactions).
You can see a quick demo of how to use this explore to answer a simple Firefox for Android product question in this video:</p>
<center>
<iframe width="560" height="315" src="https://www.youtube.com/embed/Nltt4wYmoUM" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
</center>
<p>This explore also exists for Mozilla VPN and other products (and you can use the same Glean Dictionary-based workflow to fill it out).
For Firefox Desktop, this explore currently uses legacy (non-Glean) data and you will need to use the <a href="cookbooks/looker/../analysis/probe_dictionary.html">probe dictionary</a> to look up event metadata instead.</p>
<p>Under the hood, the funnel analysis explore uses the <a href="cookbooks/looker/../../datasets/bigquery/events_daily/reference.html"><code>events_daily</code></a> dataset.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/looker/funnel_analysis_explore.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="optimizing-looker-performance---caching"><a class="header" href="#optimizing-looker-performance---caching">Optimizing Looker Performance - Caching</a></h1>
<p>This how-to guide is about improving performance of Looker dashboards and explores using caching. This is particularly
useful if you have many users accessing a dashboard or explore, especially across timezones.</p>
<p>This guide has two alternatives: Applying Generated Datagroups and Custom Datagroups</p>
<blockquote>
<p>⚠️ Some steps in this guide require knowledge of LookML and a developer license - ask in #data-help for assistance if
needed.</p>
</blockquote>
<p>The default setting in Looker is to store (cache) the results of any query for one hour. Most of our data
is updated on a daily cadence. This means that, even though users would get the same result for a query, once the cache
expires they might have to wait for the query to retrieve results from our BigQuery warehouse again.</p>
<p>We solve this by using Looker's <a href="https://cloud.google.com/looker/docs/caching-and-datagroups">datagroups</a>. In short,
datagroups are a method for scheduling various actions in Looker like resetting the cache, e-mailing dashboards, or
rebuilding derived tables.</p>
<h2 id="alternative-one---applying-generated-datagroups"><a class="header" href="#alternative-one---applying-generated-datagroups">Alternative One - Applying Generated Datagroups</a></h2>
<p>For a number of tables in our warehouse, we automatically generate datagroups in looker-hub.</p>
<p>First, locate your <code>explore.lkml</code> file and take note of the source table that powers this explore; note that there might be
multiple tables. If your explore has <em>multiple</em> tables that are updated by
<a href="https://workflow.telemetry.mozilla.org/home">ETL</a>, you should move on to Alternative Two. Explores joining
infrequently updated, shared views such as <code>countries.view.lkml</code> can still use this method.</p>
<p>After finding the source table, in your <code>explore.lkml</code> file, include the auto-generated datagroup and add a
<code>persist_with</code> parameter that matches the datagroup. By convention, the datagroup name matches the source table name.
See the following example:</p>
<pre><code>...
include: &quot;//looker-hub/search/datagroups/mobile_search_clients_daily_v1_last_updated.datagroup.lkml&quot;
explore: mobile_search_counts {
...
persist_with: mobile_search_clients_daily_v1_last_updated
}
</code></pre>
<p>Now, query results for the <code>Mobile Search Counts</code> explore will be cached until the <code>mobile_search_clients_daily_v1</code>
table is updated and users running the same query will receive fast results (a few seconds usually).</p>
<p>If you aren't able to find an auto-generated datagroup or your explore has multiple, complex joins, move on to
Alternative Two.</p>
<h2 id="alternative-two---custom-datagroups"><a class="header" href="#alternative-two---custom-datagroups">Alternative Two - Custom Datagroups</a></h2>
<p>This simple alternative makes use of a static timer that is longer than the default one hour used by Looker.</p>
<p>First, locate your <code>explore.lkml</code> file. Define a datagroup with an <code>interval_trigger</code> and <code>max_cache_age</code> &gt; 1 hour. For
example:</p>
<pre><code>explore: my_explore {
datagroup: my_explore_datagroup {
interval_trigger: &quot;6 hours&quot;
max_cache_age: &quot;6 hours&quot;
}
persist_with: my_explore_datagroup
}
</code></pre>
<p>This will cache results for 6 hours at a time.</p>
<p>As always, if any steps in this guide are unclear, or you are unable to locate the source tables, feel free to ask for
assistance in #data-help.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/looker/performance_caching.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="data-analysis-tools-1"><a class="header" href="#data-analysis-tools-1">Data Analysis Tools</a></h1>
<p>This section covers data analysis tools that you can use for analyzing and visualizing data.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/analysis/tools.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="introduction-to-glam"><a class="header" href="#introduction-to-glam">Introduction to GLAM</a></h1>
<p>GLAM was built to help Mozillians answer their data questions without needing data analysis or coding skills. It contains a visualization layer meant to answer most &quot;easy&quot; questions about how a probe or metric has changed over build ids and releases.</p>
<p>GLAM is one of several high-level data tools that we provide at Mozilla. For more information, see <a href="cookbooks/../introduction/tools.html">Tools for Data Analysis</a>.</p>
<p>Access to GLAM is currently limited to Mozilla employees and designated contributors (this <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1712353">may change in the future</a>). For more information, see <a href="cookbooks/../concepts/gaining_access.html">gaining access</a>.</p>
<h2 id="how-to-use-glam"><a class="header" href="#how-to-use-glam">How to use GLAM</a></h2>
<p>You can visit GLAM at <a href="https://glam.telemetry.mozilla.org"><code>glam.telemetry.mozilla.org</code></a>.</p>
<h3 id="front-page-1"><a class="header" href="#front-page-1">Front page</a></h3>
<p><img src="cookbooks/../assets/GLAM_screenshots/front-page.png" alt="" /></p>
<p>The front page includes two main sections: the search bar and the random probe explorer. Fuzzy text search is implemented to let users search not only by the probe title, but also by the full description.</p>
<p>GLAM is currently serving data for Firefox Desktop and Firefox for Android.</p>
<h3 id="individual-probemetric-page"><a class="header" href="#individual-probemetric-page">Individual probe/metric page</a></h3>
<p>Clicking on a probe or metric name takes you to the individual explorer, where most of the analysis happens. As this page is packed with data, we make sure that it's self-documented as much as possible: every button, surface and menu item has a tooltip with a description and links.</p>
<p><img src="cookbooks/../assets/GLAM_screenshots/probe-page.png" alt="" /></p>
<p><strong><code>(1)</code></strong> The left column shows metadata about the probe or metric: what kind it is, which channels it's active in, a description, and the bugs associated with its implementation. As our goal is to make GLAM a self-educating tool, we try to provide as much information as available, and link out to other resources where applicable (<a href="https://dictionary.telemetry.mozilla.org/">Glean Dictionary</a>, Looker, etc.)</p>
<p><strong><code>(2)</code></strong> For convenience, we provide two utility features:</p>
<ul>
<li><code>View SQL Query</code>: if you want to dig more deeply into the data than the GLAM UI allows, “View SQL Query” provides the SQL that can be copied and then pasted into our <a href="cookbooks/../tools/stmo.html">STMO Redash instance</a>.</li>
<li><code>Export to JSON</code>: exports JSON data to be used in a Jupyter notebook or similar services.</li>
</ul>
<p><strong><code>(3)</code></strong> A set of dimensions (qualitative attributes of data) to subset on</p>
<p><strong><code>(4)</code></strong> <code>Time Horizon</code> lets users choose how much data they want to investigate: week, month, quarter, or all (note that we only keep data from the last three versions.)</p>
<p><strong><code>(5)</code></strong> Probe or metric distribution and percentiles over time:</p>
<ul>
<li><code>Percentiles</code> shows the percentiles of the probe over time. To perform analysis, set a reference point by clicking on a target date, then hover along the graph to see the recorded differences. See attached <code>tooltips</code> on the page for more instruction.</li>
<li>The <code>compare</code> violin plot shows the comparison between two (vertical) normal distributions</li>
<li><code>Summary</code> table provides the exact numeric values of the percentiles of the median changes between Build IDs.</li>
</ul>
<p><strong><code>(6)</code></strong> shows the volume of clients with each given Build ID</p>
<h2 id="differences-between-glam-and-telemetrymozillaorg-dashboard"><a class="header" href="#differences-between-glam-and-telemetrymozillaorg-dashboard">Differences between GLAM and <code>telemetry.mozilla.org</code> dashboard</a></h2>
<p>GLAM is aggregated per client, <code>telemetry.mozilla.org</code> (TMO) is aggregated per ping. This will cause different movements in the visualization between the two systems. Notably:</p>
<ul>
<li>Because GLAM normalizes the aggregations by client ID, a single client is weighted equally to all other clients, regardless of how many samples that client sends.</li>
<li>Conversely, TMO does not normalize by client ID, so if a single client sends a lot of pings, that client will impact the distribution more heavily. This can result in some changes appearing bigger on TMO.</li>
</ul>
<p>As of July 2022, TMO serves only Firefox Desktop (telemetry) data, while GLAM supports Firefox Desktop (both telemetry and Glean) and Firefox for Android (Fenix), with ongoing efforts to integrate Firefox for iOS and more products that use Glean as their telemetry system.</p>
<h2 id="going-deeper-1"><a class="header" href="#going-deeper-1">Going deeper</a></h2>
<p>For more information about the datasets that power GLAM, see <a href="cookbooks/../datasets/glam.html">GLAM Datasets</a>.</p>
<p>If you have a question that can't be easily answered by the GLAM GUI, you can access the raw GLAM datasets using <a href="cookbooks/../tools/stmo.html"><code>sql.telemetry.mozilla.org</code></a>.</p>
<h2 id="getting-help-1"><a class="header" href="#getting-help-1">Getting help</a></h2>
<p>If you have further questions, please reach out on the #GLAM slack channel.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/glam.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="operational-monitoring-opmon"><a class="header" href="#operational-monitoring-opmon">Operational Monitoring (OpMon)</a></h1>
<p><a href="https://github.com/mozilla/opmon">Operational Monitoring (OpMon)</a> is a self-service tool that aggregates and summarizes operational metrics that indicate the health of software.
OpMon can be used to continuously monitor rollouts, experiments (including experiments with continuous enrollments) or the population of a specific product (for example, Firefox Desktop).
OpMon automatically generates Looker dashboards that will provide insights into whether landing code changes impact certain metrics in a meaningful way.</p>
<p>A couple of specific use cases are supported:</p>
<ol>
<li>Monitoring build over build. This is typically used for Nightly where one build may contain changes that a previous build doesn't and we want to see if those changes affected certain metrics.</li>
<li>Monitoring by submission date over time. This is helpful for a rollout in Release for example, where we want to make sure there are no performance or stability regressions over time as a new build rolls out.</li>
</ol>
<p>The monitoring dashboards produced for these use cases are available in <a href="https://mozilla.cloud.looker.com/folders/lookml">Looker</a>.
OpMon does not emit real-time results. Dashboards and related datasets get updated on a daily basis.</p>
<p>Access to the Looker Operational Monitoring dashboards is currently limited to Mozilla employees and designated contributors. For more information, see <a href="cookbooks/../concepts/gaining_access.html">gaining access</a>.</p>
<h2 id="configuring-a-operational-monitoring-project"><a class="header" href="#configuring-a-operational-monitoring-project">Configuring a Operational Monitoring project</a></h2>
<p>To add or update a project configuration, open a pull request against the <code>opmon/</code> directory in <a href="https://github.com/mozilla/metric-hub/tree/main/opmon">metric-hub</a>.
Consider using and adding metric definitions to the <a href="https://github.com/mozilla/metric-hub">metric-hub</a> that should be available for use across other tools, such as <a href="https://experimenter.info/deep-dives/jetstream/overview">Jetstream</a>, as well.</p>
<p>CI checks will validate the columns, data sources, and SQL syntax. Once CI completes, the pull request gets automatically approved and can be merged. Results for the new project will be available within the next 24 hours.</p>
<p>Project configuration files are written in <a href="https://toml.io/en/">TOML</a>. To reuse configurations across multiple projects, project configurations can reference configurations from definition files.
These definition files are platform-specific and located in the <a href="https://github.com/mozilla/metric-hub/tree/main/opmon/definitions"><code>opmon/definitions/</code> directory in metric-hub</a> or in the <a href="https://github.com/mozilla/metric-hub">metric-hub</a> repository. Platform-specific configuration files follow the same format and structure as project configuration files.</p>
<p>If the project is used to monitor a rollout or experiment, then the configuration files should have the same name as the slug that has been assigned in <a href="https://experimenter.services.mozilla.com/">Experimenter</a>.
Generally, configuration files have four main sections: <code>[project]</code>, <code>[data_sources]</code>, <code>[metrics]</code>, and <code>[dimensions]</code>. All of these sections are optional.</p>
<p>Examples of every value you can specify in each section are given below. <strong>You do not need to, and should not, specify everything!</strong>
OpMon will take values from Experimenter (for rollouts and experiments) and combine them with a reasonable set of defaults.</p>
<p>Lines starting with a <code>#</code> are comments and have no effect.</p>
<h3 id="project-section"><a class="header" href="#project-section"><code>[project]</code> Section</a></h3>
<p>This part of the configuration file is optional and allows you to:</p>
<ul>
<li>specify the metrics that should be analyzed</li>
<li>define the clients that should be monitored</li>
<li>indicate if/how the client population should be segmented, and</li>
<li>override some values from Experimenter</li>
</ul>
<p>This section is usually not specified in definition configuration.</p>
<pre><code class="language-toml">[project]
# A custom, descriptive name of the project.
# This will be used as the generated Looker dashboard title.
name = &quot;A new operational monitoring project&quot;
# The name of the platform this project targets.
# For example, &quot;firefox_desktop&quot;, &quot;fenix&quot;, &quot;firefox_ios&quot;, ...
platform = &quot;firefox_desktop&quot;
# Specifies the type of monitoring desired as described above.
# Either &quot;submission_date&quot; (to monitor each day) or &quot;build_id&quot; (to monitor build over build)
xaxis = &quot;submission_date&quot;
# Both start_date and end_date can be overridden, otherwise the dates configured in
# Experimenter will be used as defaults.
start_date = &quot;2022-01-01&quot;
# Whether to skip the analysis for this project entirely.
# Useful for skipping rollouts for which OpMon projects are generated automatically otherwise.
skip = false
# Whether the project is related to a rollout.
is_rollout = false
# Ignore the default metrics that would be computed.
skip_default_metrics = false
# Whether to have all the results in a single tile on the Looker dashboard (compact)
# or to have separate tiles for each metric.
compact_visualization = false
# Metrics to compute.
# Defined as a list of strings. These strings are the &quot;slug&quot; of the metric, which is the
# name of the metric definition section in either the project configuration or the platform-specific
# configuration file.
# See [metrics] section on how these metrics get defined.
metrics = [
'shutdown_hangs',
'main_crashes',
'startup_crashes',
'memory_unique_content_startup',
'perf_page_load_time_ms'
]
alerts = [
&quot;ci_diffs&quot;
]
# This section specifies the clients that should be monitored.
[project.population]
# Slug/name of the data source definition section in either the project configuration or the platform-specific
# configuration file. This data source refers to a database table.
# See [data_sources] section on how this gets defined.
data_source = &quot;main&quot;
# The name of the branches that have been configured for a rollout or experiment.
# If defined, this configuration overrides boolean_pref.
branches = [&quot;enabled&quot;, &quot;disabled&quot;]
# A SQL snippet that results in a boolean representing whether a client is included in the rollout or experiment or not.
boolean_pref = &quot;environment.settings.fission_enabled&quot;
# The channel the clients should be monitored from: &quot;release&quot;, &quot;beta&quot;, or &quot;nightly&quot;.
channel = &quot;beta&quot;
# If set to &quot;true&quot;, the rollout and experiment configurations will be ignored and instead
# the entire client population (regardless of whether they are part of the experiment or rollout)
# will be monitored.
# This option is useful if the project is not associated to a rollout or experiment and the general
# client population of a product should be monitored.
monitor_entire_population = false
# References to dimension slugs that are used to segment the client population.
# Defined as a list of strings. These strings are the &quot;slug&quot; of the dimension, which is the
# name of the dimension definition section in either the project configuration or the platform-specific
# configuration file. See [dimensions] section on how these get defined.
dimensions = [&quot;os&quot;]
# A set of metrics that should be part of the same visualization
[project.metric_groups.crashes]
friendly_name = &quot;Crashes&quot;
description = &quot;Breakdown of crashes&quot;
metrics = [
&quot;main_crashes&quot;,
&quot;startup_crashes&quot;,
]
</code></pre>
<h4 id="metric-groups"><a class="header" href="#metric-groups">Metric Groups</a></h4>
<p>Metric groups allow you to specify a set of existing metric definitions that should be displayed in the same visualization (a graph, for example) as separate lines. Often these metrics are related to each other in some way, and having them in a single graph simplifies visual comparison.</p>
<p>Metric groups are different from <code>dimensions</code>. Dimensions segment the population based on specific criteria (for example, a table field); depending on the selected segment, only data for that slice of the population is shown for all specified metrics.
Metric groups do not influence how metrics are computed; they only affect how metrics are visualized.</p>
<h3 id="data_sources-section"><a class="header" href="#data_sources-section"><code>[data_sources]</code> Section</a></h3>
<p>Data sources specify the tables data should be queried from.</p>
<p>In most cases, it is not necessary to define project-specific data sources; instead, data sources can be specified in, and referenced from, the
platform-specific definition configurations.</p>
<pre><code class="language-toml">[data_sources]
[data_sources.main]
# FROM expression - often just a fully-qualified table name. Sometimes a subquery.
from_expression = &quot;mozdata.telemetry.main&quot;
# SQL snippet specifying the submission_date column
submission_date_column = &quot;DATE(submission_timestamp)&quot;
[data_sources.events_memory]
# FROM expression - subquery
from_expression = &quot;&quot;&quot;
(
SELECT
*
FROM `moz-fx-data-shared-prod.telemetry.events`
WHERE
event_category = 'memory_watcher'
)
&quot;&quot;&quot;
submission_date_column = &quot;DATE(submission_date)&quot;
</code></pre>
<h3 id="metrics-section"><a class="header" href="#metrics-section"><code>[metrics]</code> Section</a></h3>
<p>The metrics section allows you to specify the metrics that should be monitored.</p>
<p>In most cases, it is not necessary to define project-specific metrics; instead, metrics can be specified in, and referenced from, the
platform-specific definition configurations.</p>
<p>A new metric can be defined by adding a new section with a name like:</p>
<p><code>[metrics.&lt;new_metric_slug&gt;]</code></p>
<pre><code class="language-toml">[metrics]
[metrics.memory_pressure_count]
# The data source to use. Use the slug of a data source defined in a platform-specific config,
# or else define a new data source (see above).
data_source = &quot;events_memory&quot;
# A clause of a SELECT expression with an aggregation
select_expression = &quot;SUM(SAFE_CAST(SPLIT(event_string_value, ',')[OFFSET(1)] AS NUMERIC))&quot;
# Type of the metric to be evaluated.
# This is used to determine the method of aggregation to be applied.
# Either &quot;scalar&quot; or &quot;histogram&quot;.
type = &quot;scalar&quot;
# A friendly metric name displayed in dashboards.
friendly_name = &quot;Memory Pressure Count&quot;
# A description that will be displayed by dashboards.
description = &quot;Number of memory pressure events&quot;
# This can be any string value. It's currently not being used but in the future, this could be used to visually group different metrics by category.
category = &quot;performance&quot;
</code></pre>
<p>Statistics reduce observations of many clients to one or many rows describing the population.</p>
<p>Any summarization of the client-level data can be implemented as a statistic.</p>
<p>There is a fixed set of statistics available:</p>
<ul>
<li><code>sum</code></li>
<li><code>percentile</code> (default)</li>
<li><code>mean</code></li>
<li><code>count</code></li>
</ul>
<pre><code class="language-toml"># Specify which statistic to use for a metric
[metrics.memory_pressure_count.statistics]
sum = {}
mean = {}
</code></pre>
<p>New statistics need to be implemented in OpMon. Some statistics allow additional parameters to be specified.</p>
<h3 id="dimensions-section"><a class="header" href="#dimensions-section"><code>[dimensions]</code> Section</a></h3>
<p>Dimensions define how the client population should be segmented.</p>
<p>For example:</p>
<pre><code class="language-toml">[dimensions]
[dimensions.os]
# The data source to use. Use the slug of a data source defined in a platform-specific config,
# or else define a new data source (see above).
data_source = &quot;main&quot;
# SQL snippet referencing a field whose values should be used to segment the client population.
select_expression = &quot;normalized_os&quot;
</code></pre>
<p>The <code>os</code> dimension will result in the client population being segmented by operating system. For each dimension, a filter is added to the resulting
dashboard, which allows, for example, showing results only for Windows clients.</p>
<h3 id="alerts-section"><a class="header" href="#alerts-section"><code>[alerts]</code> Section</a></h3>
<p>Different types of alerts can be defined for metrics:</p>
<pre><code class="language-toml">[alerts]
[alerts.ci_diffs]
# Alert for large differences between branches:
# an alert is triggered if confidence intervals of different branches
# do not overlap
type = &quot;ci_overlap&quot;
metrics = [ # metrics to monitor
&quot;gc_ms&quot;,
&quot;startup_crashes&quot;,
]
percentiles = [50, 90] # percentiles to monitor
[alerts.crash_thresholds]
# Threshold-based alert:
# an alert is triggered if values exceed or fall below the defined thresholds
type = &quot;threshold&quot;
metrics = [ # metrics to monitor
&quot;oom_crashes&quot;,
&quot;gpu_crashes&quot;
]
min = [0] # lower thresholds
max = [10] # upper thresholds
[alerts.historical_diff]
# Deviation from historical data:
# an alert is triggered if the average of the specified window deviates
# from the average of the previous window
type = &quot;avg_diff&quot;
metrics = [ # metrics to monitor
&quot;memory_total&quot;,
]
window_size = 7 # window size in days
max_relative_change = 0.5 # relative change that when exceeded triggers an alert
percentiles = [50, 90] # percentiles to monitor
</code></pre>
<p>Currently, there are 3 different types of alerts:</p>
<ul>
<li><strong>Large differences between branches:</strong> Whenever the confidence intervals of different branches for a specific metric no longer overlap, it indicates that there is potentially some significant difference.</li>
<li><strong>Thresholds:</strong> Comparing the values of a metric to a user-defined threshold.</li>
<li><strong>Deviation from historical data:</strong> Detect anomalous behaviour of a metric based on previously collected data.</li>
</ul>
<h4 id="large-differences-between-branches"><a class="header" href="#large-differences-between-branches">Large differences between branches</a></h4>
<p>The OpMon dashboards show the values for specific metrics as a line chart with confidence intervals. Each line represents the metric values for a different branch. Whenever the confidence intervals of the branches do not overlap, it is considered a critical change. See:</p>
<p><img src="cookbooks/../assets/opmon_alerting_branch_differences.png" alt="" /></p>
<h4 id="thresholds"><a class="header" href="#thresholds">Thresholds</a></h4>
<p>In some cases the expected value of a metric is known and any large deviation from that expected value is considered a critical change. Fixed thresholds can be used to specify when a value is too large or too low. See:</p>
<p><img src="cookbooks/../assets/opmon_alerting_thresholds.png" alt="" /></p>
<h4 id="deviation-from-historical-data"><a class="header" href="#deviation-from-historical-data">Deviation from historical data</a></h4>
<p>An alert should be triggered for certain metrics if their value deviates significantly from historical records. Sudden changes could, for example, happen after a new version gets released. See:</p>
<p><img src="cookbooks/../assets/opmon_alerting_historical_diff.png" alt="" /></p>
<p>It is not always possible to define a specific threshold, so instead previously recorded data should be used to detect significant deviations.</p>
<p>This check is the most complicated and computation-intensive one, with potentially the highest number of false positives. There are many different anomaly detection algorithms out there, but OpMon uses an approach that compares the average value of a metric over the past <code>n</code> days to the average value over the <code>n</code> days before that. If the relative difference between these two values exceeds a defined threshold, an alert is triggered.</p>
<p>The main downside of this approach is that whenever a spike happens, alerts will continue to be sent even after the spike has gone down, since it inflates the average values for a while.</p>
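<p>To make the mechanics concrete, the following is a minimal SQL sketch of this comparison (not OpMon's actual implementation). It assumes a hypothetical table of per-day metric values and reuses the <code>window_size = 7</code> and <code>max_relative_change = 0.5</code> values from the example configuration above:</p>
<pre><code class="language-sql">-- Sketch only: compare the average of the last 7 days against the 7 days before.
-- `my_project.my_dataset.memory_total_daily` is a hypothetical table with one value per day.
WITH daily AS (
  SELECT submission_date, value
  FROM `my_project.my_dataset.memory_total_daily`
  WHERE submission_date &gt;= DATE_SUB(CURRENT_DATE(), INTERVAL 14 DAY)
),
windows AS (
  SELECT
    AVG(IF(submission_date &gt;= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY), value, NULL)) AS current_avg,
    AVG(IF(submission_date &lt; DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY), value, NULL)) AS previous_avg
  FROM daily
)
SELECT
  current_avg,
  previous_avg,
  ABS(current_avg - previous_avg) / previous_avg AS relative_change,
  ABS(current_avg - previous_avg) / previous_avg &gt; 0.5 AS alert_triggered
FROM windows
</code></pre>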
<h2 id="previews"><a class="header" href="#previews">Previews</a></h2>
<p>When iterating on configurations, it is sometimes useful to get a preview of what computed data on the final dashboard would look like. A preview can be generated by installing the <a href="https://github.com/mozilla/opmon#local-installation">OpMon CLI tooling locally</a>.</p>
<p>Once installed, <code>opmon preview</code> can be run for a specific configuration or rollout:</p>
<pre><code>&gt; opmon preview --help
Usage: opmon preview [OPTIONS]
Create a preview for a specific project based on a subset of data.
Options:
--project_id, --project-id TEXT
Project to write to
--dataset_id, --dataset-id TEXT
Temporary dataset to write to
--derived_dataset_id, --derived-dataset-id TEXT
Temporary derived dataset to write to
--start_date, --start-date YYYY-MM-DD
Date for which project should be started to
get analyzed. Default: current date - 3 days
--end_date, --end-date YYYY-MM-DD
Date for which project should be stop to get
analyzed. Default: current date
--slug TEXT Experimenter or Normandy slug associated
with the project to create a preview for
[required]
--config_file, --config-file PATH
Custom local config file
--config_repos, --config-repos TEXT
URLs to public repos with configs
--private_config_repos, --private-config-repos TEXT
URLs to private repos with configs
--help Show this message and exit.
&gt; gcloud auth login --update-adc
&gt; gcloud config set project mozdata
&gt; opmon preview --slug=firefox-install-demo --config_file='/local/path/to/opmon/firefox-install-demo.toml'
Start running backfill for firefox-install-demo: 2022-12-17 to 2022-12-19
Backfill 2022-12-17
...
A preview is available at: https://mozilla.cloud.looker.com/dashboards/operational_monitoring::opmon_preview?Table='mozdata.tmp.firefox_install_demo_statistics'&amp;Submission+Date=2022-12-17+to+2022-12-20
</code></pre>
<p>Once preview data has been computed, a link to a Looker dashboard will be printed where data for each metric and statistic is visualized.</p>
<p><img src="cookbooks/../assets/opmon_preview.png" alt="" /></p>
<p>The preview data gets written into the <code>tmp</code> dataset in the <code>mozdata</code> project by default. Data written to this dataset gets automatically removed after 7 days.</p>
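<p>If you want to inspect the preview data directly instead of (or in addition to) the generated Looker dashboard, the temporary table printed in the output above can be queried in BigQuery. For example (the table name depends on the slug of your project):</p>
<pre><code class="language-sql">SELECT *
FROM `mozdata.tmp.firefox_install_demo_statistics`
WHERE submission_date = &quot;2022-12-17&quot;
LIMIT 100
</code></pre>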
<h2 id="reading-results"><a class="header" href="#reading-results">Reading Results</a></h2>
<p>Generated dashboards are available in <a href="https://mozilla.cloud.looker.com/folders/494">Looker</a>.</p>
<p><img src="cookbooks/../assets/opmon_results.png" alt="" /></p>
<p>Dashboards have filters for selecting a percentile and for filtering based on the dimensions that have been configured.
For each metric, results are visualized on separate tiles, with results grouped by branch. Depending on the project configuration, the x-axis will either be the submission date or the Build ID.</p>
<p>Results are visualized as line charts with confidence intervals. Since Looker only supports simple line chart visualizations, the upper and lower bounds of the confidence intervals are shown in lighter colors while the median value for a specific branch is shown in a darker color.</p>
<p>Results are divided into different percentiles that can be changed through the dashboard filter:</p>
<ul>
<li>The 50th percentile represents the median value of a metric across all clients</li>
<li>The 90th/95th/99th/... percentiles refer to the top 10%/5%/1%/... of clients based on the measured metric. For example, in the screenshot above the 80th percentile is selected, so the top 20% of clients enrolled in the <em>enabled</em> branch, based on the <em>Perf Page Load Time Ms</em> metric, have a load time of around 6,600 ms or higher.</li>
</ul>
<p>Places where the confidence intervals of different branches have a gap between them - that is, where the upper bound of one branch is below the lower bound of another - usually indicate a high likelihood that there is an actual difference between the measurements for the groups.</p>
<p>Each dashboard tile also allows exploring the data for a specific metric in more detail by clicking on <em>Explore from here</em>.</p>
<h2 id="subscribing-to-alerts"><a class="header" href="#subscribing-to-alerts">Subscribing to Alerts</a></h2>
<p>If alerts have been configured for an OpMon project, the generated dashboard will show any triggered alerts in a table at the bottom of the dashboard:</p>
<p><img src="cookbooks/../assets/opmon_alerts.png" alt="" /></p>
<p>To receive email or Slack notifications whenever new alerts are triggered, click on the <em>Alerts</em> icon in the right corner of the <em>Alerts</em> table.
Configure the alert by setting the condition, the email addresses or Slack channels alerts should be sent to, and how frequently the checks should be performed:</p>
<p><img src="cookbooks/../assets/opmon_alert_config.png" alt="" /></p>
<h2 id="data-products"><a class="header" href="#data-products">Data Products</a></h2>
<p>OpMon writes monitoring results and metadata to BigQuery. OpMon runs as part of the nightly ETL job (see <a href="cookbooks/operational_monitoring.html#scheduling">Scheduling</a> below).</p>
<h3 id="result-tables"><a class="header" href="#result-tables">Result Tables</a></h3>
<p>The result tables that back the Looker dashboards are available in the <code>operational_monitoring_derived</code> dataset in <code>moz-fx-data-shared-prod</code>.
Result tables are named like:</p>
<p><code>&lt;slug&gt;_v&lt;version&gt;</code></p>
<p><code>&lt;slug&gt;</code> refers to the slug that has been set for the project; separate tables are created for metrics, statistics, and alerts. The schema for metric tables is flexible and depends on the metrics configured to be computed.</p>
<p>Views for each table are also created in the <code>operational_monitoring</code> dataset. These views are used by the Looker dashboards.</p>
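<p>For example, the statistics results for the example project above could be queried through the corresponding view. This is a sketch only: the exact view name depends on the project slug and on whether you want the metric, statistic, or alert results (the <code>_statistics</code> suffix here mirrors the table name shown in the preview output above):</p>
<pre><code class="language-sql">SELECT *
FROM `moz-fx-data-shared-prod.operational_monitoring.firefox_install_demo_statistics`
WHERE submission_date = &quot;2022-12-17&quot;
LIMIT 100
</code></pre>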
<h4 id="metric-tables"><a class="header" href="#metric-tables">Metric tables</a></h4>
<div class="table-wrapper"><table><thead><tr><th>Column name</th><th>Type</th><th>Description</th></tr></thead><tbody>
<tr><td><code>submission_date</code></td><td><code>DATE</code></td><td>Date the monitoring results are for</td></tr>
<tr><td><code>client_id</code></td><td><code>STRING</code></td><td>Client's telemetry <code>client_id</code></td></tr>
<tr><td><code>branch</code></td><td><code>STRING</code></td><td>Branch client is enrolled in</td></tr>
<tr><td><code>build_id</code></td><td><code>STRING</code></td><td>Build the client is on</td></tr>
</tbody></table>
</div>
<p>The result table will have additional columns for each metric and dimension that has been defined.</p>
<h4 id="statistic-tables"><a class="header" href="#statistic-tables">Statistic Tables</a></h4>
<div class="table-wrapper"><table><thead><tr><th>Column name</th><th>Type</th><th>Description</th></tr></thead><tbody>
<tr><td><code>submission_date</code></td><td><code>DATE</code></td><td>Date the monitoring results are for</td></tr>
<tr><td><code>client_id</code></td><td><code>STRING</code></td><td>Client's telemetry <code>client_id</code></td></tr>
<tr><td><code>branch</code></td><td><code>STRING</code></td><td>Branch client is enrolled in</td></tr>
<tr><td><code>build_id</code></td><td><code>STRING</code></td><td>Build the client is on</td></tr>
</tbody></table>
</div>
<p>The result table will have additional columns for each metric and dimension that has been defined.</p>
<h3 id="metadata"><a class="header" href="#metadata">Metadata</a></h3>
<p>The table <code>projects_v1</code> in <code>operational_monitoring_derived</code> contains metadata about the configured projects that is required for generating the dashboards. The table is updated after each ETL run for a specific project.</p>
<div class="table-wrapper"><table><thead><tr><th>Column name</th><th>Type</th><th>Description</th></tr></thead><tbody>
<tr><td><code>slug</code></td><td><code>STRING</code></td><td>Project slug</td></tr>
<tr><td><code>name</code></td><td><code>STRING</code></td><td>Descriptive name of the project used as dashboard title</td></tr>
<tr><td><code>xaxis</code></td><td><code>STRING</code></td><td>Specifies which column should be used as x-axis (either &quot;submission_date&quot; or &quot;build_id&quot;)</td></tr>
<tr><td><code>branches</code></td><td><code>ARRAY</code></td><td>List of branch names</td></tr>
<tr><td><code>dimensions</code></td><td><code>ARRAY</code></td><td>List of dimension slugs</td></tr>
<tr><td><code>start_date</code></td><td><code>DATE</code></td><td>Date for when monitoring should start for the project</td></tr>
<tr><td><code>end_date</code></td><td><code>DATE</code></td><td>Date for when monitoring should end for the project</td></tr>
<tr><td><code>metrics</code></td><td><code>RECORD</code></td><td>Repeated record with the metric slug and aggregation type</td></tr>
</tbody></table>
</div>
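<p>For example, to see which OpMon projects are currently configured and the date ranges they cover, the metadata table can be queried directly:</p>
<pre><code class="language-sql">SELECT slug, name, xaxis, start_date, end_date
FROM `moz-fx-data-shared-prod.operational_monitoring_derived.projects_v1`
ORDER BY slug
</code></pre>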
<h3 id="scheduling"><a class="header" href="#scheduling">Scheduling</a></h3>
<p>OpMon is updated nightly by telemetry-airflow. It is invoked by the <a href="https://github.com/mozilla/telemetry-airflow/blob/main/dags/operational_monitoring.py">operational_monitoring DAG</a>.</p>
<h2 id="experiments-vs-opmon"><a class="header" href="#experiments-vs-opmon">Experiments vs OpMon</a></h2>
<p>The requirements for Operational Monitoring are related to, but mostly distinct from, those for experiments:</p>
<ul>
<li>With an A/B experiment, the goal is to determine with confidence whether a single change (i.e. the treatment) has an expected impact on a metric or small number of metrics. The change is only applied to a sample of clients for a fixed period of time.</li>
<li>With operational monitoring, a project team is making many changes over a long but indeterminate period of time and must identify if any one change or set of changes (e.g., changes in a given Nightly build) moves a metric in the target population. An identified metric impact may result in a change being backed out, but may also be used to guide future project work.</li>
</ul>
<p>OpMon can be used to monitor experiments. For experiments with continuous enrollment or no fixed end date, OpMon provides insights that would otherwise not be available. Other experiments that are interested in operational metrics could also benefit from OpMon.</p>
<h2 id="going-deeper-2"><a class="header" href="#going-deeper-2">Going Deeper</a></h2>
<p>To get a deeper understanding of what happens under the hood, please see the <a href="https://github.com/mozilla/opmon">opmon repository and developer documentation</a>.</p>
<h2 id="getting-help-2"><a class="header" href="#getting-help-2">Getting Help</a></h2>
<p>If you have further questions, please reach out on the <a href="https://mozilla.slack.com/archives/C4D5ZA91B">#data-help</a> Slack channel.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/operational_monitoring.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h2 id="introduction-to-stmo"><a class="header" href="#introduction-to-stmo">Introduction to STMO</a></h2>
<blockquote>
<p><strong>⚠</strong> As of August 2021, <a href="tools/../introduction/tools.html#looker">Looker</a> is considered the preferred solution for data exploration and visualization at Mozilla where possible. If you have any questions about where or when to use STMO, see <a href="tools/../concepts/getting_help.html">getting help</a>.</p>
</blockquote>
<p><a href="https://sql.telemetry.mozilla.org"><code>sql.telemetry.mozilla.org</code></a> (STMO) is Mozilla's installation of the <a href="https://redash.io/">Redash</a> data analysis and dashboarding tool. As the name and URL imply, the effective use of this tool requires familiarity with <a href="https://en.wikipedia.org/wiki/SQL">SQL</a>, with which all of the tool's data extraction and analysis are performed.</p>
<p>Access to STMO is limited to Mozilla employees and designated contributors. For more information, see <a href="tools/../concepts/gaining_access.html">gaining access</a>.</p>
<h2 id="stmo-concepts"><a class="header" href="#stmo-concepts">STMO Concepts</a></h2>
<p>Analyses in STMO are constructed from the following building blocks:</p>
<ul>
<li>Queries</li>
<li>Visualizations</li>
<li>Dashboards</li>
</ul>
<h4 id="queries"><a class="header" href="#queries">Queries</a></h4>
<p>STMO's basic unit of analysis is a query. A query is a block of SQL code that
extracts and optionally transforms data from a single data source. Queries
can vary widely in complexity. Some queries are simple one-liners (e.g., <code>SELECT some_field FROM tablename LIMIT 10</code>) while others span many pages, almost like small programs.</p>
<p>The raw output from a query is tabular data. Each row represents one set of
return values for the query's output columns. You can run a query manually or specify a refresh schedule that executes it automatically at a specified interval.</p>
<h4 id="visualizations"><a class="header" href="#visualizations">Visualizations</a></h4>
<p>Tabular data is great, but rarely is a grid of values the best way to make
sense of your data. You can associate each query with multiple visualizations, each of which renders the extracted data in a format that makes it easier to interpret.</p>
<p>There are many visualization types, including charts (line, bar, area, pie, etc.), counters, maps, pivot tables, and more. Each visualization type provides a set of configuration parameters that specify how to map the raw query output to the desired visualization. Some visualization types place requirements on the query output. For example, a map visualization requires each row to include a longitude value and a latitude value.</p>
<h4 id="dashboards"><a class="header" href="#dashboards">Dashboards</a></h4>
<p>A dashboard is a collection of visualizations combined into a single visual
presentation for your convenience. A dashboard is decoupled from any particular query. Although you can include multiple visualizations from a single query in one dashboard, it is not required. Users can add any visualizations that they can access to any dashboards they have created.</p>
<h2 id="data-sources"><a class="header" href="#data-sources">Data Sources</a></h2>
<p>SQL provides the ability to extract and manipulate the data, but you won't get
very far without having some familiarity with what data is actually available,
and how that data is structured. Each query in STMO is associated with exactly
one data source, and you have to know ahead of time which data source contains
the information that you need. One of the most commonly used data sources is
called <em>Telemetry (BigQuery)</em>, which contains most of the data that is
obtained from telemetry pings received from Firefox clients. <em>BigQuery</em>
refers to Google's <a href="https://cloud.google.com/bigquery/">BigQuery</a> data warehouse.</p>
<p>Other available data sources include <em>Crash DB</em>, <em>Tiles</em>, <em>Sync Stats</em>, <em>Push</em>,
<em>Test Pilot</em>, and even a <em>Redash metadata</em> source, which connects to STMO's
own Redash database.</p>
<h2 id="create-an-example-dashboard"><a class="header" href="#create-an-example-dashboard">Create an Example Dashboard</a></h2>
<p>The following topics describe the process of creating a simple dashboard using STMO.</p>
<h4 id="create-a-query"><a class="header" href="#create-a-query">Create A Query</a></h4>
<p>Let's start by creating a query. The first query counts the number of client ids that Mozilla receives from each country, for the top N countries. If you click the 'New Query' button that is located at the top on the left-hand side of the site, the query editing page appears:</p>
<p><img src="tools/../assets/STMO_screenshots/new_query.png" alt="New Query Page" title="New Query page" /></p>
<p>For this and most other queries where client IDs are counted, you want to use
<a href="tools/../datasets/bigquery/clients_last_seen/reference.html"><code>clients_last_seen</code></a>,
which is generated from Firefox telemetry pings.</p>
<ul>
<li>
<p>Search for the table in <code>Telemetry (BigQuery)</code></p>
<p>Click the 'Data Source' drop-down and select <code>Telemetry (BigQuery)</code>. Then, search for the table by typing <code>clients_last_seen</code> in the &quot;Search schema...&quot; field that is above the schema browser interface to the left of the main query Edit field.</p>
</li>
</ul>
<p>You should see a <code>clients_last_seen</code> entry (appearing as <code>telemetry.clients_last_seen</code>). You may also see versioned copies of the tables as <code>telemetry.clients_last_seen_v&lt;VERSION&gt;</code>.</p>
<ul>
<li>
<p>Introspect the available columns</p>
<p>Click <code>telemetry.clients_last_seen</code> in the schema browser to display the columns that are available in the table. The following columns are of interest for this query:</p>
<ul>
<li><code>country</code></li>
<li><code>days_since_seen</code></li>
<li><code>submission_date</code>.</li>
</ul>
</li>
</ul>
<p>A query that extracts all unique country values and the MAU (monthly active users) for each one on a single day, sorted from highest MAU to lowest, looks like this:</p>
<pre><code class="language-sql">SELECT
country,
COUNTIF(days_since_seen &lt; 28) AS mau
FROM
telemetry.clients_last_seen
WHERE
submission_date = '2019-04-01'
GROUP BY
country
ORDER BY
mau DESC
</code></pre>
<p>If you type this query into the main query Edit field and then click the &quot;Execute&quot;
button, a blue bar appears below the Edit field. It includes the text &quot;Executing query...&quot;, followed by a timer that indicates how long the query has been running. After some period of time, usually less than a minute, the query completes and a table appears that displays a MAU value for each country. You have just created and run your first STMO query!</p>
<p>Next, click the large &quot;New Query&quot; text located at the top of the page. An Edit field appears so you can rename the query. It is recommended that you assign a unique prefix (such as your name) to the query to make it easy to find your query later. For example, <code>rmiller:Top Countries</code>.</p>
<h4 id="create-a-visualization"><a class="header" href="#create-a-visualization">Create A Visualization</a></h4>
<p>After you have created a query, you may want to provide a simple visualization. The table with results from the first query execution now appears under the query Edit field. Another heading titled <code>+NEW VISUALIZATION</code> appears next to the <code>TABLE</code> heading:</p>
<p><img src="tools/../assets/STMO_screenshots/new_visualization.png" alt="New Visualization" title="New Visualization" /></p>
<p>Click the <code>+NEW VISUALIZATION</code> link to display the &quot;Visualization Editor&quot; screen. You can now specify a visualization name (&quot;Top Countries bar chart&quot;), a chart type (&quot;Bar&quot;), an x-axis column (<code>country</code>), and a y-axis column (<code>mau</code>):</p>
<p><img src="tools/../assets/STMO_screenshots/vis_editor.png" alt="Visualization Editor" title="Visualization Editor" /></p>
<p>After the <code>GENERAL</code> settings have been specified, you need to modify additional settings on the <code>X AXIS</code> tab. Click this tab and then change the 'Scale' setting to 'Category', and un-check the 'Sort Values' checkbox to enable the query's sort order to take precedence:</p>
<p><img src="tools/../assets/STMO_screenshots/x_axis_editor.png" alt="Visualization X Axis" title="Visualization X Axis" /></p>
<h4 id="a-note-about-limits"><a class="header" href="#a-note-about-limits">A Note About Limits</a></h4>
<p>After you have saved the visualization settings and displayed the query source page, a bar graph appears near the bottom of the page. The graph includes quite a few entries. Rather than being able to view <em>all</em> of the countries, you may want to display only the first 20 entries by adding a <code>LIMIT</code> clause to the end of a query:</p>
<pre><code class="language-sql">SELECT
country,
COUNTIF(days_since_seen &lt; 28) AS mau
FROM
telemetry.clients_last_seen
WHERE
submission_date = '2019-04-01'
GROUP BY
country
ORDER BY
mau DESC
LIMIT
20
</code></pre>
<p>If you edit the query to add a limit clause and then click 'Execute', a new bar graph only displays the 20 countries that have the highest number of unique clients. In this case, the full result set includes approximately 250 return values: limiting the result count improves readability.</p>
<p>In other cases, however, an unlimited query may return thousands or even millions of rows. Such queries are not only unreadable but can also negatively impact performance for all other users of STMO. Thus an important warning:</p>
<p><strong>ALL QUERIES SHOULD INCLUDE A &quot;LIMIT&quot; STATEMENT BY DEFAULT!</strong></p>
<p>It is highly recommended that you add a &quot;LIMIT 100&quot; clause to the end of all new queries to prevent a query from returning a large result set that causes user interface (UI) and performance problems. You may learn that the total result set is small enough that setting a limit becomes unnecessary. Specifying an explicit LIMIT helps prevent unnecessary issues.</p>
<h4 id="query-parameters"><a class="header" href="#query-parameters">Query Parameters</a></h4>
<p>You can add user arguments to a query, which allows the user to specify parameters without modifying the query itself. Using the query in the previous section as an example, you can replace the <code>LIMIT 20</code> with a country count variable in double curly braces:</p>
<pre><code class="language-sql">SELECT
country,
COUNTIF(days_since_seen &lt; 28) AS mau
FROM
telemetry.clients_last_seen
WHERE
submission_date = '2019-04-01'
GROUP BY
country
ORDER BY
mau DESC
LIMIT
{{country_count}}
</code></pre>
<p>After you have replaced the hard-coded limit value with <code>{{country_count}}</code>, an input field appears above the bar chart. If you type a numeric value in the input field and click 'Execute', the query will run with the specified limit. Click 'Save' to save the query. The query applies the parameter value that you typed as the default.</p>
<h4 id="create-a-dashboard"><a class="header" href="#create-a-dashboard">Create A Dashboard</a></h4>
<p>You can create a dashboard to display visualizations by selecting 'New Dashboard' from the 'Dashboards' dropdown that is located at the top left of the page. Choose a name for your dashboard and an empty page appears. Click the '...' button near the top right of the page and select 'Add Widget'. The following dialog box appears:</p>
<p><img src="tools/../assets/STMO_screenshots/add_widget.png" alt="Add Widget" title="Add Widget" /></p>
<p>Type the unique prefix that you used in the query name in the &quot;Search a query by name&quot; field to locate the query that you have created. You will not find the query yet because it has not been published; only published queries appear in the search results on the summary pages. Even though this is only an exercise, the query must be published briefly so that it can be added to the dashboard. You can publish your query by clicking &quot;Publish&quot; on the query source page.</p>
<p>As soon as a query is published, it appears in the search results when you type a unique prefix in the &quot;Search a query by name&quot; field on the &quot;Add Widget&quot; dialog box. When you select a query, you can select a query's visualizations from the &quot;Choose Visualization&quot; dropdown. Select the bar chart that you created and then click &quot;Add to Dashboard&quot;. The dashboard now includes a bar chart. You can now edit the <code>country_count</code> value and click &quot;Refresh&quot; to change the number of countries that are included in the chart.</p>
<h4 id="completing-the-dashboard"><a class="header" href="#completing-the-dashboard">Completing the Dashboard</a></h4>
<p>A dashboard with just one graph is often insufficient, so you may want to create additional queries, each with a very similar bar chart. The SQL for these queries is listed below; you will need to create the queries and visualizations yourself and then link them to the dashboard. The queries are as follows:</p>
<ul>
<li>Top OSes (recommended <code>os_count</code> value == 6)</li>
</ul>
<pre><code class="language-sql">SELECT
os,
COUNTIF(days_since_seen &lt; 28) AS mau
FROM
telemetry.clients_last_seen
WHERE
submission_date = '2019-04-01'
GROUP BY
os
ORDER BY
mau DESC
LIMIT
{{os_count}}
</code></pre>
<ul>
<li>Channel Counts</li>
</ul>
<pre><code class="language-sql">SELECT
normalized_channel AS channel,
COUNTIF(days_since_seen &lt; 28) AS mau
FROM
telemetry.clients_last_seen
WHERE
submission_date = '2019-04-01'
GROUP BY
channel
ORDER BY
mau DESC
</code></pre>
<ul>
<li>App Version Counts (recommended <code>app_version_count</code> value == 20)</li>
</ul>
<pre><code class="language-sql">SELECT
app_name,
app_version,
COUNTIF(days_since_seen &lt; 28) AS mau
FROM
telemetry.clients_last_seen
WHERE
submission_date = '2019-04-01'
GROUP BY
app_name,
app_version
ORDER BY
mau DESC
LIMIT
{{app_version_count}}
</code></pre>
<p>Creating bar charts for these queries and adding them to the original dashboard
can result in a dashboard that resembles this:</p>
<p><img src="tools/../assets/STMO_screenshots/finished_dashboard.png" alt="Completed Dashboard" title="Completed Dashboard" /></p>
<p>Some final notes to help you create your dashboards:</p>
<ul>
<li>
<p>Remember to publish each of your queries before adding its visualizations to a dashboard.</p>
</li>
<li>
<p>Similarly, it is recommended to un-publish any test queries after you have used
them in a dashboard. This prevents everyone's search results from being contaminated
with your tests and experiments. Any queries that result from an actual
work-related analysis typically remain published. Other users can view these queries and
learn from them.</p>
</li>
<li>
<p>The 'Firefox' label on the 'App Version counts' graph is related to the use
of the 'Group by' visualization setting. It is recommended that you experiment with
the use of 'Group by' in your graphs to learn more about its usage.</p>
</li>
<li>
<p>This tutorial has only touched the surface of the wide variety of
sophisticated visualizations that STMO supports. You can view many
more sophisticated queries and dashboards by browsing and exploring
the work that has been published by others.</p>
</li>
<li>
<p>The <a href="https://redash.io/help/">Redash help center</a> is a useful resource for exploring Redash and all its capabilities.</p>
</li>
</ul>
<h4 id="prototyping-queries"><a class="header" href="#prototyping-queries">Prototyping Queries</a></h4>
<p>You may want to start working on a query before data becomes available.
With most of the data sources, you can do this by constructing a static test data
set and working with it as usual. You can also use this method to explore
how a particular SQL backend behaves.</p>
<p>Note that <code>UNION ALL</code> will retain duplicate rows while <code>UNION</code> will discard them.</p>
<p>Here are a couple of examples:</p>
<p><strong>Simple three-column test dataset</strong></p>
<pre><code class="language-sql">WITH test AS (
SELECT 1 AS client_id, 'foo' AS v1, 'bar' AS v2 UNION ALL
SELECT 2 AS client_id, 'bla' AS v1, 'baz' AS v2 UNION ALL
SELECT 3 AS client_id, 'bla' AS v1, 'bar' AS v2 UNION ALL
SELECT 2 AS client_id, 'bla' AS v1, 'baz' AS v2 UNION ALL
SELECT 3 AS client_id, 'bla' AS v1, 'bar' AS v2
)
SELECT * FROM test
</code></pre>
<p><strong>Convert a semantic version string to a sortable array field</strong></p>
<pre><code class="language-sql">WITH foo AS (
SELECT '1.0.1' AS v UNION
SELECT '1.10.3' AS v UNION
SELECT '1.0.2' AS v UNION
SELECT '1.1' AS v UNION
-- Doesn't work with these types of strings due to casting
-- SELECT '1.3a1' AS v UNION
SELECT '1.2.1' AS v
)
SELECT cast(split(v, '.') AS array&lt;bigint&gt;) FROM foo ORDER BY 1
</code></pre>
<p><strong>How do boolean fields get parsed from strings?</strong></p>
<pre><code class="language-sql">WITH bar AS (
SELECT '1' AS b UNION
SELECT '0' UNION
SELECT 't' UNION
SELECT 'f' UNION
SELECT 'true' UNION
SELECT 'false' UNION
SELECT 'turkey'
)
SELECT b, try(cast(b AS boolean)) from bar
</code></pre>
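<p>Note that some of the syntax in the examples above (such as <code>try(cast(...))</code> and <code>array&lt;bigint&gt;</code>) is not valid BigQuery SQL. If you are prototyping against a BigQuery data source, a rough equivalent of the boolean-parsing example would look like the following sketch; BigQuery requires <code>UNION ALL</code> or <code>UNION DISTINCT</code> and provides <code>SAFE_CAST</code>, which returns <code>NULL</code> instead of raising an error:</p>
<pre><code class="language-sql">-- BigQuery-flavored sketch of the boolean-parsing experiment.
-- Only 'true'/'false' (case-insensitive) cast to BOOL; everything else,
-- including '1' and '0', yields NULL with SAFE_CAST.
WITH bar AS (
  SELECT '1' AS b UNION ALL
  SELECT '0' UNION ALL
  SELECT 'true' UNION ALL
  SELECT 'false' UNION ALL
  SELECT 'turkey'
)
SELECT b, SAFE_CAST(b AS BOOL) AS parsed FROM bar
</code></pre>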
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/tools/stmo.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="accessing-public-data"><a class="header" href="#accessing-public-data">Accessing Public Data</a></h1>
<p>A public dataset is a dataset in <a href="https://cloud.google.com/bigquery">BigQuery</a> which is made available to the general public
in BigQuery or through our <a href="https://public-data.telemetry.mozilla.org">public HTTP endpoint</a>.</p>
<h2 id="table-of-contents-2"><a class="header" href="#table-of-contents-2">Table of Contents</a></h2>
<ul>
<li><a href="cookbooks/public_data.html#accessing-public-data-in-bigquery">Accessing Public Data in BigQuery</a></li>
<li><a href="cookbooks/public_data.html#accessing-public-data-through-the-public-http-endpoint">Accessing Public Data Through the Public HTTP Endpoint</a></li>
<li><a href="cookbooks/public_data.html#let-us-know">Let us know!</a></li>
</ul>
<h2 id="accessing-public-data-in-bigquery"><a class="header" href="#accessing-public-data-in-bigquery">Accessing Public Data in BigQuery</a></h2>
<p>To access public datasets in BigQuery, a <a href="https://cloud.google.com">Google Cloud Platform</a> (GCP) account is required.
GCP also offers <a href="https://cloud.google.com/free">a free tier</a>, which provides free credits that can be used to run queries in BigQuery. The <a href="https://cloud.google.com/blog/products/data-analytics/query-without-a-credit-card-introducing-bigquery-sandbox">BigQuery sandbox</a> enables users to use BigQuery for free without providing payment information.</p>
<p>To get started, log into the <a href="https://console.cloud.google.com/bigquery">BigQuery console</a> or use the
<a href="https://cloud.google.com/bigquery/docs/bq-command-line-tool">BigQuery command line tools</a> to <a href="https://cloud.google.com/appengine/docs/standard/nodejs/building-app/creating-project">create a new project</a>.
After selecting the project, Mozilla's public datasets in the <code>mozilla-public-data</code> project can
be accessed and queried. For example:</p>
<pre><code class="language-sql">SELECT *
FROM `mozilla-public-data.telemetry_derived.ssl_ratios_v1`
WHERE submission_date = &quot;2020-04-16&quot;
</code></pre>
<h2 id="accessing-public-data-through-the-public-http-endpoint"><a class="header" href="#accessing-public-data-through-the-public-http-endpoint">Accessing Public Data Through the Public HTTP Endpoint</a></h2>
<p>Some BigQuery datasets are also published as gzipped JSON files through the public HTTP endpoint:
<a href="https://public-data.telemetry.mozilla.org">https://public-data.telemetry.mozilla.org</a>.</p>
<p>A list of available public datasets is published at <a href="https://public-data.telemetry.mozilla.org/all-datasets.json">https://public-data.telemetry.mozilla.org/all-datasets.json</a>.
This list contains the names of the available datasets, additional metadata, and links to the
storage locations of the files containing the data.</p>
<p>For example:</p>
<pre><code class="language-json">{
&quot;telemetry_derived&quot;: {
// ^ dataset name
&quot;deviations&quot;: {
// ^ table name
&quot;v1&quot;: {
// ^ table version
&quot;friendly_name&quot;: &quot;Deviations&quot;,
&quot;description&quot;: &quot;Deviation of different metrics from forecast.&quot;,
&quot;incremental&quot;: true,
&quot;incremental_export&quot;: false,
&quot;review_link&quot;: &quot;https://bugzilla.mozilla.org/show_bug.cgi?id=1624528&quot;,
&quot;files_uri&quot;: &quot;https://public-data.telemetry.mozilla.org/api/v1/tables/telemetry_derived/deviations/v1/files&quot;,
&quot;last_updated&quot;: &quot;https://public-data.telemetry.mozilla.org/api/v1/tables/telemetry_derived/deviations/v1/last_updated&quot;
}
},
&quot;ssl_ratios&quot;: {
&quot;v1&quot;: {
&quot;friendly_name&quot;: &quot;SSL Ratios&quot;,
&quot;description&quot;: &quot;Percentages of page loads Firefox users have performed that were conducted over SSL broken down by country.&quot;,
&quot;incremental&quot;: true,
&quot;incremental_export&quot;: false,
&quot;review_link&quot;: &quot;https://bugzilla.mozilla.org/show_bug.cgi?id=1414839&quot;,
&quot;files_uri&quot;: &quot;https://public-data.telemetry.mozilla.org/api/v1/tables/telemetry_derived/ssl_ratios/v1/files&quot;,
&quot;last_updated&quot;: &quot;https://public-data.telemetry.mozilla.org/api/v1/tables/telemetry_derived/ssl_ratios/v1/last_updated&quot;
}
}
// [...]
}
}
</code></pre>
<p>The keys within each dataset have the following meanings:</p>
<ul>
<li><code>incremental</code>:
<ul>
<li><code>true</code>: data gets incrementally updated which means that new data gets added periodically
(for most datasets on a daily basis)</li>
<li><code>false</code>: the entire table data gets updated periodically</li>
</ul>
</li>
<li><code>incremental_export</code>:
<ul>
<li><code>true</code>: data for each <code>submission_date</code> gets exported into separate directories (e.g.
<code>files/2020-04-15</code>, <code>files/2020-04-16</code>, ...)</li>
<li><code>false</code>: all data gets exported into one <code>files/</code> directory</li>
</ul>
</li>
<li><code>review_link</code>: links to the Bugzilla bug for the data review</li>
<li><code>files_uri</code>: lists links to all available data files</li>
<li><code>last_updated</code>: link to a <code>last_updated</code> file containing the timestamp for when the data files were
last updated</li>
</ul>
<p>Data files are gzipped and up to 1 GB in size. If the data exceeds 1 GB, then it gets split up into multiple
files named <code>000000000000.json</code>, <code>000000000001.json</code>, ...
For example: <a href="https://public-data.telemetry.mozilla.org/api/v1/tables/telemetry_derived/ssl_ratios/v1/files/000000000000.json">https://public-data.telemetry.mozilla.org/api/v1/tables/telemetry_derived/ssl_ratios/v1/files/000000000000.json</a></p>
<h2 id="let-us-know"><a class="header" href="#let-us-know">Let us know!</a></h2>
<p>If this public data has proved useful to your research, or you've built a cool visualization with it, let us know! You can email <a href="mailto:publicdata@mozilla.com"><code>publicdata@mozilla.com</code></a> or reach us on the <a href="https://chat.mozilla.org/#/room/#telemetry:mozilla.org">#telemetry:mozilla.org</a> channel on <a href="https://wiki.mozilla.org/Matrix">Mozilla's instance of matrix</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/public_data.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="accessing-and-working-with-bigquery"><a class="header" href="#accessing-and-working-with-bigquery">Accessing and working with BigQuery</a></h1>
<p>With the transition to <a href="https://cloud.google.com">GCP</a> completed in 2019, BigQuery has become Mozilla's primary data warehouse and SQL query engine.</p>
<p>The following topics provide an introduction to working with data that is stored
in <a href="https://cloud.google.com/bigquery/">BigQuery</a>:</p>
<ul>
<li><a href="cookbooks/./bigquery/access.html">Accessing BigQuery</a></li>
<li><a href="cookbooks/./bigquery/querying.html">Writing BigQuery Queries</a></li>
<li><a href="cookbooks/./bigquery/optimization.html">Optimizing BigQuery Queries</a></li>
</ul>
<p>There is a <a href="https://cloud.google.com/bigquery/pricing">cost associated with using BigQuery</a> based on operations. The on-demand pricing for queries is based on how much data a query scans. Before using BigQuery, please see <a href="cookbooks/./bigquery/optimization.html">Optimizing BigQuery Queries</a> above for information on how to understand and minimize costs.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/bigquery.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="accessing-bigquery"><a class="header" href="#accessing-bigquery">Accessing BigQuery</a></h1>
<p>There are many methods that you can use to access BigQuery: both interactive and programmatic. This document provides some basic information and pointers on how to get started with each.</p>
<p>It is worth pointing out that all internal access to BigQuery is logged and periodically audited by Data Engineering and Operations for cost and other purposes.</p>
<h2 id="table-of-contents-3"><a class="header" href="#table-of-contents-3">Table of Contents</a></h2>
<ul>
<li><a href="cookbooks/bigquery/access.html#interfaces">Interfaces</a>
<ul>
<li><a href="cookbooks/bigquery/access.html#stmo-sqltelemetrymozillaorg">STMO (<code>sql.telemetry.mozilla.org</code>)</a></li>
<li><a href="cookbooks/bigquery/access.html#bigquery-console">BigQuery Console</a></li>
<li><a href="cookbooks/bigquery/access.html#using-the-bq-command-line-tool">Using the <code>bq</code> Command-Line Tool</a></li>
<li><a href="cookbooks/bigquery/access.html#api-access">API Access</a>
<ul>
<li><a href="cookbooks/bigquery/access.html#service-accounts">Service Accounts</a></li>
</ul>
</li>
<li><a href="cookbooks/bigquery/access.html#spark">Spark</a></li>
<li><a href="cookbooks/bigquery/access.html#colaboratory">Colaboratory</a></li>
<li><a href="cookbooks/bigquery/access.html#ai-platform-notebooks">AI Platform Notebooks</a>
<ul>
<li><a href="cookbooks/bigquery/access.html#notebooks-access-to-workgroup-confidential-datasets">Notebooks Access to workgroup-confidential Datasets</a></li>
</ul>
</li>
</ul>
</li>
<li><a href="cookbooks/bigquery/access.html#bigquery-access-request">BigQuery Access Request</a></li>
</ul>
<h2 id="interfaces"><a class="header" href="#interfaces">Interfaces</a></h2>
<h3 id="stmo-sqltelemetrymozillaorg-1"><a class="header" href="#stmo-sqltelemetrymozillaorg-1">STMO (<code>sql.telemetry.mozilla.org</code>)</a></h3>
<blockquote>
<p><strong>⚠</strong> Queries made from STMO are read-only: you cannot create views or tables.</p>
</blockquote>
<p>All users with access to <a href="cookbooks/bigquery/../../tools/stmo.html">STMO</a> can access BigQuery using the following data sources:</p>
<ul>
<li><code>Telemetry (BigQuery)</code></li>
<li><code>Telemetry Search (BigQuery)</code></li>
</ul>
<h3 id="bigquery-console"><a class="header" href="#bigquery-console">BigQuery Console</a></h3>
<p>The BigQuery console is similar to STMO, but allows write access to views and tables. Some
people also prefer its user interface, though note that results that you get from it can
only be shared with others who also have BigQuery access provisioned.</p>
<ul>
<li>Visit <a href="https://console.cloud.google.com/bigquery?project=mozdata">GCP BigQuery Console <code>mozdata</code></a></li>
<li>Use <code>mozdata</code> or switch to the project provided to you during your access request, e.g. <code>moz-fx-data-bq-&lt;team-name&gt;</code></li>
<li>Write and run your queries</li>
</ul>
<p>Note that if you are trying to query telemetry datasets from a team-specific project,
you will need to explicitly specify
the project (<code>mozdata</code>) that the view lives in, since you're querying from a different one. For example:</p>
<pre><code class="language-sql">SELECT
client_id
FROM
mozdata.telemetry.main
WHERE
DATE(submission_timestamp) = '2020-04-20'
AND sample_id = 42
AND application.channel='nightly'
</code></pre>
<p>For more details, see <a href="https://cloud.google.com/bigquery/docs/bigquery-web-ui">Google's Documentation on the GCP Console</a>.</p>
<h3 id="using-the-bq-command-line-tool"><a class="header" href="#using-the-bq-command-line-tool">Using the <code>bq</code> Command-Line Tool</a></h3>
<p>Steps to use:</p>
<ul>
<li>Install the <a href="https://cloud.google.com/sdk/docs/install-sdk">GCP SDK</a></li>
<li>Authorize <code>gcloud</code> with either your user account or provisioned service account. See documentation <a href="https://cloud.google.com/sdk/docs/authorizing">here</a>.
<ul>
<li><code>gcloud auth login</code></li>
</ul>
</li>
<li>Set your google project to <code>mozdata</code>
<ul>
<li><code>gcloud config set project mozdata</code></li>
</ul>
</li>
<li>Set your google project to your team project if you were given one during your access request.
<ul>
<li><code>gcloud config set project moz-fx-data-bq-&lt;team-name&gt;</code></li>
</ul>
</li>
</ul>
<p>Once configured, you can now use the <code>bq</code> command-line client. The following example
lists the tables and views in a BigQuery dataset:</p>
<pre><code class="language-bash">bq ls mozdata:telemetry
</code></pre>
<p>And here's another which gets the count of entries in <code>telemetry.main</code> on <code>2019-08-22</code> in the nightly channel:</p>
<pre><code class="language-bash">bq query --nouse_legacy_sql 'select count(*) from mozdata.telemetry.main where date(submission_timestamp) = &quot;2019-08-22&quot; and normalized_channel=&quot;nightly&quot;'
</code></pre>
<p>Additional examples and documentation can be found <a href="https://cloud.google.com/bigquery/docs/bq-command-line-tool">in the BigQuery command-line reference</a>.</p>
<h3 id="api-access"><a class="header" href="#api-access">API Access</a></h3>
<p>For advanced use cases involving programmatic access -- including automated workloads, ETL, and high-throughput reads via the <a href="https://cloud.google.com/bigquery/docs/reference/storage/">BigQuery Storage API</a> -- you can access BigQuery programmatically.</p>
<p>You can locate a list of supported BigQuery client libraries <a href="https://cloud.google.com/bigquery/docs/reference/libraries">here</a>.</p>
<p>Although you typically want to use a client library, Google also provides a <a href="https://cloud.google.com/bigquery/docs/reference/rest/">detailed reference of their underlying REST API</a>.</p>
<h4 id="service-accounts"><a class="header" href="#service-accounts">Service Accounts</a></h4>
<p>Client SDKs do not access credentials the same way as the <code>gcloud</code> and <code>bq</code>
command-line tools. The client SDKs generally assume that the machine is configured with
a service account and look for JSON-based credentials in several well-known locations
rather than looking for user credentials.</p>
<p>If you have service account credentials, you can point client SDKs at them
by setting:</p>
<pre><code class="language-bash">export GOOGLE_APPLICATION_CREDENTIALS=/path/to/creds.json
</code></pre>
<p>If you do not have appropriate service account credentials, but your GCP user
account has sufficient access, you can have your user credentials mimic a
service account by running:</p>
<pre><code class="language-bash">gcloud auth application-default login
</code></pre>
<p>Or activate both personal credentials and application default credentials in one command:</p>
<pre><code class="language-bash">gcloud auth login --update-adc
</code></pre>
<p>Once you've followed the browser flow to grant access, you should be able to,
for example, access BigQuery from Python:</p>
<pre><code class="language-bash">pip install google-cloud-bigquery
python -c 'from google.cloud import bigquery; print([d.dataset_id for d in bigquery.Client().list_datasets()])'
</code></pre>
<h3 id="spark"><a class="header" href="#spark">Spark</a></h3>
<p><a href="https://spark.apache.org/">Apache Spark</a> is a data processing engine designed to be fast and easy to use. There are several methods you can use to access BigQuery via Spark, depending on your needs. See <a href="cookbooks/bigquery/../../tools/spark.html">Custom Analysis with Spark</a> for more information and examples.</p>
<h3 id="colaboratory"><a class="header" href="#colaboratory">Colaboratory</a></h3>
<p><a href="https://colab.research.google.com">Colaboratory</a> (Colab) is Jupyter notebook environment, managed by Google and running in the cloud. Notebooks are stored in Google Drive and can be shared in a similar way to Google Docs.</p>
<p>Colab can be used to easily access BigQuery and perform analyses. See the <a href="https://colab.research.google.com/drive/1uXmrPnqzDATiCVH2RNJKD8obIZuofFHx"><code>Telemetry Hello World</code> notebook</a> for an interactive example. Under the hood, it uses the BigQuery API to read and write to BigQuery tables, so access needs to be explicitly provisioned.</p>
<h3 id="ai-platform-notebooks"><a class="header" href="#ai-platform-notebooks">AI Platform Notebooks</a></h3>
<p><a href="https://cloud.google.com/ai-platform/notebooks/docs">AI Platform Notebooks</a> is a managed JupyterLab service running on GCP. It gives you full control over the machine where your notebooks are running - you can install your own libraries and choose machine size depending on your needs.</p>
<p>To start, go to <a href="https://console.cloud.google.com">GCP console</a> and make sure you are in the correct project - most likely this will be your team project. Then navigate to the Notebooks page in the sidebar under AI Platform &gt; Notebooks (<a href="https://console.cloud.google.com/ai-platform/notebooks/list/instances">direct link</a>). There you can create new notebook server instances and connect to them (when your instance is ready, you'll see an <code>Open JupyterLab</code> button).</p>
<p>Please note that by default JupyterLab saves notebook files only locally, so they are lost if your instance is deleted. To make sure you don't lose your work, either push your files to a Git repository (via the pre-installed Git extension) or upload them to GCS (using the <code>gsutil</code> command in a terminal session).</p>
<h4 id="notebooks-access-to-workgroup-confidential-datasets"><a class="header" href="#notebooks-access-to-workgroup-confidential-datasets">Notebooks Access to workgroup-confidential Datasets</a></h4>
<p>If you are a member of a restricted access workgroup, you can provision AI notebooks in the <a href="https://console.cloud.google.com/vertex-ai/workbench/list/instances?project=mozdata&amp;supportedpurview=project"><code>mozdata GCP project</code></a> that can read workgroup-confidential data.</p>
<blockquote>
<p><strong>⚠</strong> You must provision AI notebooks in <code>mozdata</code> using a nonstandard service account specific to your workgroup, see below.</p>
</blockquote>
<p>When you create a notebook server, under &quot;Advanced Options&quot; / &quot;Permissions&quot;, deselect &quot;Use Compute Engine Default Service Account&quot; and replace it with the service account associated with your workgroup. You may need to type this service account manually as it will not be available from a drop-down menu to all users. The ID of the service account matches the following pattern:</p>
<p><code>WORKGROUP-SUBGROUP@mozdata.iam.gserviceaccount.com</code></p>
<p>For example, if you are member of <code>workgroup:search-terms/aggregated</code>, use <code>search-terms-aggregated@mozdata.iam.gserviceaccount.com</code>.</p>
<p>This notebook server should have access to any restricted access datasets that are accessible to <code>workgroup:search-terms/aggregated</code>. Additionally, this notebooks server will not have write access to the standard <code>mozdata.analysis</code> dataset, but will instead have write access to a workgroup-specific dataset that looks like the following:</p>
<p><code>mozdata.WORKGROUP_SUBGROUP_analysis</code></p>
<p>In the example above this maps to <code>mozdata.search_terms_aggregated_analysis</code>.</p>
<h2 id="bigquery-access-request"><a class="header" href="#bigquery-access-request">BigQuery Access Request</a></h2>
<blockquote>
<p><strong>⚠</strong> Access to BigQuery via the <code>mozdata</code> GCP project is granted to Mozilla Staff by default; only file an access request if you need other specific access such as via a teams project</p>
</blockquote>
<p>For access to BigQuery using projects other than <code>mozdata</code>, <a href="https://mozilla-hub.atlassian.net/secure/CreateIssueDetails!init.jspa?pid=10058&amp;issuetype=10007&amp;priority=3&amp;customfield_10014=DSRE-87&amp;summary=BigQuery%20GCP%20Console%20and%20API%20Access%20for%20YOUR_EMAIL_HERE&amp;description=My%20request%20information%0A%3D%3D%3D%3D%3D%3D%3D%3D%0Amozilla.com%20ldap%20login%3A%0Ateam%3A%0Aaccess%20required%3A%20BigQuery%20GCP%20console%20and%20API%20Access%3B%20ENTER%20OTHER%20ACCESS%20REQUESTS%20HERE%0A%0APost%20request%0A%3D%3D%3D%3D%3D%3D%3D%3D%0ASee%20GCP%20console%20and%20other%20access%20methods%20docs%20here%3A%20https%3A%2F%2Fdocs.telemetry.mozilla.org%2Fcookbooks%2Fbigquery">file a bug (requires access to Mozilla Jira)</a>.
If you require access to AI Notebooks or Dataproc, please specify in the bug and a team project will be provisioned for you.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/bigquery/access.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="querying-bigquery-tables"><a class="header" href="#querying-bigquery-tables">Querying BigQuery Tables</a></h1>
<h2 id="table-of-contents-4"><a class="header" href="#table-of-contents-4">Table of Contents</a></h2>
<ul>
<li><a href="cookbooks/bigquery/querying.html#projects-datasets-and-tables-in-bigquery">Projects, Datasets, and Tables in BigQuery</a>
<ul>
<li><a href="cookbooks/bigquery/querying.html#caveats">Caveats</a></li>
<li><a href="cookbooks/bigquery/querying.html#projects-with-bigquery-datasets">Projects with BigQuery datasets</a></li>
<li><a href="cookbooks/bigquery/querying.html#table-layout-and-naming">Table Layout and Naming</a></li>
<li><a href="cookbooks/bigquery/querying.html#structure-of-ping-tables-in-bigquery">Structure of Ping Tables in BigQuery</a></li>
</ul>
</li>
<li><a href="cookbooks/bigquery/querying.html#writing-queries">Writing Queries</a>
<ul>
<li><a href="cookbooks/bigquery/querying.html#writing-query-results-to-a-permanent-table">Writing query results to a permanent table</a></li>
<li><a href="cookbooks/bigquery/querying.html#writing-results-to-gcs-object-store">Writing results to GCS (object store)</a></li>
</ul>
</li>
<li><a href="cookbooks/bigquery/querying.html#creating-a-view">Creating a View</a></li>
<li><a href="cookbooks/bigquery/querying.html#using-udfs">Using UDFs</a></li>
<li><a href="cookbooks/bigquery/querying.html#accessing-map-like-fields">Accessing map-like fields</a></li>
<li><a href="cookbooks/bigquery/querying.html#accessing-histograms">Accessing histograms</a></li>
</ul>
<h2 id="projects-datasets-and-tables-in-bigquery"><a class="header" href="#projects-datasets-and-tables-in-bigquery">Projects, Datasets, and Tables in BigQuery</a></h2>
<p>In GCP a <a href="https://cloud.google.com/resource-manager/docs/creating-managing-projects">project</a> enables you to organize cloud resources. Mozilla uses multiple
projects to maintain BigQuery <a href="https://cloud.google.com/bigquery/docs/datasets-intro">datasets</a>.</p>
<blockquote>
<p><strong>Note</strong>: The term <em>dataset</em> has historically been used to describe a set of records that all follow the same schema, but this idea corresponds to a <em>table</em>
in BigQuery. In BigQuery terminology,
datasets represent top-level containers that are used to organize and
control access to tables and views.</p>
</blockquote>
<h3 id="caveats"><a class="header" href="#caveats">Caveats</a></h3>
<ul>
<li>Each derived table is exposed by a view in the <code>mozdata</code> project and in many cases, these views apply additional filters or business logic. For this reason, the best practice is to always query the views instead of directly querying the underlying table(s).</li>
<li>Most tables are partitioned by date and you can scan much less data by filtering on the partition field
(usually <code>submission_timestamp</code> or <code>submission_date</code>).
These dates are always in UTC.</li>
<li>Unqualified queries can become very costly very easily. Restrictions have been placed on large tables to avoid accidentally querying &quot;all data for all time&quot;. You must use the date partition fields for large tables (like <code>main_summary</code> or <code>clients_daily</code>).</li>
<li>Read the <a href="cookbooks/bigquery/./optimization.html"><em>Query Optimization Cookbook</em></a> that includes recommendations on how to reduce cost and improve query performance.</li>
<li>STMO BigQuery data sources have a 10 TB data-scanned limit for each query. <a href="cookbooks/bigquery/../../concepts/getting_help.html">Let us know</a> if this becomes an issue.</li>
<li>There is no native map support in BigQuery. Instead, maps are represented as arrays of structs with fields named <code>key</code> and <code>value</code>. Convenience functions are available to access these like key-value maps, as described <a href="cookbooks/bigquery/querying.html#accessing-map-like-fields">below</a>.</li>
</ul>
<h3 id="projects-with-bigquery-datasets"><a class="header" href="#projects-with-bigquery-datasets">Projects with BigQuery datasets</a></h3>
<div class="table-wrapper"><table><thead><tr><th>Project</th><th>Dataset</th><th>Purpose</th></tr></thead><tbody>
<tr><td><code>mozdata</code></td><td></td><td>The primary home for user analysis; it has a short name that is easy to type and is filled with views that reference underlying tables in <code>moz-fx-data-shared-prod</code>; the default project for STMO and Looker</td></tr>
<tr><td></td><td><code>analysis</code></td><td>User-generated tables for analysis; please prefix tables with your username</td></tr>
<tr><td></td><td><code>tmp</code></td><td>User-generated tables for ephemeral analysis results; tables created here are automatically deleted after 7 days.</td></tr>
<tr><td></td><td><code>telemetry</code></td><td>Views into legacy desktop telemetry pings and many derived tables; see <em>user-facing (unsuffixed) datasets</em> below</td></tr>
<tr><td></td><td><code>&lt;namespace&gt;</code></td><td>See <em>user-facing (unsuffixed) datasets</em> below</td></tr>
<tr><td></td><td><code>search</code></td><td>Search data imported from parquet (<em>restricted</em>)</td></tr>
<tr><td></td><td><code>static</code></td><td>Static tables, often useful for data-enriching joins</td></tr>
<tr><td></td><td><code>udf</code></td><td>Internal persistent user-defined functions defined in SQL; see <a href="cookbooks/bigquery/querying.html#using-udfs">Using UDFs</a></td></tr>
<tr><td></td><td><code>udf_js</code></td><td>Internal user-defined functions defined in JavaScript; see <a href="cookbooks/bigquery/querying.html#using-udfs">Using UDFs</a></td></tr>
<tr><td><code>mozfun</code></td><td></td><td>The primary home for user-defined functions; see <a href="cookbooks/bigquery/querying.html#using-udfs">Using UDFs</a></td></tr>
<tr><td><code>moz-fx-data-bq-&lt;team-name&gt;</code></td><td></td><td>Some teams have specialized needs and can be provisioned a team-specific project</td></tr>
<tr><td><code>moz-fx-data-shared-prod</code></td><td></td><td>All production data including full pings and derived datasets defined in <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a></td></tr>
<tr><td></td><td><code>&lt;namespace&gt;_live</code></td><td>See <em>live datasets</em> below</td></tr>
<tr><td></td><td><code>&lt;namespace&gt;_stable</code></td><td>See <em>stable datasets</em> below</td></tr>
<tr><td></td><td><code>&lt;namespace&gt;_derived</code></td><td>See <em>derived datasets</em> below</td></tr>
<tr><td></td><td><code>&lt;product&gt;_external</code></td><td>Tables that reference external resources; these may be native BigQuery tables populated by a job that queries a third-party API, or they may be <a href="https://cloud.google.com/bigquery/external-data-sources">federated data sources</a> that pull data from other GCP services like GCS at query time.</td></tr>
<tr><td></td><td><code>backfill</code></td><td>Temporary staging area for back-fills</td></tr>
<tr><td></td><td><code>blpadi</code></td><td>Blocklist ping derived data (<em>restricted</em>)</td></tr>
<tr><td></td><td><code>payload_bytes_raw</code></td><td>Raw JSON payloads as received from clients, used for reprocessing scenarios, a.k.a. &quot;landfill&quot; (<em>restricted</em>)</td></tr>
<tr><td></td><td><code>payload_bytes_error</code></td><td><code>gzip</code>-compressed JSON payloads that were rejected in some phase of the pipeline; particularly useful for investigating schema validation errors</td></tr>
<tr><td></td><td><code>tmp</code></td><td>Temporary staging area for parquet data loads</td></tr>
<tr><td></td><td><code>validation</code></td><td>Temporary staging area for validation</td></tr>
<tr><td><code>moz-fx-data-derived-datasets</code></td><td></td><td>Legacy project that was a precursor to <code>mozdata</code></td></tr>
<tr><td><code>moz-fx-data-shar-nonprod-efed</code></td><td></td><td>Non-production data produced by stage ingestion infrastructure</td></tr>
</tbody></table>
</div>
<h3 id="table-layout-and-naming"><a class="header" href="#table-layout-and-naming">Table Layout and Naming</a></h3>
<p>Under the single <code>moz-fx-data-shared-prod</code> project,
each document namespace (corresponding to folders underneath the <a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/tree/master/schemas">schemas directory of <code>mozilla-pipeline-schemas</code></a>) has four BigQuery datasets provisioned with the following properties:</p>
<ul>
<li><em>Live datasets</em> (<code>telemetry_live</code>, <code>activity_stream_live</code>, etc.) contain live ping tables (see definitions of table types in the next paragraph)</li>
<li><em>Stable datasets</em> (<code>telemetry_stable</code>, <code>activity_stream_stable</code>, etc.) contain historical ping tables</li>
<li><em>Derived datasets</em> (<code>telemetry_derived</code>, <code>activity_stream_derived</code>, etc.) contain derived tables, primarily populated via nightly queries defined in <a href="https://github.com/mozilla/bigquery-etl">BigQuery ETL</a> and managed by Airflow</li>
<li><em>User-facing (unsuffixed) datasets</em> (<code>telemetry</code>, <code>activity_stream</code>, etc.) contain user-facing views on top of the tables in the corresponding stable and derived datasets.</li>
</ul>
<p>The table and view types referenced above are defined as follows:</p>
<ul>
<li><em>Live ping tables</em> are the final destination for the <a href="https://mozilla.github.io/gcp-ingestion/">telemetry ingestion pipeline</a>. Dataflow jobs process incoming ping payloads from clients, batch them together by document type, and load the results to these tables approximately every five minutes, although a few document types are opted in to a more expensive streaming path that makes records available in BigQuery within seconds of ingestion. These tables are partitioned by date according to <code>submission_timestamp</code> and are also clustered on that same field, so it is possible to make efficient queries over short windows of recent data such as the last hour. They have a rolling expiration period of 30 days, but that window may be shortened in the future. Analyses should only use these tables if they need results for the current (partial) day.</li>
<li><em>Historical ping tables</em> have exactly the same schema as their corresponding live ping tables, but they are populated only once per day (<code>12:00:00am</code> to <code>11:59:59pm</code> UTC) via an Airflow job and have a 25 month retention period. These tables are superior to the live ping tables for historical analysis because they never contain partial days, they have additional deduplication applied, and they are clustered on <code>sample_id</code>, allowing efficient queries on a 1% sample of clients. It is guaranteed that <code>document_id</code> is distinct within each day of each historical ping table, but it is still possible for a document to appear multiple times if a client sends the same payload across multiple UTC days. Note that this requirement is relaxed for older telemetry ping data that was backfilled from AWS; approximately 0.5% of documents are duplicated in <code>telemetry.main</code> and other historical ping tables for 2019-04-30 and earlier dates.</li>
<li><em>Derived tables</em> are populated by nightly <a href="https://workflow.telemetry.mozilla.org/home">Airflow</a> jobs and are considered an implementation detail; their structure may change at any time at the discretion of the data platform team to allow refactoring or efficiency improvements.</li>
<li><em>User-facing views</em> are the schema objects that users are primarily expected to use in analyses. Many of these views correspond directly to an underlying historical ping table or derived table, but they provide the flexibility to hide deprecated columns or present additional calculated columns to users. These views are the schema contract with users and they should not change in backwards-incompatible ways without a version increase or an announcement to users about a breaking change.</li>
</ul>
<p>Spark and other applications relying on the BigQuery Storage API for data access need to reference derived tables or historical ping tables directly rather than user-facing views. Unless the query result is relatively large, we recommend instead that users run a query on top of user-facing views with the output saved in a destination table, which can then be accessed from Spark.</p>
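<p>For illustration, the same &quot;main&quot; ping data can be referenced at each of these layers. This is a minimal sketch: the <code>_v4</code> version suffix shown here applies to the &quot;main&quot; document type and varies for other pings.</p>
<pre><code class="language-sql">-- User-facing view: the recommended entry point for analysis
SELECT COUNT(*)
FROM mozdata.telemetry.main
WHERE DATE(submission_timestamp) = '2021-01-01';

-- Underlying stable ping table (version suffix varies by document type)
SELECT COUNT(*)
FROM `moz-fx-data-shared-prod.telemetry_stable.main_v4`
WHERE DATE(submission_timestamp) = '2021-01-01';

-- Live ping table: only needed for the current (partial) day
SELECT COUNT(*)
FROM `moz-fx-data-shared-prod.telemetry_live.main_v4`
WHERE DATE(submission_timestamp) = CURRENT_DATE;
</code></pre>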
<h3 id="structure-of-ping-tables-in-bigquery"><a class="header" href="#structure-of-ping-tables-in-bigquery">Structure of Ping Tables in BigQuery</a></h3>
<p>Unlike with the previous AWS-based data infrastructure, we don't have different mechanisms for accessing entire pings vs. &quot;summary&quot; tables. As such, there are no longer special libraries or infrastructure necessary for accessing full pings, rather each document type maps to a user-facing view that can be queried in STMO. For example:</p>
<ul>
<li>&quot;main&quot; pings are accessible from view <code>telemetry.main</code> (<a href="cookbooks/bigquery/../../datasets/main_ping_tables.html">see docs for faster-to-query tables</a>)</li>
<li>&quot;crash&quot; pings are accessible from view <code>telemetry.crash</code></li>
<li>&quot;baseline&quot; pings for the release version of Firefox for Android (Fenix) are accessible from view <code>org_mozilla_firefox.baseline</code></li>
</ul>
<p>All fields in the incoming pings are accessible in these views, and (where possible) match the nested data structures of the original JSON. Field names are converted from <code>camelCase</code> form to <code>snake_case</code> for consistency and SQL compatibility.</p>
<p>Any fields not present in the ping schemas are present in an <code>additional_properties</code> field containing leftover JSON. BigQuery provides <a href="https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions">functions for parsing and manipulating JSON data via SQL</a>.</p>
<p>Later in this document, we demonstrate the use of a few Mozilla-specific
functions that we have defined to allow ergonomic querying of
<a href="cookbooks/bigquery/querying.html#accessing-map-like-fields">map-like fields</a> (which are represented as arrays of structs in BigQuery) and
<a href="cookbooks/bigquery/querying.html#accessing-histograms">histograms</a> (which are encoded as raw JSON strings).</p>
<h2 id="writing-queries"><a class="header" href="#writing-queries">Writing Queries</a></h2>
<p>To query a BigQuery table you will need to specify the dataset and table name. It is good practice to also specify the project; depending on which project the query
originates from, this may be optional.</p>
<pre><code class="language-sql">SELECT
col1,
col2
FROM
`project.dataset.table`
WHERE
  -- date_partition_field will vary based on the table
date_partition_field &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 1 MONTH)
</code></pre>
<p>An example query from the <a href="cookbooks/bigquery/../../datasets/bigquery/clients_last_seen/reference.html">Clients Last Seen Reference</a>:</p>
<pre><code class="language-sql">SELECT
submission_date,
os,
COUNT(*) AS count
FROM
mozdata.telemetry.clients_last_seen
WHERE
submission_date &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 1 WEEK)
AND days_since_seen = 0
GROUP BY
submission_date,
os
HAVING
count &gt; 10 -- remove outliers
AND lower(os) NOT LIKE '%windows%'
ORDER BY
os,
submission_date DESC
</code></pre>
<p>Check out the <a href="https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators">BigQuery Standard SQL Functions &amp; Operators</a> for detailed documentation.</p>
<h3 id="writing-query-results-to-a-permanent-table"><a class="header" href="#writing-query-results-to-a-permanent-table">Writing query results to a permanent table</a></h3>
<p>You can write query results to a BigQuery table you have access to via the <a href="cookbooks/bigquery/access.html#gcp-bigquery-console">GCP BigQuery Console</a> or <a href="cookbooks/bigquery/access.html#gcp-bigquery-api-access">GCP BigQuery API Access</a>.</p>
<ul>
<li>For temporary experiments use <code>mozdata.tmp</code> (tables there are automatically deleted after 7 days). For longer-lived results, use the <code>mozdata.analysis</code> dataset (see the sketch after this list).
<ul>
<li>Prefix your table with your username. If your username is <code>username@mozilla.com</code>, create a table named <code>username_my_table</code>.</li>
</ul>
</li>
<li>See <a href="https://cloud.google.com/bigquery/docs/writing-results">Writing query results</a> documentation for detailed steps.</li>
</ul>
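<p>For example, a minimal sketch of saving query results to a personal table in the <code>analysis</code> dataset directly from SQL (the table name here is hypothetical):</p>
<pre><code class="language-sql">-- The table name is hypothetical; prefix it with your username
CREATE OR REPLACE TABLE mozdata.analysis.username_dau_by_os AS
SELECT
  submission_date,
  os,
  COUNT(*) AS dau
FROM
  mozdata.telemetry.clients_last_seen
WHERE
  submission_date &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 1 WEEK)
  AND days_since_seen = 0
GROUP BY
  submission_date,
  os
</code></pre>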
<h3 id="writing-results-to-gcs-object-store"><a class="header" href="#writing-results-to-gcs-object-store">Writing results to GCS (object store)</a></h3>
<p>If a BigQuery table is not a suitable destination for your analysis results,
we also have a GCS bucket available for storing them. This is most commonly
needed by Spark jobs, but query results can also be exported there directly from SQL (see the sketch after the list below).</p>
<ul>
<li>Use bucket <code>gs://mozdata-analysis/</code>
<ul>
<li>Prefix object paths with your username. If your username is <code>username@mozilla.com</code>, you might store a file to <code>gs://mozdata-analysis/username/myresults.json</code>.</li>
</ul>
</li>
</ul>
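<p>A minimal sketch of exporting query results to this bucket using BigQuery's <code>EXPORT DATA</code> statement (the object path and query are hypothetical; the URI must contain a <code>*</code> wildcard):</p>
<pre><code class="language-sql">EXPORT DATA OPTIONS (
  uri = 'gs://mozdata-analysis/username/my_results_*.csv',  -- prefix the path with your username
  format = 'CSV',
  overwrite = true,
  header = true
) AS
SELECT
  submission_date,
  COUNT(*) AS clients
FROM
  mozdata.telemetry.clients_last_seen
WHERE
  submission_date = '2021-01-01'
  AND days_since_seen = 0
GROUP BY
  submission_date
</code></pre>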
<h2 id="creating-a-view"><a class="header" href="#creating-a-view">Creating a View</a></h2>
<p>You can create views in BigQuery if you have access via <a href="cookbooks/bigquery/access.html#gcp-bigquery-console">GCP BigQuery Console</a> or <a href="cookbooks/bigquery/access.html#gcp-bigquery-api-access">GCP BigQuery API Access</a>.</p>
<ul>
<li>Use the <code>mozdata.analysis</code> dataset.
<ul>
<li>Prefix your view with your username. If your username is <code>username@mozilla.com</code>, create a view named <code>username_my_view</code> (see the sketch after this list).</li>
</ul>
</li>
<li>See <a href="https://cloud.google.com/bigquery/docs/views">Creating Views</a> documentation for detailed steps.</li>
</ul>
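<p>For example, a minimal sketch of creating a personal view in the <code>analysis</code> dataset (the view name and logic are hypothetical):</p>
<pre><code class="language-sql">-- The view name is hypothetical; prefix it with your username
CREATE OR REPLACE VIEW mozdata.analysis.username_my_view AS
SELECT
  submission_date,
  os,
  COUNT(*) AS dau
FROM
  mozdata.telemetry.clients_last_seen
WHERE
  days_since_seen = 0
GROUP BY
  submission_date,
  os
</code></pre>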
<h2 id="using-udfs"><a class="header" href="#using-udfs">Using UDFs</a></h2>
<p>BigQuery offers <a href="https://cloud.google.com/bigquery/docs/reference/standard-sql/user-defined-functions">user-defined functions</a> (UDFs) that can be defined in SQL or JavaScript as part of a query or as a persistent function stored in a dataset. We have defined a suite of public persistent functions to enable transformations specific to our data formats, available in <a href="https://mozilla.github.io/bigquery-etl/"><code>mozfun</code></a>. UDFs used internally in <code>moz-fx-data-shared-prod</code> are available in datasets <code>udf</code> (for functions defined in SQL) and <code>udf_js</code> (for functions defined in JavaScript). Note that JavaScript functions are potentially much slower than those defined in SQL, so use functions in <code>udf_js</code> with some caution, likely only after performing aggregation in your query.</p>
<p>We document a few of the most broadly useful UDFs below, but you can see the full list of <code>mozfun</code> UDFs at <a href="https://mozilla.github.io/bigquery-etl/">https://mozilla.github.io/bigquery-etl</a>, and the UDFs (with source code) used within <code>moz-fx-data-shared-prod</code> in <a href="https://github.com/mozilla/bigquery-etl/tree/master/sql/moz-fx-data-shared-prod/udf"><code>bigquery-etl/sql/moz-fx-data-shared-prod/udf</code></a> and <a href="https://github.com/mozilla/bigquery-etl/tree/master/sql/moz-fx-data-shared-prod/udf_js"><code>bigquery-etl/sql/moz-fx-data-shared-prod/udf_js</code></a>.</p>
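<p>For illustration, UDFs can also be defined inline as temporary functions scoped to a single query. A minimal sketch (the function below is hypothetical and not part of <code>mozfun</code>):</p>
<pre><code class="language-sql">-- A temporary SQL UDF, visible only within this query
CREATE TEMP FUNCTION bytes_to_mb(x INT64) AS (x / (1024 * 1024));

SELECT
  bytes_to_mb(10485760) AS ten_mb
</code></pre>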
<h2 id="accessing-map-like-fields"><a class="header" href="#accessing-map-like-fields">Accessing map-like fields</a></h2>
<p>BigQuery currently lacks native map support and our workaround is to represent maps as arrays of STRUCTs with fields named <code>key</code> and <code>value</code>. We've created a UDF that provides key-based access with the signature: <code>mozfun.map.get_key(&lt;map&gt;, &lt;key&gt;)</code>. The example below generates a count per <code>reason</code> key in the <code>event_map_values</code> field in the telemetry events table for Normandy unenrollment events from yesterday.</p>
<pre><code class="language-sql">SELECT mozfun.map.get_key(event_map_values, 'reason') AS reason,
COUNT(*) AS EVENTS
FROM telemetry.events
WHERE submission_date = DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)
AND event_category='normandy'
AND event_method='unenroll'
GROUP BY 1
ORDER BY 2 DESC
</code></pre>
<h2 id="accessing-histograms"><a class="header" href="#accessing-histograms">Accessing histograms</a></h2>
<p>We considered many potential ways to represent histograms as BigQuery fields
and found the most efficient encoding was actually to leave them as raw JSON
strings. To make these strings easier to use for analysis, you can convert them
into nested structures using <code>mozfun.hist.extract</code>:</p>
<pre><code class="language-sql">WITH
extracted AS (
SELECT
submission_timestamp,
mozfun.hist.extract(payload.histograms.a11y_consumers) AS a11y_consumers
FROM
telemetry.main )
--
SELECT
a11y_consumers.bucket_count,
a11y_consumers.sum,
a11y_consumers.range[ORDINAL(1)] AS range_low,
udf.get_key(a11y_consumers.values, 11) AS value_11
FROM
extracted
WHERE
a11y_consumers.bucket_count IS NOT NULL
AND DATE(submission_timestamp) = &quot;2019-08-09&quot;
LIMIT
10
</code></pre>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/bigquery/querying.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="optimizing-bigquery-queries"><a class="header" href="#optimizing-bigquery-queries">Optimizing BigQuery Queries</a></h1>
<p>When you write a query using <a href="https://sql.telemetry.mozilla.org">STMO</a> or the BigQuery console, you can improve performance and reduce costs by learning how data is stored, how databases function, and what you can change about a query to take advantage of the storage structure and the data function.</p>
<p><a href="https://cloud.google.com/bigquery/pricing#on_demand_pricing">Queries are charged by data scanned at $5 per terabyte (TB)</a> so each 200 gigabytes of data scanned will cost $1: on tables with hundreds of TBs of data (like the <a href="cookbooks/bigquery/../../datasets/pings.html#main-ping">main ping table</a> or <a href="cookbooks/bigquery/../../datasets/batch_view/clients_daily/reference.html"><code>clients_daily</code></a>), costs <strong>can add up very quickly</strong>. When trying to reduce the cost, the main thing to do is reduce the amount of data scanned: some of the advice in this article will improve your query's performance but will not scan a smaller amount of data, and thus cost the same.</p>
<h2 id="table-of-contents-5"><a class="header" href="#table-of-contents-5">Table of Contents</a></h2>
<ul>
<li><a href="cookbooks/bigquery/optimization.html#tldr-what-to-implement-for-quick-improvements">TL;DR: What to implement for quick improvements</a>
<ul>
<li><a href="cookbooks/bigquery/optimization.html#how-to-improve-both-query-speed-and-cost">How to improve both query speed and cost</a></li>
<li><a href="cookbooks/bigquery/optimization.html#improvements-to-query-speed-only">Improvements to query speed only</a></li>
<li><a href="cookbooks/bigquery/optimization.html#caveats">Caveats</a></li>
</ul>
</li>
<li><a href="cookbooks/bigquery/optimization.html#some-explanations">Some Explanations</a>
<ul>
<li><a href="cookbooks/bigquery/optimization.html#what-are-these-databases">What are these databases?</a>
<ul>
<li><a href="cookbooks/bigquery/optimization.html#key-takeaways">Key takeaways</a></li>
</ul>
</li>
<li><a href="cookbooks/bigquery/optimization.html#how-is-the-data-stored">How is the data stored?</a>
<ul>
<li><a href="cookbooks/bigquery/optimization.html#traditional-row-stores">Traditional Row Stores</a></li>
<li><a href="cookbooks/bigquery/optimization.html#columnar-stores">Columnar Stores</a></li>
<li><a href="cookbooks/bigquery/optimization.html#data-partitions">Data partitions</a></li>
<li><a href="cookbooks/bigquery/optimization.html#data-ordering-clustering">Data Ordering (clustering)</a></li>
<li><a href="cookbooks/bigquery/optimization.html#key-takeaways-1">Key takeaways</a></li>
</ul>
</li>
</ul>
</li>
</ul>
<h2 id="tldr-what-to-implement-for-quick-improvements"><a class="header" href="#tldr-what-to-implement-for-quick-improvements">TL;DR: What to implement for quick improvements</a></h2>
<h3 id="how-to-improve-both-query-speed-and-cost"><a class="header" href="#how-to-improve-both-query-speed-and-cost">How to improve both query speed and cost</a></h3>
<ul>
<li>Filter on a partitioned column such as <code>submission_timestamp</code> or <code>submission_date</code> (<em>even</em> if you have a <code>LIMIT</code>: see <a href="cookbooks/bigquery/optimization.html#caveats">optimization caveats</a>)</li>
<li>Use a sample of the data that is based on the <code>sample_id</code> field. This can be helpful for initial development even if you later run the query using the entire
population (without sampling).
<ul>
<li>Tables that include a <code>sample_id</code> field will usually have that as one of the clustering fields and you can efficiently scan random samples of users by specifying <code>WHERE sample_id = 0</code> (1% sample), <code>WHERE sample_id &lt; 10</code> (10% sample), etc. This can be especially helpful with <code>main_summary</code>, <code>clients_daily</code>, and <code>clients_last_seen</code> which are very large tables and are all clustered on <code>sample_id</code>.</li>
</ul>
</li>
<li>Many datasets also cluster on <code>normalized_channel</code>, corresponding to the channel of the product. If you are working with data that has different channels (for example, Firefox desktop), limit your initial query to a channel with a limited population like Nightly (in the case of Firefox desktop, do this by adding <code>WHERE normalized_channel='nightly'</code> to your query)</li>
<li>Select only the columns that you want (<strong>Don't</strong> use <code>SELECT *</code>)
<ul>
<li>If you are experimenting with data or exploring data, use one of the <a href="https://cloud.google.com/bigquery/docs/best-practices-costs#preview-data">data preview options</a> instead of <code>SELECT *</code>.</li>
</ul>
</li>
<li>Reference the data size prediction (&quot;This query will process X bytes&quot;) in STMO and the BigQuery UI to help gauge the efficiency of your queries. You should see this number go down as you limit the range of <code>submission_date</code>s or include fewer fields in your <code>SELECT</code> statement. A sketch combining several of these recommendations appears after this list.</li>
</ul>
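<p>A minimal sketch combining several of these recommendations (a partition filter, a 1% sample on <code>sample_id</code>, an explicit column list, and a single channel):</p>
<pre><code class="language-sql">SELECT
  submission_date,
  COUNT(DISTINCT client_id) AS clients
FROM
  mozdata.telemetry.clients_daily
WHERE
  submission_date = '2021-01-01'      -- partition filter
  AND sample_id = 0                   -- 1% sample, uses clustering
  AND normalized_channel = 'nightly'  -- smaller population for development
GROUP BY
  submission_date
</code></pre>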
<h3 id="improvements-to-query-speed-only"><a class="header" href="#improvements-to-query-speed-only">Improvements to query speed only</a></h3>
<p>These are still worth doing!</p>
<ul>
<li>Use <a href="https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions">approximate algorithms</a>: e.g., <code>approx_count_distinct(...)</code> instead of <code>COUNT(DISTINCT ...)</code> (see the sketch after this list)</li>
<li>If using a JOIN, trim the data to be joined before the JOIN is performed. If you reduce data early in the processing cycle, shuffling and other complex operations only execute on the data that you need.
<ul>
<li>Use subqueries with filters, or intermediate tables or views, to reduce the size of each side of a join before the join itself.</li>
</ul>
</li>
</ul>
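<p>For example, a sketch of swapping an exact distinct count for an approximate one:</p>
<pre><code class="language-sql">SELECT
  submission_date,
  -- Uses a probabilistic sketch: far less data to shuffle than COUNT(DISTINCT client_id)
  APPROX_COUNT_DISTINCT(client_id) AS approx_clients
FROM
  mozdata.telemetry.clients_daily
WHERE
  submission_date = '2021-01-01'
GROUP BY
  submission_date
</code></pre>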
<h3 id="caveats-1"><a class="header" href="#caveats-1">Caveats</a></h3>
<ul>
<li>For clustered tables, the data size prediction won't take into account benefits from <code>LIMIT</code>s and <code>WHERE</code> clauses on clustering fields, so you'll need to compare to the actual &quot;Data Scanned&quot; after the query is run.</li>
<li>Applying a <code>LIMIT</code> clause to a <code>SELECT *</code> query might not affect the amount of data read, depending on the table structure.
<ul>
<li>Many of our tables are configured to use <em>clustering</em> in which case a <code>LIMIT</code> clause does effectively limit the amount of data that needs to be scanned. To check whether your <code>LIMIT</code> and <code>WHERE</code> clauses are actually improving performance, you should see a lower value reported for actual &quot;Data Scanned&quot; by a query compared to the prediction (&quot;This query will process X bytes&quot;) in STMO or the BigQuery UI.</li>
</ul>
</li>
<li>Do not treat <code>WITH</code> clauses as prepared statements
<ul>
<li><code>WITH</code> clauses are used primarily for readability because they are not materialized: if a query defined in a <code>WITH</code> clause is referenced more than once, it is executed again for each reference. Do not rely on them to optimize your query (see the sketch after this list)!</li>
</ul>
</li>
</ul>
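<p>As an illustration, the CTE in the sketch below is evaluated once per reference; if the intermediate result is expensive to compute, consider materializing it into <code>mozdata.tmp</code> or <code>mozdata.analysis</code> instead:</p>
<pre><code class="language-sql">WITH daily AS (
  SELECT
    submission_date,
    COUNT(*) AS clients
  FROM
    mozdata.telemetry.clients_daily
  WHERE
    submission_date &gt;= '2021-01-01'
  GROUP BY
    submission_date
)
SELECT
  a.submission_date,
  a.clients,
  b.clients AS clients_previous_day
FROM
  daily AS a
JOIN
  -- Second reference: `daily` is computed again here, not reused
  daily AS b
ON
  b.submission_date = DATE_SUB(a.submission_date, INTERVAL 1 DAY)
</code></pre>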
<h2 id="some-explanations"><a class="header" href="#some-explanations">Some Explanations</a></h2>
<h3 id="what-are-these-databases"><a class="header" href="#what-are-these-databases">What are these databases?</a></h3>
<p>The primary data storage mechanism used at Mozilla, BigQuery, is not a traditional relational database like PostgreSQL or MySQL. Instead it is a <em>distributed SQL engine</em> where data is stored separately from computational resources used to retrieve it.</p>
<p>Multiple machines work together to get the result of your query. Because there is more than one system, you need to pay particular attention to <em>Data Shuffles</em>: when all systems have to send data to all other systems.</p>
<p>For example, consider the following query, which lists the number of rows that are present for each
<code>client_id</code>:</p>
<pre><code class="language-sql">SELECT client_id, COUNT(*)
FROM telemetry.main
GROUP BY client_id
</code></pre>
<p>During the execution of this query, the BigQuery cluster performs the following steps:</p>
<ol>
<li>Each system reads a different piece of the data and parses the <code>client_id</code> for
each row. Internally, it then computes the number of rows seen for each <code>client_id</code>,
<em>but only for the data that it read</em>.</li>
<li>Each system is then assigned a set of <code>client_id</code>s to aggregate. For example, the first
system may be given instructions to get the count of <code>client1</code>. It then has to send a request to every other system for the total seen for <code>client1</code>. It can then aggregate the total.</li>
<li>Once every <code>client_id</code> has been aggregated, each system reports to the coordinator
the <code>client_id</code>s that it was responsible for, as well as the count of rows seen by each.
The coordinator is responsible for returning the result of the query to the client,
which in this example is STMO.</li>
</ol>
<p>A similar process occurs on data joins, where different systems are instructed to join on
different keys. In that case, data from both tables needs to be shuffled to every system.</p>
<h4 id="key-takeaways"><a class="header" href="#key-takeaways">Key takeaways</a></h4>
<ul>
<li>Use <code>LIMIT</code> for query prototyping to dramatically reduce the volume of data scanned
as well as to speed up processing.</li>
<li>Use approximate algorithms. Then less data needs to be shuffled because
probabilistic data structures can be used instead of the raw data itself.</li>
<li>Specify large tables first in a <code>JOIN</code> operation. In this case, small tables can be sent to
every system to eliminate one data shuffle operation. Note that Spark supports a <code>broadcast</code>
command explicitly.</li>
</ul>
<h3 id="how-is-the-data-stored"><a class="header" href="#how-is-the-data-stored">How is the data stored?</a></h3>
<p>The data is stored in columnar format.</p>
<h4 id="traditional-row-stores"><a class="header" href="#traditional-row-stores">Traditional Row Stores</a></h4>
<p>Consider a typical CSV file, which represents an example of a row store.</p>
<pre><code class="language-cs">name,age,height
&quot;Ted&quot;,27,6.0
&quot;Emmanuel&quot;,45,5.9
&quot;Cadence&quot;,5,3.5
</code></pre>
<p>When this data is stored to disk, you can read an entire record in consecutive order. For example, if
the first <code>&quot;</code> is stored at block 1 on disk, then a sequential scan from 1 assigns the first row of
data: <code>&quot;Ted&quot;,27,6.0</code>. Keep scanning and you get <code>\n&quot;Emm</code>... and so on.</p>
<p>Consider the following query, which can be executed very quickly:</p>
<pre><code class="language-sql">SELECT *
FROM people
WHERE name = 'Ted'
</code></pre>
<p>The database can just scan the first row of data. However, the following is more difficult:</p>
<pre><code class="language-sql">SELECT name
FROM people
</code></pre>
<p>Now the database has to read <em>all</em> of the rows and then select the <code>name</code> column, which results in a lot
more overhead.</p>
<h4 id="columnar-stores"><a class="header" href="#columnar-stores">Columnar Stores</a></h4>
<p>A columnar store turns the data sideways. For example, you can make a columnar version of the above data
and still store it in CSV format:</p>
<pre><code class="language-cs">name,&quot;Ted&quot;,&quot;Emmanuel&quot;,&quot;Cadence&quot;
age,27,45,5
height,6.0,5.9,3.5
</code></pre>
<p>Now let's consider how we can query the data when it's stored this way:</p>
<pre><code class="language-sql">SELECT *
FROM people
WHERE name = 'Ted'
</code></pre>
<p>In this case, all the data must be read because the
<code>(name, age, height)</code> values for a record are not stored together.</p>
<p>Here's another query:</p>
<pre><code class="language-sql">SELECT name
FROM people
</code></pre>
<p>In this case, only the &quot;name&quot; row needs to be read. All the other lines of the
file can be skipped.</p>
<h4 id="data-partitions"><a class="header" href="#data-partitions">Data partitions</a></h4>
<p>You can improve performance even further by taking advantage of partitions, grouping together data that shares a value for a column. For example, if everyone in the <code>people</code> table lived in <code>DE</code>, then you can add that to the filename: <code>/country=DE/people.csv</code>.</p>
<p>From there, a query engine would have to know how to read that path and understand that
all of these people share a country. You can query as follows:</p>
<pre><code class="language-sql">SELECT *
FROM people
WHERE country = 'US'
</code></pre>
<p>In this case, the query engine no longer even has to read the file. It could just look at the path and realize that there is nothing of interest.</p>
<p>Tables are usually partitioned based on dates; e.g., <code>submission_date</code> or <code>DATE(submission_timestamp)</code>.</p>
<h4 id="data-ordering-clustering"><a class="header" href="#data-ordering-clustering">Data Ordering (clustering)</a></h4>
<p>Another way to improve query performance is to select a subset of data on a field that the data is ordered by. In BigQuery, this is called &quot;clustering&quot;. A clustered field is one which is sorted in the underlying data.</p>
<p>For example, if you wanted to get all ages greater than 40 in the table above, you might query like this:</p>
<pre><code class="language-sql">SELECT age FROM people WHERE age &gt; 40
</code></pre>
<p>This would scan all of the <code>age</code> field, starting from <code>27</code>, then <code>45</code>, and ending with <code>5</code>.</p>
<p>However, if instead data was sorted on that field, the table would look like this:</p>
<pre><code class="language-cs">name,&quot;Cadence&quot;,&quot;Ted&quot;,&quot;Emmanuel&quot;
age,5,27,45
height,3.5,6.0,5.9
</code></pre>
<p>Since data is stored across different files, we could ignore any <code>age</code> files that contain no values greater than 40. So if Cadence and Ted's ages were in one file, and Emmanuel's in the next, we could skip reading that first file entirely. In that way, we can sometimes drastically reduce the amount of data we're reading.</p>
<h4 id="key-takeaways-1"><a class="header" href="#key-takeaways-1">Key takeaways</a></h4>
<ul>
<li>Limit queries to the few columns that you need, to reduce the volume of data that must be read</li>
<li>Filter the partitions to reduce the volume of data that you need</li>
<li>Filter on clustered fields</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/bigquery/optimization.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="accessing-desktop-data"><a class="header" href="#accessing-desktop-data">Accessing Desktop Data</a></h1>
<p>This document will help you find the best data source for a given analysis of Desktop Firefox. It focuses on <em>descriptive</em> datasets and does not cover anything attempting to explain <em>why</em> something is observed. This guide will help if you need to answer questions like:</p>
<ul>
<li>How many Firefox users are active in Germany?</li>
<li>How many crashes occur each day?</li>
<li>How many users have installed a specific add-on?</li>
</ul>
<p>If you want to know whether there is a causal link between two events, consider running an <a href="cookbooks/bigquery/../../concepts/experiments.html">experiment</a>.</p>
<p>There are two types of datasets that you might want to use: those based on raw pings and those derived from them.</p>
<h2 id="raw-ping-datasets"><a class="header" href="#raw-ping-datasets">Raw Ping Datasets</a></h2>
<p>We receive data from Firefox users via <strong>pings</strong>: small JSON payloads sent by clients at specified intervals.
There are many types of pings, each containing different measurements and sent for different purposes.</p>
<p>These pings are then <a href="cookbooks/bigquery/./querying.html#structure-of-ping-tables-in-bigquery">aggregated into ping-level datasets</a> that can be retrieved using BigQuery.
Pings can be difficult to work with and expensive to query: where possible, you should use a derived dataset to answer your question.</p>
<p>For more information on pings and how to use them, see <a href="cookbooks/bigquery/../../datasets/pings.html">Raw Ping Data</a>.</p>
<h2 id="derived-datasets"><a class="header" href="#derived-datasets">Derived Datasets</a></h2>
<p>Derived datasets are built using the raw ping data above with various transformations to make them easier to work with and help you avoid the pitfall of <a href="https://docs.telemetry.mozilla.org/concepts/analysis_gotchas.html#pseudo-replication">pseudo-replication</a>.
You can find a full list of them in the <a href="cookbooks/bigquery/../../datasets/derived.html">derived datasets section</a>, but two commonly used ones are &quot;Clients Daily&quot; and &quot;Clients Last Seen&quot;.</p>
<h3 id="clients-daily"><a class="header" href="#clients-daily">Clients Daily</a></h3>
<p>Many questions about Firefox take the form &quot;What did clients with
characteristics X, Y, and Z do during the period S to E?&quot; The
<code>clients_daily</code> table aims to answer these questions. Each row in
the table represents a (<code>client_id</code>, <code>submission_date</code>) pair and contains a
number of aggregates about that day's activity.</p>
<p>See the <a href="cookbooks/bigquery/../../datasets/batch_view/clients_daily/reference.html"><code>clients_daily</code> reference</a> for more information.</p>
<h3 id="clients-last-seen"><a class="header" href="#clients-last-seen">Clients Last Seen</a></h3>
<p>The <code>clients_last_seen</code> dataset is useful for efficiently determining exact
user counts such as DAU and MAU.
It can also allow efficient calculation of other windowed usage metrics like retention via its
<a href="cookbooks/bigquery/../../../cookbooks/clients_last_seen_bits.html">bit pattern fields</a>.
It includes the most recent values in a 28 day window for all columns in the
<a href="cookbooks/bigquery//datasets/batch_view/clients_daily/reference.html"><code>clients_daily</code> dataset</a>.</p>
<p>See the <a href="cookbooks/bigquery/../../datasets/bigquery/clients_last_seen/reference.html"><code>clients_last_seen</code> reference</a> for more information.</p>
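<p>For example, a minimal sketch of computing MAU for a single day using the 28-day window encoded in <code>clients_last_seen</code>:</p>
<pre><code class="language-sql">SELECT
  submission_date,
  COUNT(*) AS mau
FROM
  mozdata.telemetry.clients_last_seen
WHERE
  submission_date = '2021-01-01'
  AND days_since_seen &lt; 28
GROUP BY
  submission_date
</code></pre>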
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/bigquery/accessing_desktop_data.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="accessing-glean-data-in-bigquery"><a class="header" href="#accessing-glean-data-in-bigquery">Accessing Glean Data in BigQuery</a></h1>
<p>This document describes how to access Glean data using BigQuery, such as in <a href="https://sql.telemetry.mozilla.org">Redash</a>.
This is intended for in-depth analysis: <a href="cookbooks/../introduction/tools.html#mozilla-growth--usage-dashboard-gud">GUD</a>, <a href="cookbooks/./glam.html">GLAM</a> and <a href="cookbooks/../introduction/tools.html#looker">Looker</a> can answer many simple questions.</p>
<p>The data that Glean applications generate maps cleanly to structures we create in
BigQuery: see the section on <a href="cookbooks/../concepts/pipeline/glean_data.html">Glean Data</a> in the data pipeline
reference. The exact method to use depends on the metric type you want to access.</p>
<ul>
<li><a href="cookbooks/accessing_glean_data.html#counter-boolean-and-most-other-metrics">Counter, boolean, and most other metrics</a></li>
<li><a href="cookbooks/accessing_glean_data.html#event-metrics">Event metrics</a></li>
</ul>
<h2 id="counter-boolean-and-most-other-metrics"><a class="header" href="#counter-boolean-and-most-other-metrics">Counter, boolean, and most other metrics</a></h2>
<p>Most metrics Glean collects map to a single column in a BigQuery table.
The <a href="https://dictionary.telemetry.mozilla.org">Glean Dictionary</a> shows the mapping to access these mappings
when writing queries.
For example, say you wanted to get a count of top sites as measured in Firefox for Android.
You can get the information you need to build your query by following this procedure:</p>
<ul>
<li>Go to the <a href="https://dictionary.telemetry.mozilla.org">Glean Dictionary</a> home page.</li>
<li>Navigate to the <a href="https://dictionary.telemetry.mozilla.org/apps/fenix">Firefox for Android application</a></li>
<li>Under metrics, search for &quot;top&quot;, select <a href="https://dictionary.telemetry.mozilla.org/apps/fenix/metrics/metrics_top_sites_count"><code>metrics.top_sites_count</code></a>.</li>
<li>Scroll down to the bottom. Under BigQuery, you should see an entry like: &quot;In <code>org_mozilla_fenix.metrics</code> as <code>metrics.counter.metrics_top_sites_count</code>&quot;.
The former corresponds to the table name whilst the latter corresponds to the column name.
You can select which channel you want to view information for and the table name will update accordingly.</li>
</ul>
<p>With this information in hand, you can now proceed to writing a query. For example, to get the
average of this metric on the first of January, you could write something like this:</p>
<pre><code class="language-sql">-- Count number of pings where Fenix is the default browser
SELECT
AVG(metrics.counter.metrics_top_sites_count)
FROM
-- We give the table an alias so that the table name `metrics` and field name
-- `metrics` don't conflict.
org_mozilla_fenix.metrics AS m
WHERE
date(submission_timestamp) = '2021-01-01'
</code></pre>
<p>Note that we alias the table used in the query, otherwise the BigQuery parser gets confused.
This can also happen with the tables and columns corresponding to the events ping.
Another option is to explicitly qualify the table when selecting the column (so <code>metrics.counter.metrics_top_sites_count</code> becomes <code>metrics.metrics.counter.metrics_top_sites_count</code>):</p>
<pre><code class="language-sql">SELECT AVG(metrics.metrics.counter.metrics_top_sites_count)
FROM org_mozilla_fenix.metrics
WHERE DATE(submission_timestamp) = '2021-01-01'
</code></pre>
<h2 id="event-metrics"><a class="header" href="#event-metrics">Event metrics</a></h2>
<p>Event metrics are stored slightly differently: since each ping sent by a Glean application contains a <em>group</em> of events, they are mapped into a set of records within a single column.
To query them individually, you need to unnest them into their own table.
For example, let's say you wanted to investigate the foreground metrics for the tab engine.
You can get the information you need to build your query by following this procedure:</p>
<ul>
<li>Go to the <a href="https://dictionary.telemetry.mozilla.org">Glean Dictionary</a> home page.</li>
<li>Navigate to the <a href="https://dictionary.telemetry.mozilla.org/apps/fenix">Firefox for Android application</a></li>
<li>Under metrics, search for &quot;foreground&quot;, select <a href="https://dictionary.telemetry.mozilla.org/apps/fenix/metrics/engine_tab_foreground_metrics"><code>engine_tab.foreground_metrics</code></a>.</li>
<li>Scroll down to the bottom until you see &quot;Access&quot;. Under BigQuery, you should see an entry like: &quot;In <code>org_mozilla_fenix.events</code>&quot;.</li>
</ul>
<p>This tells you the BigQuery table in which this data is stored.
With this information, plus knowledge of the metric's category (<code>engine_tab</code>) and name (<code>foreground_metrics</code>) we now know enough to write a simple query:</p>
<pre><code class="language-sql">WITH events AS (
SELECT
submission_timestamp,
client_info.client_id,
event.timestamp AS event_timestamp,
event.category AS event_category,
event.name AS event_name,
    event.extra AS event_extra
FROM org_mozilla_fenix.events AS e
CROSS JOIN UNNEST(e.events) AS event
WHERE
    DATE(submission_timestamp) = '2021-05-03'
AND sample_id = 42 -- 1% sample for development
AND event.category = 'engine_tab'
AND event.name = 'foreground_metrics'
)
SELECT * FROM events
</code></pre>
<p>The extra fields are stored as a structure. For more information on accessing those, see <a href="cookbooks/./bigquery/querying.html#accessing-map-like-fields">accessing map-like fields in the querying documentation</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/accessing_glean_data.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="additional-properties"><a class="header" href="#additional-properties">Additional Properties</a></h1>
<p>If some field is present in a valid received <a href="cookbooks/../concepts/terminology.html#ping">ping</a>,
but is not present in the ping's <a href="cookbooks/../concepts/terminology.html#schema">schema</a>,
it doesn't have its own column to be placed into during
<a href="cookbooks/../concepts/terminology.html#ingestion">ingestion</a>.
Instead, those fields remain as raw JSON and are placed in the
<code>additional_properties</code> column of the ping's table or view.</p>
<p>This can happen for a variety of usually-temporary problems like:</p>
<ul>
<li>The latest schema hasn't yet been deployed (see <a href="cookbooks/../concepts/pipeline/schemas.html#what-does-it-mean-when-a-schema-deploy-is-blocked">&quot;What does it mean when a schema deploy is blocked?&quot;</a>)</li>
<li><a href="https://github.com/mozilla-services/mozilla-pipeline-schemas"><code>mozilla-pipeline-schemas</code></a> was not updated after a change in the data which was submitted (this shouldn't happen with <a href="cookbooks/../concepts/terminology.html#glean">Glean</a>, but can happen with some legacy data, for example the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/data/environment.html">legacy Firefox Desktop telemetry environment</a>).</li>
</ul>
<p>Until the problem is fixed, any ingested pings with these
&quot;unknown&quot; fields will have that data placed in the <code>additional_properties</code> column.
Rows ingested during that period will keep these data in the
<code>additional_properties</code> column,
even after the dataset has been updated to add the proper column.</p>
<p>To access data that's been put into the <code>additional_properties</code> column,
you'll have to parse the JSON.
Be aware that when you access the <code>additional_properties</code> field, BigQuery has
to read the entire contents, even if you're extracting just a single field.
In the case of <code>main</code> pings, <code>additional_properties</code> can be quite large, leading
to expensive queries.</p>
<h2 id="example"><a class="header" href="#example">Example</a></h2>
<p>At the beginning of February 2021, schema deploys were delayed.
So to access the newly-added parent-process Firefox Desktop probes
<code>telemetry.generated_new_client_id</code>,
<code>telemetry.state_file_save_errors</code>, and
<code>telemetry.loaded_client_id_doesnt_match_pref</code>,
we needed to locate where they would be in the payload, and use
<code>JSON_EXTRACT_SCALAR</code> to extract the scalar
(in both a JSON and Telemetry sense of the word &quot;scalar&quot;) values.</p>
<pre><code class="language-sql">SELECT
JSON_EXTRACT_SCALAR(additional_properties, &quot;$.payload.processes.parent.scalars['telemetry.generated_new_client_id']&quot;) AS generated_new_client_id,
JSON_EXTRACT_SCALAR(additional_properties, &quot;$.payload.processes.parent.scalars['telemetry.state_file_save_errors']&quot;) AS state_file_save_errors,
JSON_EXTRACT_SCALAR(additional_properties, &quot;$.payload.processes.parent.scalars['telemetry.loaded_client_id_doesnt_match_pref']&quot;) AS loaded_client_id_doesnt_match_pref,
payload.info.profile_subsession_counter AS profile_subsession_counter
FROM mozdata.telemetry.main_nightly
WHERE
submission_timestamp &gt; '2021-02-02'
AND application.build_id &gt;= '20210202095107' -- First nightly with measure 20210202095107
</code></pre>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/additional_props.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="custom-analysis-with-spark"><a class="header" href="#custom-analysis-with-spark">Custom Analysis with Spark</a></h1>
<ul>
<li><a href="tools/spark.html#introduction">Introduction</a></li>
<li><a href="tools/spark.html#accessing-bigquery-data-from-spark">Accessing BigQuery data from Spark</a>
<ul>
<li><a href="tools/spark.html#using-the-storage-api-connector">Using the Storage API Connector</a></li>
<li><a href="tools/spark.html#using-dataproc">Using Dataproc</a></li>
</ul>
</li>
<li><a href="tools/spark.html#reading-data-from-bigquery-into-spark">Reading data from BigQuery into Spark</a>
<ul>
<li><a href="tools/spark.html#storage-api">Storage API</a></li>
<li><a href="tools/spark.html#query-api">Query API</a></li>
</ul>
</li>
<li><a href="tools/spark.html#persisting-data">Persisting data</a></li>
</ul>
<h2 id="introduction-2"><a class="header" href="#introduction-2">Introduction</a></h2>
<p><a href="https://spark.apache.org/">Apache Spark</a> is a general-purpose cluster computing system - it allows users to
run general execution graphs. APIs are available in Python, Scala, R, and Java. It is designed to be fast and easy to use.</p>
<p>Here are some useful introductory materials:</p>
<ul>
<li><a href="https://spark.apache.org/docs/latest/programming-guide.html">Spark Programming Guide</a></li>
<li><a href="https://spark.apache.org/docs/latest/sql-programming-guide.html">Spark SQL Programming Guide</a></li>
</ul>
<p>Spark can be used from <a href="https://cloud.google.com/dataproc/">Google's Dataproc</a>, and works with data stored in BigQuery.</p>
<p>There are a number of methods of both reading from and writing to BigQuery using Spark.</p>
<h2 id="accessing-bigquery-data-from-spark"><a class="header" href="#accessing-bigquery-data-from-spark">Accessing BigQuery data from Spark</a></h2>
<h3 id="using-the-storage-api-connector"><a class="header" href="#using-the-storage-api-connector">Using the Storage API Connector</a></h3>
<blockquote>
<p><strong>⚠</strong> This method requires <a href="tools/../cookbooks/bigquery/access.html#bigquery-access-request">BigQuery Access</a> to be provisioned.</p>
</blockquote>
<p>If you want to use Spark locally (or via an arbitrary GCP instance in the cloud), we recommend the <a href="https://github.com/GoogleCloudPlatform/spark-bigquery-connector">Storage API Connector</a> for accessing BigQuery tables in Spark as it is the most modern and actively developed connector. It works well with the BigQuery client library which is useful if you need to run arbitrary SQL queries and load their results into Spark.</p>
<h3 id="using-dataproc"><a class="header" href="#using-dataproc">Using Dataproc</a></h3>
<blockquote>
<p><strong>⚠</strong> This method requires <a href="tools/../cookbooks/bigquery/access.html#bigquery-access-request">BigQuery Access</a> to be provisioned.</p>
</blockquote>
<p>Dataproc is Google's managed Spark cluster service.</p>
<p>You can spin up a Dataproc cluster with Jupyter using the following command. Insert your own values for <code>cluster-name</code>, <code>bucket-name</code>, and <code>project-id</code>. Your notebooks are stored in Cloud Storage under <code>gs://bucket-name/notebooks/jupyter</code>:</p>
<pre><code class="language-bash">gcloud beta dataproc clusters create cluster-name \
--optional-components=ANACONDA,JUPYTER \
--image-version=1.4 \
--enable-component-gateway \
--properties=^#^spark:spark.jars=gs://spark-lib/bigquery/spark-bigquery-latest.jar \
--num-workers=3 \
--max-idle=3h \
--bucket bucket-name \
--region=us-west1 \
--project project-id
</code></pre>
<p>You can retrieve the Jupyter URL with the following command:</p>
<pre><code class="language-bash">gcloud beta dataproc clusters describe cluster-name --region=us-west1 --project project-id | grep Jupyter
</code></pre>
<p>After you've finished your work, it's a good practice to delete your cluster:</p>
<pre><code class="language-bash">gcloud beta dataproc clusters delete cluster-name --region=us-west1 --project project-id --quiet
</code></pre>
<h2 id="reading-data-from-bigquery-into-spark"><a class="header" href="#reading-data-from-bigquery-into-spark">Reading data from BigQuery into Spark</a></h2>
<p>There are two main ways to read data from BigQuery into Spark: the Storage API and the
Query API.</p>
<h3 id="storage-api"><a class="header" href="#storage-api">Storage API</a></h3>
<p>The first is the Storage API, which bypasses BigQuery's execution engine and
reads directly from the underlying storage.</p>
<p>This is the preferred method of loading data from BigQuery into Spark.</p>
<p>It is more efficient for reading large amounts of data into Spark, and
supports basic column and partitioning filters.</p>
<p>Example of using the Storage API from Databricks:</p>
<pre><code class="language-python">dbutils.library.installPyPI(&quot;google-cloud-bigquery&quot;, &quot;1.16.0&quot;)
dbutils.library.restartPython()
from google.cloud import bigquery
def get_table(view):
&quot;&quot;&quot;Helper for determining what table underlies a user-facing view, since the Storage API can't read views.&quot;&quot;&quot;
bq = bigquery.Client()
view = view.replace(&quot;:&quot;, &quot;.&quot;)
# partition filter is required, so try a couple options
for partition_column in [&quot;DATE(submission_timestamp)&quot;, &quot;submission_date&quot;]:
try:
job = bq.query(
f&quot;SELECT * FROM `{view}` WHERE {partition_column} = CURRENT_DATE&quot;,
bigquery.QueryJobConfig(dry_run=True),
)
break
except Exception:
continue
else:
raise ValueError(&quot;could not determine partition column&quot;)
assert len(job.referenced_tables) == 1, &quot;View combines multiple tables&quot;
table = job.referenced_tables[0]
return f&quot;{table.project}:{table.dataset_id}.{table.table_id}&quot;
# Read one day of main pings and select a subset of columns.
core_pings_single_day = spark.read.format(&quot;bigquery&quot;) \
.option(&quot;table&quot;, get_table(&quot;moz-fx-data-shared-prod.telemetry.main&quot;)) \
.load() \
    .where(&quot;submission_timestamp &gt;= to_date('2019-08-25') AND submission_timestamp &lt; to_date('2019-08-26')&quot;) \
.select(&quot;client_id&quot;, &quot;experiments&quot;, &quot;normalized_channel&quot;)
</code></pre>
<p>A couple of things are worth noting in the above example.</p>
<ul>
<li><code>get_table</code> is necessary because an actual <em>table</em> name is required to read
from BigQuery here, fully qualified with project name and dataset name.
The Storage API does not support accessing <code>VIEW</code>s, so the convenience names
such as <code>telemetry.core</code> are not available via this API.</li>
<li>You must supply a filter on the table's date partitioning column, in this
case <code>submission_timestamp</code>.
Additionally, you must use the <code>to_date</code> function to make sure that predicate
push-down works properly for these filters.</li>
</ul>
<h3 id="query-api"><a class="header" href="#query-api">Query API</a></h3>
<p>If you want to read the results of a query (rather than directly reading
tables), you may also use the Query API.</p>
<p>This pushes the execution of the query into BigQuery's computation engine,
and is typically suitable for reading smaller amounts of data. If you need
to read large amounts of data, prefer the Storage API as described above.</p>
<p>Example:</p>
<pre><code class="language-python">from google.cloud import bigquery
bq = bigquery.Client()
query = &quot;&quot;&quot;
SELECT
event_string_value,
count(distinct client_id) AS client_count
FROM
mozdata.telemetry.events
WHERE
event_category = 'normandy'
AND event_method = 'enroll'
AND submission_date = '2019-06-01'
GROUP BY
event_string_value
ORDER BY
client_count DESC
LIMIT 20
&quot;&quot;&quot;
query_job = bq.query(query)
# Wait for query execution, then fetch results as a pandas dataframe.
rows = query_job.result().to_dataframe()
</code></pre>
<h2 id="persisting-data"><a class="header" href="#persisting-data">Persisting data</a></h2>
<p>You can save data resulting from your Spark analysis as a <a href="tools/../cookbooks/bigquery/querying.html#writing-query-results-to-a-permanent-table">BigQuery table</a>
or to <a href="tools/../cookbooks/bigquery/querying.html#writing-results-to-gcs-object-store">Google Cloud Storage</a>.</p>
<p>You can also save data to the <a href="https://docs.databricks.com/user-guide/databricks-file-system.html#dbfs">Databricks Filesystem</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/tools/spark.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="dataset-specific"><a class="header" href="#dataset-specific">Dataset Specific</a></h1>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/dataset_specific.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="working-with-normandy-events"><a class="header" href="#working-with-normandy-events">Working with Normandy events</a></h1>
<p>A common request is to count the number of users who have
enrolled or unenrolled from a SHIELD experiment.</p>
<p>The <a href="cookbooks/../datasets/batch_view/events/reference.html"><code>events</code> table</a>
includes Normandy enrollment and unenrollment events
for both pref-flip and add-on studies.
Note that the events table is updated nightly.</p>
<p>Normandy events have <code>event_category</code> <code>normandy</code>.
The <code>event_string_value</code> will contain the experiment slug (for pref-flip experiments)
or name (for add-on experiments).</p>
<p>Normandy events are described in detail in the
<a href="https://firefox-source-docs.mozilla.org/toolkit/components/normandy/normandy/data-collection.html#enrollment">Firefox source tree docs</a>.</p>
<p>Note that add-on studies do not have branch information in the events table,
since the add-ons themselves, not Normandy, are responsible for branch assignment.
For studies built with the obsolete <a href="https://github.com/mozilla/shield-studies-addon-utils">add-on utilities</a>,
branch assignments are published to the
<a href="cookbooks/../datasets/experiment_telemetry.html#telemetryshield_study">shield_study</a> dataset.</p>
<h2 id="counting-pref-flip-enrollment-events-by-branch"><a class="header" href="#counting-pref-flip-enrollment-events-by-branch">Counting pref-flip enrollment events by branch</a></h2>
<p>The <code>event_map_values</code> column of enroll events contains a <code>branch</code> key,
describing which branch the user enrolled in.</p>
<p>To fetch a count of events by branch in BigQuery SQL:</p>
<pre><code class="language-sql">SELECT
submission_date,
udf.get_key(event_map_values, 'branch') AS branch,
COUNT(*) AS n
FROM telemetry.events
WHERE
event_category = 'normandy'
AND event_method = 'enroll'
AND event_string_value = '{{experiment_slug}}'
AND submission_date &gt;= '{{experiment_start}}'
GROUP BY 1, 2
ORDER BY 1, 2
</code></pre>
<h2 id="counting-pref-flip-unenrollment-events-by-branch"><a class="header" href="#counting-pref-flip-unenrollment-events-by-branch">Counting pref-flip unenrollment events by branch</a></h2>
<p>The <code>event_map_values</code> column of unenroll events includes a <code>reason</code> key.
Reasons are described in the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/normandy/normandy/data-collection.html#enrollment">Normandy docs</a>.
Normal unenroll events at the termination of a study will occur for the reason <code>recipe-not-seen</code>.</p>
<p>To fetch a count of events by reason and branch:</p>
<pre><code class="language-sql">SELECT
submission_date,
udf.get_key(event_map_values, 'branch') AS branch,
udf.get_key(event_map_values, 'reason') AS reason,
COUNT(*) AS n
FROM telemetry.events
WHERE
event_category = 'normandy'
AND event_method = 'unenroll'
AND event_string_value = '{{experiment_slug}}'
AND submission_date &gt;= '{{experiment_start}}'
GROUP BY 1, 2, 3
ORDER BY 1, 2, 3
</code></pre>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/normandy_events.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="working-with-crash-pings"><a class="header" href="#working-with-crash-pings">Working with Crash Pings</a></h1>
<p>You can use the following snippets to start querying <a href="cookbooks/../datasets/pings.html#crash-ping">crash pings</a> with <a href="cookbooks/../tools/stmo.html">STMO</a> and
<a href="cookbooks/../cookbooks/bigquery.html">BigQuery</a>. Using these tools, you can quickly get counts
and other information about crash pings that are submitted day-to-day.</p>
<p>The following example simply counts all crash pings submitted over the past week, grouped by day:</p>
<pre><code class="language-sql">SELECT date(submission_timestamp) AS crash_date,
count(*) AS crash_count
FROM telemetry.crash
WHERE date(submission_timestamp) &gt;= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
GROUP BY crash_date
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/67925/"><code>STMO#67925</code></a>.</p>
<p>The total crash count alone is not always useful; you will often want to restrict
the query to a particular channel or other dimensions, and facet the results. To do so, add a few more fields to the SQL:</p>
<pre><code class="language-sql">SELECT date(submission_timestamp) AS crash_date,
normalized_os AS os,
count(*) AS crash_count
FROM telemetry.crash
WHERE date(submission_timestamp) &gt;= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
AND normalized_channel='nightly'
GROUP BY normalized_os,
crash_date
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/67927/"><code>STMO#67927</code></a></p>
<p>These are just initial examples. You can query across all the fields in
a telemetry crash ping, which provides useful information about the crashes themselves. You can view a summary of the available fields in the STMO schema browser, referring to <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/data/crash-ping.html">the documentation on the Firefox crash ping</a>
for more information where necessary.</p>
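<p>To go one step further, you can also facet on the build that crashed. The following sketch assumes that
<code>application.build_id</code> is populated for crash pings just as it is for main pings; confirm the field in the
STMO schema browser before relying on it:</p>
<pre><code class="language-sql">SELECT substr(application.build_id, 1, 8) AS build_day,
       normalized_os AS os,
       count(*) AS crash_count
FROM telemetry.crash
WHERE date(submission_timestamp) &gt;= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
  AND normalized_channel='nightly'
GROUP BY build_day,
         os
</code></pre>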
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/crash_pings.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="working-with-bit-patterns-in-clients-last-seen"><a class="header" href="#working-with-bit-patterns-in-clients-last-seen">Working with Bit Patterns in Clients Last Seen</a></h1>
<p>Monthly active users (MAU) is a windowed metric that requires joining data
per client across 28 days. Calculating this from individual pings or daily
aggregations can be computationally expensive, which motivated creation of the
<a href="cookbooks/../datasets/bigquery/clients_last_seen/reference.html"><code>clients_last_seen</code> dataset</a>
for desktop Firefox and similar datasets for other applications.</p>
<p>A powerful feature of the <code>clients_last_seen</code> methodology is that it doesn't
record specific metrics like MAU and WAU directly, but rather each row stores
a history of the discrete days on which a client was active in the past 28 days.
We could calculate active users in a 10 day or 25 day window just as efficiently
as a 7 day (WAU) or 28 day (MAU) window. But we can also define completely new
metrics based on these usage histories, such as various retention definitions.</p>
<p>The usage history is encoded as a &quot;bit pattern&quot; where the physical
type of the field is a BigQuery INT64, but logically the integer
represents an array of bits, with each 1 indicating a day on which the given client
was active and each 0 indicating a day on which the client was inactive. This
article discusses the details of how we represent usage in bit patterns,
how to extract standard usage and retention metrics,
and how to build new metrics from them.</p>
<h2 id="table-of-contents-6"><a class="header" href="#table-of-contents-6">Table of Contents</a></h2>
<ul>
<li><a href="cookbooks/clients_last_seen_bits.html#calculating-dau-wau-and-mau">Calculating DAU, WAU, and MAU</a></li>
<li><a href="cookbooks/clients_last_seen_bits.html#calculating-retention">Calculating retention</a></li>
<li><a href="cookbooks/clients_last_seen_bits.html#understanding-bit-patterns">Understanding bit patterns</a>
<ul>
<li><a href="cookbooks/clients_last_seen_bits.html#why-28-bits-instead-of-64">Why 28 bits instead of 64?</a></li>
<li><a href="cookbooks/clients_last_seen_bits.html#forward-looking-windows-and-backward-looking-windows">Forward-looking windows and backward-looking windows</a></li>
</ul>
</li>
<li><a href="cookbooks/clients_last_seen_bits.html#usage-backward-looking-windows">Usage: Backward-looking windows</a>
<ul>
<li><a href="cookbooks/clients_last_seen_bits.html#how-windows-shift-from-day-to-day">How windows shift from day to day</a></li>
</ul>
</li>
<li><a href="cookbooks/clients_last_seen_bits.html#retention-forward-looking-windows">Retention: Forward-looking windows</a>
<ul>
<li><a href="cookbooks/clients_last_seen_bits.html#n-day-retention">N-day Retention</a></li>
<li><a href="cookbooks/clients_last_seen_bits.html#retention-using-activity-date">Retention using activity date</a></li>
</ul>
</li>
<li><a href="cookbooks/clients_last_seen_bits.html#proposing-a-new-bit-pattern-field">Proposing a new bit pattern field</a></li>
<li><a href="cookbooks/clients_last_seen_bits.html#udf-reference">UDF Reference</a>
<ul>
<li><a href="cookbooks/clients_last_seen_bits.html#bits28to_string"><code>bits28.to_string</code></a></li>
<li><a href="cookbooks/clients_last_seen_bits.html#bits28from_string"><code>bits28.from_string</code></a></li>
<li><a href="cookbooks/clients_last_seen_bits.html#bits28to_dates"><code>bits28.to_dates</code></a></li>
<li><a href="cookbooks/clients_last_seen_bits.html#bits28days_since_seen"><code>bits28.days_since_seen</code></a></li>
<li><a href="cookbooks/clients_last_seen_bits.html#bits28range"><code>bits28.range</code></a></li>
<li><a href="cookbooks/clients_last_seen_bits.html#bits28active_in_range"><code>bits28.active_in_range</code></a></li>
<li><a href="cookbooks/clients_last_seen_bits.html#bits28retention"><code>bits28.retention</code></a></li>
</ul>
</li>
</ul>
<h2 id="calculating-dau-wau-and-mau"><a class="header" href="#calculating-dau-wau-and-mau">Calculating DAU, WAU, and MAU</a></h2>
<p>The simplest application of usage bit patterns is for calculating metrics in
backward-looking windows. This is what we do for our canonical <em>usage</em> measures
DAU, WAU, and MAU.</p>
<p>To decide whether a given client should count towards DAU, WAU, and MAU, we
need to know how recently that client was active. If the client was seen
in the past 28 days, they count toward MAU. If they were active in the past 7
days, they count toward WAU. And only if they were active today do they count
toward DAU.</p>
<p>The user-facing <code>clients_last_seen</code> views present fields like <code>days_since_seen</code>
that extract this information for us from the underlying <code>days_seen_bits</code> field,
allowing us to easily express DAU, WAU, and MAU aggregates like:</p>
<pre><code class="language-sql">SELECT
submission_date,
COUNTIF(days_since_seen &lt; 28) AS mau,
COUNTIF(days_since_seen &lt; 7) AS wau,
COUNTIF(days_since_seen &lt; 1) AS dau
FROM
telemetry.clients_last_seen
WHERE
submission_date = '2020-01-28'
GROUP BY
submission_date
ORDER BY
submission_date
</code></pre>
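<p>As noted in the introduction, other window lengths work exactly the same way; for example, a
hypothetical 10-day active-user count (not an official metric, shown here only as a sketch) is just
another threshold on <code>days_since_seen</code>:</p>
<pre><code class="language-sql">SELECT
  submission_date,
  COUNTIF(days_since_seen &lt; 10) AS active_10_day
FROM
  telemetry.clients_last_seen
WHERE
  submission_date = '2020-01-28'
GROUP BY
  submission_date
</code></pre>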
<p>Under the hood, <code>days_since_seen</code> is calculated using the <code>bits28.days_since_seen</code>
UDF which is explained in more detail later in this article.</p>
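<p>For illustration, an equivalent (if more verbose) sketch applies that UDF directly to the
underlying <code>days_seen_bits</code> field, which is also exposed in the view:</p>
<pre><code class="language-sql">SELECT
  -- Same calculation as COUNTIF(days_since_seen &lt; 28), spelled out via the UDF.
  COUNTIF(mozfun.bits28.days_since_seen(days_seen_bits) &lt; 28) AS mau
FROM
  telemetry.clients_last_seen
WHERE
  submission_date = '2020-01-28'
</code></pre>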
<p>Note that the desktop <code>clients_last_seen</code> table also has additional bit pattern
fields corresponding to other <a href="cookbooks/../metrics/index.html">usage criteria</a>,
so other variants on MAU can be calculated like:</p>
<pre><code class="language-sql">SELECT
submission_date,
COUNTIF(days_since_visited_5_uri &lt; 28) AS visited_5_uri_mau,
COUNTIF(days_since_opened_dev_tools &lt; 28) AS opened_dev_tools_mau
FROM
telemetry.clients_last_seen
WHERE
submission_date = '2020-01-28'
GROUP BY
submission_date
ORDER BY
submission_date
</code></pre>
<p>Adding a new usage criterion is possible, but requires some work especially
if a historical backfill is necessary, so
<a href="cookbooks/../concepts/reporting_a_problem.html">file a bug</a> to begin discussions on
new usage criteria.</p>
<p>Also note that non-desktop products also have derived tables following the
<code>clients_last_seen</code> methodology. Per-product MAU could be calculated as:</p>
<pre><code class="language-sql">SELECT
submission_date,
app_name,
COUNTIF(days_since_seen &lt; 28) AS mau,
COUNTIF(days_since_seen &lt; 7) AS wau,
COUNTIF(days_since_seen &lt; 1) AS dau
FROM
telemetry.nondesktop_clients_last_seen
WHERE
submission_date = '2020-01-28'
GROUP BY
submission_date, app_name
ORDER BY
submission_date, app_name
</code></pre>
<h2 id="calculating-retention"><a class="header" href="#calculating-retention">Calculating retention</a></h2>
<p>For retention calculations, we use forward-looking windows. This means that
when we report a retention value for 2020-01-01, we're talking about what
portion of clients active on 2020-01-01 are still active some number of days
later.</p>
<p>In particular, let's consider the &quot;1-Week Retention&quot; measure shown in <a href="https://gud.telemetry.mozilla.org/">GUD</a>
which considers a window of 14 days.
For each client active in &quot;week 0&quot; (days 0 through 6), we determine retention by
checking if they were also active in &quot;week 1&quot; (days 7 through 13).</p>
<p>We provide a UDF called <code>bits28.retention</code> that returns a rich STRUCT
type representing activity in various windows, with all the date and bit
offsets handled for you. You pass in a bit pattern and the corresponding <code>submission_date</code>,
and it returns fields like:</p>
<ul>
<li><code>day_13.metric_date</code></li>
<li><code>day_13.active_in_week_0</code></li>
<li><code>day_13.active_in_week_1</code></li>
</ul>
<p>Calculating GUD's retention aggregates and some other variants looks like:</p>
<pre><code class="language-sql">-- The struct returned by bits28.retention is nested.
-- The first level of nesting defines the beginning of our window;
-- in our case, we want day_13 to get retention in a 2-week window.
-- This base query uses day_13.* to make all the day_13 fields available:
-- - metric_date
-- - active_in_week_0
-- - active_in_week_1
-- - ...
--
WITH base AS (
SELECT
*,
mozfun.bits28.retention(
days_seen_bits, submission_date
).day_13.*,
mozfun.bits28.retention(
days_created_profile_bits, submission_date
).day_13.active_on_metric_date AS is_new_profile
FROM
telemetry.clients_last_seen )
SELECT
metric_date, -- 2020-01-15 (13 days earlier than submission_date)
-- 1-Week Retention matching GUD.
SAFE_DIVIDE(
COUNTIF(active_in_week_0 AND active_in_week_1),
COUNTIF(active_in_week_0)
) AS retention_1_week,
-- 1-Week New Profile Retention matching GUD.
SAFE_DIVIDE(
COUNTIF(is_new_profile AND active_in_week_1),
COUNTIF(is_new_profile)
) AS retention_1_week_new_profile,
-- NOT AN OFFICIAL METRIC
-- A more restrictive 1-Week Retention definition that considers only clients
-- active on day 0 rather than clients active on any day in week 0.
SAFE_DIVIDE(
COUNTIF(active_on_metric_date AND active_in_week_1),
COUNTIF(active_on_metric_date)
) AS retention_1_week_active_on_day_0,
-- NOT AN OFFICIAL METRIC
-- A more restrictive 0-and-1-Week Retention definition where again the denominator
-- is restricted to clients active on day 0 and the client must be active both in
-- week 0 after the metric date and in week 1.
SAFE_DIVIDE(
COUNTIF(active_on_metric_date AND active_in_week_0_after_metric_date AND active_in_week_1),
COUNTIF(active_on_metric_date)
) AS retention_0_and_1_week_active_on_day_0,
FROM
base
WHERE
submission_date = '2020-01-28'
GROUP BY
metric_date
</code></pre>
<p>Notice that in each retention definition, the numerator always contains the exact
same condition as the denominator plus additional constraints (<code>AND ...</code>).
Without that discipline, it is very easy to accidentally define a retention metric
that is logically inconsistent and can rise above 1.</p>
<p>Under the hood, <code>bits28.retention</code> is using a series of calls to the lower-level
<code>bits28.range</code> function, which is explained later in this article.
<code>bits28.range</code> is very powerful and can be used to construct novel metrics,
but it also introduces many opportunities for off-by-one errors and passing parameters
in incorrect order, so please fully read through this documentation before
attempting to use the lower-level functions.</p>
<h2 id="understanding-bit-patterns"><a class="header" href="#understanding-bit-patterns">Understanding bit patterns</a></h2>
<p>If you look at the <code>days_seen_bits</code> field in <code>telemetry.clients_last_seen</code>,
you'll see seemingly random whole numbers, some as large as nine digits.
How should we interpret these?</p>
<p>For very small numbers, it may be possible to interpret the value by eye.
A value of <code>1</code> means the client was active on <code>submission_date</code> only
and wasn't seen in any of the 27 days previous. A value of <code>2</code> means
the client was seen 1 day ago, but not on <code>submission_date</code>. A value of <code>3</code>
means that the client was seen on <code>submission_date</code> <em>and</em> the day previous.</p>
<p>It's much easier to reason about these bit patterns, however, when we view them
as strings of ones and zeros. We've provided a UDF to convert these
values to &quot;bit strings&quot;:</p>
<pre><code class="language-sql">SELECT
[ mozfun.bits28.to_string(1),
mozfun.bits28.to_string(2),
mozfun.bits28.to_string(3) ]
&gt;&gt;&gt; ['0000000000000000000000000001',
'0000000000000000000000000010',
'0000000000000000000000000011']
</code></pre>
<p>A value of <code>3</code> is equal to <code>2^1 + 2^0</code> and indeed we see that reading from
right to left in the string of bits, the &quot;lowest&quot; two bits are set (1) while
the rest of the bits are unset (0).</p>
<p>Let's consider a larger value <code>8256</code>. In terms of powers of two, this is equal
to <code>2^13 + 2^6</code> and its string representation should have two <code>1</code> values.
If we label the rightmost bit as &quot;offset 0&quot;, we would expect the set
bits to be at offsets <code>-13</code> and <code>-6</code>:</p>
<pre><code class="language-sql">SELECT mozfun.bits28.to_string(8256)
&gt;&gt;&gt; '0000000000000010000001000000'
</code></pre>
<p>We also provide the inverse of this function to take a string representation
of a bit pattern and return the associated integer:</p>
<pre><code class="language-sql">SELECT mozfun.bits28.from_string('0000000000000010000001000000')
&gt;&gt;&gt; 8256
</code></pre>
<p>Note that the leading zeros are optional for this function:</p>
<pre><code class="language-sql">SELECT mozfun.bits28.from_string('10000001000000')
&gt;&gt;&gt; 8256
</code></pre>
<p>Finally, we can translate this into an array of concrete dates by passing
a value for the date that corresponds to the rightmost bit:</p>
<pre><code class="language-sql">SELECT mozfun.bits28.to_dates(8256, '2020-01-28')
&gt;&gt;&gt; ['2020-01-15', '2020-01-22']
</code></pre>
<h3 id="why-28-bits-instead-of-64"><a class="header" href="#why-28-bits-instead-of-64">Why 28 bits instead of 64?</a></h3>
<p>BigQuery has only one integer type (<code>INT64</code>) which is composed of 64 bits,
so we could technically store 64 days of history per bit pattern. Limiting
to 28 bits is a practical compromise driven by storage costs and reprocessing concerns.</p>
<p>Consider a client that is active on a single day and then never shows up again.
A client that becomes inactive will eventually fall outside the 28-day usage
window, at which point it no longer has a row in subsequent days of <code>clients_last_seen</code>
and we stop duplicating that client's data.</p>
<p>Also, tables following the <code>clients_last_seen</code> methodology have to be populated
incrementally. For each new day of data, we have to reference the previous
day's rows in <code>clients_last_seen</code>, take the trailing 27 bits of each pattern
and append a 0 or 1 to represent whether the client was active in the new
day.</p>
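<p>As a minimal sketch of that shift-and-append step (purely illustrative; the production ETL is more
involved), the bitwise arithmetic looks like this:</p>
<pre><code class="language-sql">-- Yesterday's pattern had the client active on submission_date only (value 1).
-- Shift the pattern one position to the left, set the rightmost bit if the client
-- was active on the new day, and mask the result back down to 28 bits.
SELECT
  mozfun.bits28.to_string(
    ((previous_bits &lt;&lt; 1) | IF(active_today, 1, 0)) &amp; ((1 &lt;&lt; 28) - 1)
  ) AS updated_bits
FROM
  UNNEST([STRUCT(1 AS previous_bits, FALSE AS active_today)])
&gt;&gt;&gt; '0000000000000000000000000010'
</code></pre>
<p>The result is the value <code>2</code> described earlier: seen one day ago, but not on the new <code>submission_date</code>.</p>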
<p>Now, suppose we find that there was a processing error 10 days ago that affected
a table upstream of <code>clients_last_seen</code>. If we fix that error, we now have to
recompute each day of <code>clients_last_seen</code> from 10 days ago all the way to the
present.</p>
<p>We chose to encode only 28 days of history in these bit patterns as a compromise
that gives just enough history to calculate MAU on a rolling basis but otherwise
limits the amount of data that needs to be reprocessed to recover from errors.</p>
<h3 id="forward-looking-windows-and-backward-looking-windows"><a class="header" href="#forward-looking-windows-and-backward-looking-windows">Forward-looking windows and backward-looking windows</a></h3>
<p>Bit patterns can be used to calculate a variety of windowed metrics,
but there are a number of ways we can choose to interpret a bit pattern
and define windows within it. In particular, we can choose to read a bit
pattern from right to left, looking <em>backwards</em> from the most recent day.
Or we can choose to read a bit pattern from left to right, looking
<em>forwards</em> from some chosen reference point.</p>
<p>MAU and WAU use <em>backward-looking windows</em> where the value for 2020-01-28
depends on activity from 2020-01-01 to 2020-01-28. You can calculate DAU,
WAU, and MAU for 2020-01-28 as soon as data for that target date has been
processed. In other words, the <em>metric date</em> for usage metrics corresponds
directly to the <code>submission_date</code> in <code>clients_last_seen</code>.</p>
<p>Retention metrics, however, use <em>forward-looking windows</em> where the value for
2020-01-28 depends on activity happening on and <em>after</em> that date.
Be prepared for this to twist your mind a bit. What we call &quot;1-Week Retention&quot;
depends on activity in a 2-week window. If we want to calculate a 1-week
retention value for 2020-01-01, we need to consider activity from 2020-01-01
through 2020-01-14, so we cannot know the retention value for a given day
until we've fully processed data 13 days later. In other words, the <em>metric date</em>
for 1-week retention is always 13 days earlier than the <code>submission_date</code>
on which it can be calculated.</p>
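<p>A quick sanity check of that offset arithmetic, using the dates from this example:</p>
<pre><code class="language-sql">SELECT
  DATE_SUB(DATE '2020-01-28', INTERVAL 13 DAY) AS metric_date,    -- 2020-01-15
  DATE_ADD(DATE '2020-01-01', INTERVAL 13 DAY) AS computable_on   -- 2020-01-14
</code></pre>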
<p>Using forward-looking windows initially seems awkward, but it turns out
to be necessary for consistency in how we define various retention metrics.
Consider if we wanted to compare 1-week, 2-week, and 3-week retention metrics on a
single plot. If we use forward-looking windows, then the point labeled 2020-01-01
describes the same set of users for all three metrics and how their activity
differs over time. If we use backwards-looking windows, then each of these three
metrics is considering a separate population of users.
We'll discuss this in more detail later.</p>
<h2 id="usage-backward-looking-windows"><a class="header" href="#usage-backward-looking-windows">Usage: Backward-looking windows</a></h2>
<p>The simplest application of usage bit patterns is for calculating metrics in
backward-looking windows. This is what we do for our canonical <em>usage</em> measures
DAU, WAU, and MAU.</p>
<p>Let's imagine a single row from <code>clients_last_seen</code> with <code>submission_date = 2020-01-28</code>.
The <code>days_seen_bits</code> field records usage for a single client over a period of 28 days
<em>ending</em> on the given <code>submission_date</code>. We will call this metric date &quot;day 0&quot;
(2020-01-28 in this case) and count backwards to &quot;day -27&quot; (2020-01-01).</p>
<p>Let's suppose this client was only active on two days in the past month:
2020-01-22 (day -6) and 2020-01-15 (day -13). That client's <code>days_seen_bits</code>
value would show up as <code>8256</code>, which as we saw in the previous section can
be represented as bit string <code>'0000000000000010000001000000'</code>.</p>
<p>Let's dive more deeply into that bit string representation:</p>
<pre><code> 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0
──────────────────────────────────────────────────────────────────────────────────
│ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │
│-26 │-24 │-22 │-20 │-18 │-16 │-14 │-12 │-10 │ -8 │ -6 │ -4 │ -2 │ 0
-27 -25 -23 -21 -19 -17 -15 -13 -11 -9 -7 -5 -3 -1
└──────────────────────────────────────────────────────────────────────────────────┘
MAU └──────────────────┘
WAU └─┘
DAU
</code></pre>
<p>In this picture, we've annotated the windows for DAU (day 0 only),
WAU (days 0 through -6) and MAU (days 0 through -27). This particular client
won't count toward DAU for 2020-01-28, but the client does count towards both
WAU and MAU.</p>
<p>Note that for each of these usage metrics, the <em>number</em> of active days
does not matter but only the <em>recency</em> of the latest active day.
We provide a special function to tell us how many days have elapsed since
the most recent activity encoded in a bit pattern:</p>
<pre><code class="language-sql">SELECT mozfun.bits28.days_since_seen(mozfun.bits28.from_string('10000001000000'))
&gt;&gt;&gt; 6
</code></pre>
<p>Indeed, this is so commonly used that we build this function into user-facing
views, so that instead of referencing <code>days_seen_bits</code> with a UDF, you can
simply reference a field called <code>days_since_seen</code>. Counting MAU, WAU, and
DAU generally looks like:</p>
<pre><code class="language-sql">SELECT
COUNTIF(days_since_seen &lt; 28) AS mau,
COUNTIF(days_since_seen &lt; 7) AS wau,
COUNTIF(days_since_seen &lt; 1) AS dau
FROM
telemetry.clients_last_seen
</code></pre>
<h3 id="how-windows-shift-from-day-to-day"><a class="header" href="#how-windows-shift-from-day-to-day">How windows shift from day to day</a></h3>
<p>Note that this particular client is about to fall outside the WAU window.
If the client doesn't send a main ping on 2020-01-29, the new <code>days_seen_bits</code>
pattern for this client will look like:</p>
<pre><code> 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0
──────────────────────────────────────────────────────────────────────────────────
│ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │
│-26 │-24 │-22 │-20 │-18 │-16 │-14 │-12 │-10 │ -8 │ -6 │ -4 │ -2 │ 0
-27 -25 -23 -21 -19 -17 -15 -13 -11 -9 -7 -5 -3 -1
└──────────────────────────────────────────────────────────────────────────────────┘
MAU └──────────────────┘
WAU └─┘
DAU
</code></pre>
<p>The entire pattern has simply shifted one offset to the left, with the leading zero
falling off (since it's now outside the 28-day range) and a trailing zero added
on the right (this would be a <code>1</code> instead if the user had been active on 2020-01-29).</p>
<p>The <code>days_since_seen</code> value is now <code>7</code>, which is outside the WAU window:</p>
<pre><code class="language-sql">SELECT mozfun.bits28.days_since_seen(mozfun.bits28.from_string('100000010000000'))
&gt;&gt;&gt; 7
</code></pre>
<h2 id="retention-forward-looking-windows"><a class="header" href="#retention-forward-looking-windows">Retention: Forward-looking windows</a></h2>
<p>For retention calculations, we use forward-looking windows. This means that
when we report a retention value for 2020-01-01, we're talking about what
portion of clients active on 2020-01-01 are still active some number of days
later.</p>
<p>When we were talking about backward-looking windows, our metric date or &quot;day 0&quot;
was always the most recent day, corresponding to the rightmost bit.
When we define forward-looking windows, however, we always choose a metric date
some time in the past. How we number the individual bits depends on what
metric date we choose.</p>
<p>For example, in <a href="https://gud.telemetry.mozilla.org/">GUD</a>, we show a &quot;1-Week Retention&quot; which considers a window of 14 days.
For each client active in &quot;week 0&quot; (days 0 through 6), we determine retention by
checking if they were also active in &quot;week 1&quot; (days 7 through 13).</p>
<p>To make &quot;1-Week Retention&quot; more concrete,
let's consider the same client as before, grabbing the <code>days_seen_bits</code> value from
<code>clients_last_seen</code> with <code>submission_date = 2020-01-28</code>. We count back 13 bits in
the array to define our new &quot;day 0&quot; which corresponds to 2020-01-15:</p>
<pre><code> 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0
──────────────────────────────────────────────────────────────────────────────────
│ │ │ │ │ │ │ │ │ │ │ │ │ │
│ 1 │ 3 │ 5 │ 7 │ 9 │ 11 │ 13
0 2 4 6 8 10 12
└────────────────────┘
Week 0 └───────────────────┘
Week 1
</code></pre>
<p>This client has a bit set in both week 0 and in week 1, so logically this client
can be considered retained; they should be counted in both the denominator and
in the numerator for the &quot;1-Week Retention&quot; value on 2020-01-15.</p>
<p>Also note there is some nuance in retention metrics as to what counts as &quot;week 0&quot;
because sometimes we want to measure a user as active in week 0 excluding the metric
date (&quot;day 0&quot;) itself. The client shown above would not count as &quot;active in week 0 after metric date&quot;:</p>
<pre><code> 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0
──────────────────────────────────────────────────────────────────────────────────
│ │ │ │ │ │ │ │ │ │ │ │ │ │
│ 1 │ 3 │ 5 │ 7 │ 9 │ 11 │ 13
0 2 4 6 8 10 12
Metric Date └──┘
Week 0 After Metric Date └─────────────────┘
Week 0 └────────────────────┘
</code></pre>
<p>But how can we extract this per-week usage information in a query?</p>
<p>Extracting the bits for a specific week can be achieved via UDF:</p>
<pre><code class="language-sql">SELECT
  -- Signature is bits28.range(bits, start_offset, n_bits)
mozfun.bits28.range(days_seen_bits, -13 + 0, 7) AS week_0_bits,
mozfun.bits28.range(days_seen_bits, -13 + 7, 7) AS week_1_bits
FROM
telemetry.clients_last_seen
</code></pre>
<p>And then we can turn those bits into a boolean indicating whether the client
was active or not as:</p>
<pre><code class="language-sql">SELECT
  BIT_COUNT(mozfun.bits28.range(days_seen_bits, -13 + 0, 7)) &gt; 0 AS active_in_week_0,
  BIT_COUNT(mozfun.bits28.range(days_seen_bits, -13 + 7, 7)) &gt; 0 AS active_in_week_1
FROM
telemetry.clients_last_seen
</code></pre>
<p>This pattern of checking whether any bit is set within a given range is common
enough that we provide short-hand for it in <code>bits28.active_in_range</code>.
The above query can be made a bit cleaner as:</p>
<pre><code class="language-sql">SELECT
  mozfun.bits28.active_in_range(days_seen_bits, -13 + 0, 7) AS active_in_week_0,
  mozfun.bits28.active_in_range(days_seen_bits, -13 + 7, 7) AS active_in_week_1
FROM
telemetry.clients_last_seen
</code></pre>
<p>In terms of the higher-level <code>bits28.retention</code> function discussed earlier,
here's how this client looks:</p>
<pre><code class="language-sql">SELECT
submission_date,
mozfun.bits28.retention(days_seen_bits, submission_date).day_13.*
FROM
telemetry.clients_last_seen
/*
submission_date = 2020-01-28
metric_date = 2020-01-15
active_on_metric_date = true
active_in_week_0 = true
active_in_week_0_after_metric_date = false
active_in_week_1 = true
*/
</code></pre>
<h3 id="n-day-retention"><a class="header" href="#n-day-retention">N-day Retention</a></h3>
<p><em>Not an official metric. This section is intended solely as an example of advanced usage.</em></p>
<p>As an example of a novel metric that can be defined using the low-level
bit pattern UDFs, let's define <code>n</code>-day retention as the fraction of clients active on a given day
who are also active within the next <code>n</code> days. For example, 3-day retention would
have a denominator of all clients active on day 0 and a numerator of all clients
active on day 0 who were also active on days 1 or 2.</p>
<p>To calculate <code>n</code>-day retention, we need to use the lower-level <code>bits28.active_in_range</code>
function:</p>
<pre><code class="language-sql">DECLARE n INT64;
SET n = 3;
WITH base AS (
SELECT
*,
mozfun.bits28.active_in_range(days_seen_bits, -n + 1, 1) AS seen_on_day_0,
mozfun.bits28.active_in_range(days_seen_bits, -n + 2, n - 1) AS seen_after_day_0
FROM
telemetry.clients_last_seen )
SELECT
  DATE_SUB(submission_date, INTERVAL n - 1 DAY) AS metric_date,
-- NOT AN OFFICIAL METRIC
SAFE_DIVIDE(
COUNTIF(seen_on_day_0 AND seen_after_day_0),
COUNTIF(seen_on_day_0)
) AS retention_n_day
FROM
base
WHERE
submission_date = '2020-01-28'
GROUP BY
metric_date
</code></pre>
<h3 id="retention-using-activity-date"><a class="header" href="#retention-using-activity-date">Retention using activity date</a></h3>
<p><em>Not an official metric. This section is intended solely as an example of advanced usage.</em></p>
<p>GUD's canonical retention definitions are all based on ping submission dates rather
than logical activity dates taken from client-provided timestamps, but there is
interest in using client timestamps particularly for <code>n</code>-day retention calculations
for mobile products.</p>
<p>Let's consider Firefox Preview which sends telemetry via Glean. The
<code>org_mozilla_fenix.baseline_clients_last_seen</code> table includes two bit patterns
that encode client timestamps: <code>days_seen_session_start_bits</code> and
<code>days_seen_session_end_bits</code>. This table is still populated once per day based
on pings received over the previous day, but some of those pings will reflect
sessions that started on previous days. This introduces some new complexity
into retention calculations because we'll always be underestimating client
counts if we have our retention window end on <code>submission_date</code>.</p>
<p>When using activity date, it may be desirable to build in a few days of buffer
to ensure we are considering late-arriving pings. For example, if we wanted
to calculate 3-day retention but allow 2 days of cushion for late-arriving
pings, we would need to use an offset of 5 days from <code>submission_date</code>:</p>
<pre><code class="language-sql">DECLARE n, cushion_days, offset_to_day_0 INT64;
SET n = 3;
SET cushion_days = 2;
SET offset_to_day_0 = 1 - n - cushion_days;
WITH base AS (
SELECT
*,
mozfun.bits28.active_in_range(days_seen_session_start_bits, offset_to_day_0, 1) AS seen_on_day_0,
mozfun.bits28.active_in_range(days_seen_session_start_bits, offset_to_day_0 + 1, n - 1) AS seen_after_day_0
FROM
org_mozilla_fenix.baseline_clients_last_seen )
SELECT
  DATE_ADD(submission_date, INTERVAL offset_to_day_0 DAY) AS metric_date,
-- NOT AN OFFICIAL METRIC
SAFE_DIVIDE(
COUNTIF(seen_on_day_0 AND seen_after_day_0),
COUNTIF(seen_on_day_0)
) AS retention_n_day
FROM
base
WHERE
submission_date = '2020-01-28'
GROUP BY
metric_date
</code></pre>
<h2 id="proposing-a-new-bit-pattern-field"><a class="header" href="#proposing-a-new-bit-pattern-field">Proposing a new bit pattern field</a></h2>
<p>The operational logic required to produce a <code>clients_last_seen</code> table makes
it unwieldy for backfilling, so it has historically been difficult for data
scientists to experiment with new bit pattern fields on their own.</p>
<p>Below are sample queries for producing a small <code>clients_last_seen</code>-like table
that presents an experimental usage definition. In this approach, the temporary
analysis table we create actually stores a client's whole usage history as a
BYTES field, and then we rely on view logic to present this as per-day windows.
Much of the logic is boilerplate; the sections that would need to change for
your specific new field are marked between <code>-- BEGIN</code> and <code>-- END</code> comments.</p>
<p>The first example defines a new feature based on a measure that already exists
in <code>clients_daily</code>. It takes only a few minutes to run:</p>
<pre><code class="language-sql">DECLARE start_date DATE DEFAULT '2020-05-01';
DECLARE end_date DATE DEFAULT '2020-11-01';
DECLARE target_sample_id INT64 DEFAULT 0;
CREATE TEMP FUNCTION process_bits(bits BYTES) AS (
STRUCT(
bits,
-- An INT64 version of the bits, compatible with bits28 functions
CAST(CONCAT('0x', TO_HEX(RIGHT(bits, 4))) AS INT64) &lt;&lt; 36 &gt;&gt; 36 AS bits28,
-- An INT64 version of the bits with 64 days of history
CAST(CONCAT('0x', TO_HEX(RIGHT(bits, 4))) AS INT64) AS bits64,
-- A field like days_since_seen from clients_last_seen.
udf.bits_to_days_since_seen(bits) AS days_since_active,
-- Days since first active, analogous to first_seen_date in clients_first_seen
udf.bits_to_days_since_first_seen(bits) AS days_since_first_active
)
);
CREATE OR REPLACE TABLE
analysis.&lt;myuser&gt;_newfeature
PARTITION BY submission_date
CLUSTER BY sample_id
AS
WITH
alltime AS (
SELECT
sample_id,
client_id,
-- BEGIN
-- Here we produce bit pattern fields based on the daily aggregates from the
-- previous step;
udf.bits_from_offsets(
ARRAY_AGG(
IF(active_hours_sum &gt;= 1,DATE_DIFF(end_date, submission_date, DAY), NULL)
IGNORE NULLS
)
) AS days_active_bits,
-- END
FROM
telemetry.clients_daily
WHERE
sample_id = target_sample_id
AND submission_date BETWEEN start_date AND end_date
GROUP BY
sample_id,
client_id
)
SELECT
end_date - i AS submission_date,
sample_id,
client_id,
process_bits(days_active_bits &gt;&gt; i) AS days_active
FROM
alltime
-- The cross join parses each input row into one row per day since the client
-- was first seen, emulating the format of the existing clients_last_seen table.
CROSS JOIN
UNNEST(GENERATE_ARRAY(0, DATE_DIFF(end_date, start_date, DAY))) AS i
WHERE
(days_active_bits &gt;&gt; i) IS NOT NULL
</code></pre>
<p>And here is a more complex example that references <code>main_v4</code> directly:</p>
<details>
<summary>Calculating a bit pattern field directly from `main_v4`</summary>
<pre><code class="language-sql">DECLARE start_date DATE DEFAULT '2020-05-01';
DECLARE end_date DATE DEFAULT '2020-11-01';
DECLARE target_sample_id INT64 DEFAULT 0;
CREATE TEMP FUNCTION process_bits(bits BYTES) AS (
STRUCT(
bits,
-- An INT64 version of the bits, compatible with bits28 functions
CAST(CONCAT('0x', TO_HEX(RIGHT(bits, 4))) AS INT64) &lt;&lt; 36 &gt;&gt; 36 AS bits28,
-- An INT64 version of the bits with 64 days of history
CAST(CONCAT('0x', TO_HEX(RIGHT(bits, 4))) AS INT64) AS bits64,
-- A field like days_since_seen from clients_last_seen.
udf.bits_to_days_since_seen(bits) AS days_since_active,
-- Days since first active, analogous to first_seen_date in clients_first_seen
udf.bits_to_days_since_first_seen(bits) AS days_since_first_active
)
);
CREATE OR REPLACE TABLE
analysis.&lt;myuser&gt;_newfeature
PARTITION BY submission_date
CLUSTER BY sample_id
AS
WITH
-- If clients_daily already contains a measure that suffices as the basis for
-- our new usage definition, we can skip this daily subquery and calculate
-- alltime based on clients_daily rather than `main`.
daily AS (
SELECT
DATE(submission_timestamp) AS submission_date,
sample_id,
client_id,
-- BEGIN
-- Here is where we put clients_daily-like aggregations that will be
-- used as the basis for bit patterns in the next step.
SUM(payload.processes.parent.scalars.browser_engagement_active_ticks)
AS active_ticks_sum,
-- END
FROM
telemetry.main
WHERE
sample_id = target_sample_id
AND DATE(submission_timestamp) BETWEEN start_date AND end_date
GROUP BY
submission_date,
sample_id,
client_id ),
alltime AS (
SELECT
sample_id,
client_id,
-- BEGIN
-- Here we produce bit pattern fields based on the daily aggregates from the
-- previous step;
udf.bits_from_offsets(
ARRAY_AGG(
IF(active_ticks_sum &gt;= 8,DATE_DIFF(end_date, submission_date, DAY), NULL)
IGNORE NULLS
)
) AS days_active_bits,
-- END
FROM
daily
GROUP BY
sample_id,
client_id
)
SELECT
end_date - i AS submission_date,
sample_id,
client_id,
process_bits(days_active_bits &gt;&gt; i) AS days_active
FROM
alltime
-- The cross join parses each input row into one row per day since the client
-- was first seen, emulating the format of the existing clients_last_seen table.
CROSS JOIN
UNNEST(GENERATE_ARRAY(0, DATE_DIFF(end_date, start_date, DAY))) AS i
WHERE
(days_active_bits &gt;&gt; i) IS NOT NULL
</code></pre>
</details>
<p>This script takes about 10 minutes to run over 6 months of data as written above
and about an hour to run over the whole history of <code>main_v4</code> (starting at 2018-11-01).
A query over the whole history of <code>clients_daily</code> (starting in early 2016)
can run in about an hour as well.
The resultant table can be used on its own
or joined with <code>clients_daily</code> to pull per-client dimensions.</p>
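<p>One possible sketch of such a join is shown below; the table name follows the <code>CREATE TABLE</code> statement
above, and note that an inner join only picks up dimensions for clients that also have a
<code>clients_daily</code> row (that is, sent a main ping) on that <code>submission_date</code>:</p>
<pre><code class="language-sql">SELECT
  cd.normalized_channel,
  -- Clients who used the experimental feature within the past 28 days,
  -- among clients present in clients_daily on this submission_date.
  COUNTIF(nf.days_active.days_since_active &lt; 28) AS used_feature_in_past_28_days,
  COUNT(*) AS clients
FROM
  analysis.&lt;myuser&gt;_newfeature AS nf
JOIN
  telemetry.clients_daily AS cd
USING
  (submission_date, sample_id, client_id)
WHERE
  submission_date = '2020-10-01'
GROUP BY
  cd.normalized_channel
</code></pre>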
<p>If the definition proves useful in validation, your variant of the query
above can serve as a good starting point for Data Engineering to integrate the new
definition into <code>clients_last_seen</code> (and <code>clients_daily</code> if necessary).
Once a new definition is integrated into the model, we can backfill two months of
data fairly easily. Complete backfills are expensive in terms of computational cost
and engineering effort, so they cannot happen more often than approximately quarterly.</p>
<h2 id="udf-reference"><a class="header" href="#udf-reference">UDF Reference</a></h2>
<h3 id="bits28to_string"><a class="header" href="#bits28to_string"><code>bits28.to_string</code></a></h3>
<p>Convert an INT64 field into a 28 character string representing the individual bits.</p>
<pre><code class="language-sql">bits28.to_string(bits INT64)
SELECT mozfun.bits28.to_string(18)
&gt;&gt; 0000000000000000000000010010
</code></pre>
<h3 id="bits28from_string"><a class="header" href="#bits28from_string"><code>bits28.from_string</code></a></h3>
<p>Convert a string representing individual bits into an INT64.</p>
<pre><code class="language-sql">bits28.from_string(bits STRING)
SELECT mozfun.bits28.from_string('10010')
&gt;&gt; 18
</code></pre>
<h3 id="bits28to_dates"><a class="header" href="#bits28to_dates"><code>bits28.to_dates</code></a></h3>
<p>Convert a bit pattern into an array of the dates it represents.</p>
<pre><code class="language-sql">bits28.to_dates(bits INT64, end_date DATE)
SELECT mozfun.bits28.to_dates(18, '2020-01-28')
&gt;&gt; ['2020-01-24', '2020-01-27']
</code></pre>
<h3 id="bits28days_since_seen"><a class="header" href="#bits28days_since_seen"><code>bits28.days_since_seen</code></a></h3>
<p>Return the position of the rightmost set bit in an INT64 bit pattern.</p>
<pre><code class="language-sql">bits28.days_since_seen(bits INT64)
SELECT mozfun.bits28.days_since_seen(18)
&gt;&gt; 1
</code></pre>
<h3 id="bits28range"><a class="header" href="#bits28range"><code>bits28.range</code></a></h3>
<p>Return an INT64 representing a range of bits from a source bit pattern.</p>
<p>The <code>start_offset</code> must be zero or a negative number indicating an offset from
the rightmost bit in the pattern.</p>
<p><code>n_bits</code> is the number of bits to consider, counting right from the bit at <code>start_offset</code>.</p>
<pre><code class="language-sql">bits28.range(bits INT64, offset INT64, n_bits INT64)
SELECT mozfun.bits28.to_string(mozfun.bits28.range(18, -5, 6))
&gt;&gt; '010010'
SELECT mozfun.bits28.to_string(mozfun.bits28.range(18, -5, 2))
&gt;&gt; '01'
SELECT mozfun.bits28.to_string(mozfun.bits28.range(18, -5 + 2, 4))
&gt;&gt; '0010'
</code></pre>
<h3 id="bits28active_in_range"><a class="header" href="#bits28active_in_range"><code>bits28.active_in_range</code></a></h3>
<p>Return a boolean indicating if any bits are set in the specified range of a bit pattern.</p>
<p>The <code>start_offset</code> must be zero or a negative number indicating an offset from
the rightmost bit in the pattern.</p>
<p><code>n_bits</code> is the number of bits to consider, counting right from the bit at <code>start_offset</code>.</p>
<pre><code class="language-sql">bits28.active_in_range(bits INT64, start_offset INT64, n_bits INT64)
</code></pre>
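<p>For example, reusing the value <code>18</code> (<code>'10010'</code>, with bits set at offsets -1 and -4) from the examples above:</p>
<pre><code class="language-sql">SELECT mozfun.bits28.active_in_range(18, -2, 3)
&gt;&gt; true
SELECT mozfun.bits28.active_in_range(18, -3, 1)
&gt;&gt; false
</code></pre>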
<h3 id="bits28retention"><a class="header" href="#bits28retention"><code>bits28.retention</code></a></h3>
<p>Return a nested struct providing numerator and denominator fields for
the standard 1-Week, 2-Week, and 3-Week retention definitions.</p>
<pre><code class="language-sql">bits28.retention(bits INT64, submission_date DATE)
</code></pre>
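<p>A minimal usage sketch, selecting one of the nested fields shown in the retention examples earlier
in this article:</p>
<pre><code class="language-sql">SELECT
  submission_date,
  mozfun.bits28.retention(days_seen_bits, submission_date).day_13.active_in_week_1
FROM
  telemetry.clients_last_seen
WHERE
  submission_date = '2020-01-28'
</code></pre>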
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/clients_last_seen_bits.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="visualizing-percentiles-of-a-main-ping-exponential-histogram"><a class="header" href="#visualizing-percentiles-of-a-main-ping-exponential-histogram">Visualizing Percentiles of a Main Ping Exponential Histogram</a></h1>
<p><a href="https://glam.telemetry.mozilla.org/">GLAM</a> is great if you want to check out the behaviour of a histogram over a large population across a curated set of dimensions, but what if you have a follow-up question that doesn't fit into its UI model? This tutorial will go into the guts of how to reproduce a GLAM-like view using <code>sql.telemetry.mozilla.org</code> (STMO), along with some suggestions on how to dig deeper.</p>
<p>This tutorial tries to build up an understanding and intuition of how things work on a low-level before it gets to its main act of reproducing GLAM. If you don't care about the details, you can probably skip the earlier sections in this document.</p>
<p>Assumptions:</p>
<ul>
<li>You have some idea of what a histogram is (if not, the <a href="https://en.wikipedia.org/wiki/Histogram">Wikipedia article</a> is a great place to start), have at least skimmed over <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/collection/histograms.html">the Firefox documentation on histograms</a></li>
<li>You have read <a href="cookbooks/../tools/stmo.html">the introduction to STMO</a>.</li>
<li>You understand the <a href="cookbooks/../datasets/main_ping_tables.html">main ping tables</a> (<code>telemetry.main_1pct</code> and <code>telemetry.main_nightly</code>).</li>
</ul>
<h2 id="table-of-contents-7"><a class="header" href="#table-of-contents-7">Table of Contents</a></h2>
<ul>
<li><a href="cookbooks/main_ping_exponential_histograms.html#setting-the-stage-tab-spinners-duration">Setting the stage: tab spinners duration</a></li>
<li><a href="cookbooks/main_ping_exponential_histograms.html#getting-client-level-data">Getting client-level data</a></li>
<li><a href="cookbooks/main_ping_exponential_histograms.html#getting-percentiles-from-a-set-of-histograms">Getting percentiles from a set of histograms</a></li>
<li><a href="cookbooks/main_ping_exponential_histograms.html#viewing-change-of-percentiles-over-time">Viewing change of percentiles over time</a></li>
<li><a href="cookbooks/main_ping_exponential_histograms.html#percentiles-from-client-normalized-histograms">Percentiles from client-normalized histograms</a></li>
<li><a href="cookbooks/main_ping_exponential_histograms.html#slicing-along-arbitrary-dimensions">Slicing along arbitrary dimensions</a></li>
</ul>
<h2 id="setting-the-stage-tab-spinners-duration"><a class="header" href="#setting-the-stage-tab-spinners-duration">Setting the stage: tab spinners duration</a></h2>
<p>For the purposes of this tutorial, let's look at a typical performance-oriented histogram: <a href="https://probes.telemetry.mozilla.org/?view=detail&amp;probeId=histogram%2FFX_TAB_SWITCH_SPINNER_VISIBLE_MS"><code>FX_TAB_SWITCH_SPINNER_VISIBLE_MS</code></a> which we use to count the number of times a tab spinner appears after a switch tab operation in Firefox, along with the duration of
its appearance in milliseconds (ms). This is an unwanted operation (especially if it's long), as it makes the browser appear unresponsive and <a href="https://support.mozilla.org/en-US/questions/1198062">confuses / disturbs users</a>.</p>
<p><code>FX_TAB_SWITCH_SPINNER_VISIBLE_MS</code> is what's called an <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/collection/histograms.html#exponential">exponential histogram</a>: the boundaries of its &quot;buckets&quot; increase exponentially, so it records small values with finer granularity than large ones. It's probably easier to visualize this using the histogram viewer than to describe it:</p>
<p><img src="cookbooks/../assets/exponential_histograms_tutorial/example_visualization_of_an_exponential_histogram.png" alt="example visualization of an exponential histogram" />
<a href="https://telemetry.mozilla.org/histogram-simulator/index.html#low=1&amp;high=1000&amp;n_buckets=20&amp;kind=exponential&amp;generate=normal">link</a></p>
<p>The visualization above shows how a <a href="https://en.wikipedia.org/wiki/Normal_distribution">normal distribution</a> would map into the buckets: you'll see that it skews towards the end. The point of the exponential histogram is to be sensitive to lower values (which one would expect to be more frequent, so long as the tab spinner doesn't come up too often!). Each &quot;tick&quot; represents the range of a bucket (in milliseconds): so we have a bucket representing values between <code>1ms</code> and <code>2ms</code>, <code>2ms</code> and <code>3ms</code>, and so on. You'll also note that this distribution caps out at 1000: any values greater than or equal to this will wind up in the final bucket, but we won't know their value with any precision. For tracking values higher than this, a separate histogram (<a href="https://probes.telemetry.mozilla.org/?view=detail&amp;probeId=histogram%2FFX_TAB_SWITCH_SPINNER_VISIBLE_LONG_MS"><code>FX_TAB_SWITCH_SPINNER_VISIBLE_LONG_MS</code></a>) was created.</p>
<h2 id="getting-client-level-data"><a class="header" href="#getting-client-level-data">Getting client-level data</a></h2>
<p>Before we move on, let's work through a quick example of getting some client-level data using a SQL query, to build some intuition for how this works at a low level. From there, we can build up to aggregating it in interesting ways.</p>
<p>As of this writing, each main ping histogram is encoded as a JSON string inside the <code>telemetry.main</code> table inside BigQuery.</p>
<pre><code class="language-sql">SELECT
payload.histograms.FX_TAB_SWITCH_SPINNER_VISIBLE_MS AS histogram_json,
FROM
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample across channels
WHERE
sample_id = 42
AND normalized_channel = 'nightly' -- Only technically necessary if using telemetry.main or telemetry.main_1pct (see above)
AND DATE(submission_timestamp) = '2020-04-20'
AND payload.histograms.FX_TAB_SWITCH_SPINNER_VISIBLE_MS IS NOT NULL
LIMIT
3
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/71333/source"><code>STMO#71333</code></a></p>
<p>Running this query on STMO, we get the following output:</p>
<div class="table-wrapper"><table><thead><tr><th><code>histogram_json</code></th></tr></thead><tbody>
<tr><td><code>{&quot;bucket_count&quot;:20,&quot;histogram_type&quot;:0,&quot;sum&quot;:118,&quot;range&quot;:[1,1000],&quot;values&quot;:{&quot;80&quot;:0,&quot;115&quot;:1,&quot;165&quot;:0}}</code></td></tr>
<tr><td><code>{&quot;bucket_count&quot;:20,&quot;histogram_type&quot;:0,&quot;sum&quot;:19145,&quot;range&quot;:[1,1000],&quot;values&quot;:{&quot;237&quot;:0,&quot;340&quot;:1,&quot;1000&quot;:1}}</code></td></tr>
<tr><td><code>{&quot;bucket_count&quot;:20,&quot;histogram_type&quot;:0,&quot;sum&quot;:1996,&quot;range&quot;:[1,1000],&quot;values&quot;:{&quot;698&quot;:0,&quot;1000&quot;:1}}</code></td></tr>
</tbody></table>
</div>
<p>In this representation, <code>bucket_count</code> and <code>range</code> represent the number of buckets and the range of possible values. <code>histogram_type</code> is an enumerated value that describes whether the histogram has linear, exponential, or categorical buckets; the values are <a href="https://searchfox.org/mozilla-central/rev/0c682c4f01442c3de0fa6cd286e9cadc8276b45f/toolkit/components/telemetry/core/nsITelemetry.idl#18-32">defined in the Firefox source code</a>.
<code>values</code> represents the number of instances in each of the buckets while <code>sum</code> represents the sum total of all histogram values recorded.
Note how the first row has buckets with no elements in them (the &quot;80&quot; and &quot;165&quot; buckets): this is because Firefox adds a zero-count bucket on the left and right edges of the data (unless that would be one of the extremes and that bucket already has a count in it, as is the case for the &quot;1000&quot; bucket in the last two examples).</p>
<p>In general, it is best not to rely on this representation of the histogram in production code (it is quite likely to change in the future). Instead, use the <a href="https://mozilla.github.io/bigquery-etl/mozfun/hist/#extract-udf"><code>mozfun.hist.extract</code></a> user-defined-function (UDF) and extract out the fields you need: for example, to just get the <code>sum</code> for all the histograms above, you could modify the query above to something like:</p>
<pre><code class="language-sql">WITH intermediate AS (
SELECT
mozfun.hist.extract(payload.histograms.FX_TAB_SWITCH_SPINNER_VISIBLE_MS) AS histogram,
FROM
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample across channels
WHERE
sample_id = 42
AND normalized_channel = 'nightly' -- Only technically necessary if using telemetry.main or telemetry.main_1pct (see above)
AND DATE(submission_timestamp) = '2020-04-20'
AND payload.histograms.FX_TAB_SWITCH_SPINNER_VISIBLE_MS IS NOT NULL
LIMIT
3
)
SELECT
histogram.sum,
histogram.bucket_count
FROM
intermediate;
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/71408/source"><code>STMO#71408</code></a></p>
<p>Which yields:</p>
<div class="table-wrapper"><table><thead><tr><th><code>sum</code></th><th><code>bucket_count</code></th></tr></thead><tbody>
<tr><td>118</td><td>20</td></tr>
<tr><td>19145</td><td>20</td></tr>
<tr><td>1996</td><td>20</td></tr>
</tbody></table>
</div>
<p>Note that these are the same values as in the JSON histograms above.</p>
<p>Obviously this by itself is not particularly useful or meaningful - generally we are interested in <em>aggregate</em> behaviour across a larger set of clients. Let's look at how we might get that.</p>
<h2 id="getting-percentiles-from-a-set-of-histograms"><a class="header" href="#getting-percentiles-from-a-set-of-histograms">Getting percentiles from a set of histograms</a></h2>
<p>Often, questions around histograms are framed as &quot;what's the 99th percentile?&quot; -- that is, what is the <em>maximum</em> value that 99% of users experience: this helps give perspective on data which may have a number of weird outliers (a.k.a. the <em>Bill Gates walks into a bar and everyone inside becomes a millionaire</em> effect). Let's take an initial stab at grabbing some percentiles of the data we were looking at earlier using the <a href="https://mozilla.github.io/bigquery-etl/mozfun/hist/#merge-udf"><code>mozfun.hist.merge</code></a> and <a href="https://mozilla.github.io/bigquery-etl/mozfun/hist/#percentiles-udf"><code>mozfun.hist.percentiles</code></a> UDFs:</p>
<pre><code class="language-sql">WITH merged_histogram AS (
SELECT
mozfun.hist.merge(
ARRAY_AGG(mozfun.hist.extract(payload.histograms.FX_TAB_SWITCH_SPINNER_VISIBLE_MS))
) AS spinner_visible_ms,
FROM
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample across channels
WHERE
normalized_channel = 'nightly' -- Only technically necessary if using telemetry.main or telemetry.main_1pct (see above)
AND normalized_os = 'Windows'
AND DATE(submission_timestamp) = '2020-04-20'
),
percentiles AS (
SELECT
mozfun.hist.percentiles(spinner_visible_ms, [.05, .25, .5, .75, .95]) AS percentile_nested
FROM
merged_histogram
)
SELECT
percentile,
value
FROM
percentiles
CROSS JOIN
UNNEST(percentiles.percentile_nested);
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/71410/source"><code>STMO#71410</code></a></p>
<p>Which gives us this set of results:</p>
<div class="table-wrapper"><table><thead><tr><th>Percentile</th><th>Value</th></tr></thead><tbody>
<tr><td>0.05</td><td>13</td></tr>
<tr><td>0.25</td><td>56</td></tr>
<tr><td>0.50</td><td>165</td></tr>
<tr><td>0.75</td><td>698</td></tr>
<tr><td>0.95</td><td>1,000</td></tr>
</tbody></table>
</div>
<p>So we see for this set of results that the 95th percentile is <code>1000ms</code>, the 75th percentile is <code>698ms</code>, and so on.</p>
<p>There's a bit of intermediate-to-advanced SQL in the above query, due to the fact that the <code>mozfun.hist.percentiles</code> UDF returns an <em>array</em> of results in a column (rather than a full-blown table) -- we wrangle the results into something we can handle using the <a href="https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#unnest"><code>UNNEST</code></a> operator combined with a cross-join at the end. If you don't immediately understand this, don't worry: it's just an implementation detail.</p>
<h2 id="viewing-change-of-percentiles-over-time"><a class="header" href="#viewing-change-of-percentiles-over-time">Viewing change of percentiles over time</a></h2>
<p>Knowing the approximate distribution of results on a given day is sort of interesting, but probably not what we really want: what we're usually interested in is the evolution of results <em>over time</em>. In particular, segmenting by <code>build_id</code> (a date-like structure in the <code>main</code> ping representing when Firefox was built) is a useful technique, as it allows us to see if changes to Firefox itself may have caused the distribution to change.</p>
<p>We can do this simply by <em>grouping by</em> the build id field, and then merging the histograms corresponding to each:</p>
<pre><code class="language-sql">WITH per_build_day AS (
SELECT
PARSE_DATETIME(&quot;%Y%m%d%H%M%S&quot;, application.build_id) AS build_id,
KEY,
SUM(value) AS value,
FROM
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample across channels,
UNNEST(
mozfun.hist.extract(
payload.histograms.FX_TAB_SWITCH_SPINNER_VISIBLE_MS
).VALUES
)
WHERE
normalized_channel = 'nightly' -- Only technically necessary if using telemetry.main or telemetry.main_1pct (see above)
AND normalized_os = 'Windows'
AND application.build_id &gt; FORMAT_DATE(&quot;%Y%m%d&quot;, DATE_SUB(CURRENT_DATE, INTERVAL 2 WEEK))
AND application.build_id &lt;= FORMAT_DATE(&quot;%Y%m%d&quot;, CURRENT_DATE)
AND DATE(submission_timestamp) &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 2 WEEK)
AND DATE(submission_timestamp) &lt;= CURRENT_DATE
GROUP BY
KEY,
build_id
),
per_build_day_as_struct AS (
SELECT
build_id,
STRUCT(ARRAY_AGG(STRUCT(KEY, value)) AS VALUES) AS spinner_visible_ms
FROM
per_build_day
GROUP BY
build_id
)
SELECT
build_id,
percentile,
value
FROM
per_build_day_as_struct
CROSS JOIN
UNNEST(
mozfun.hist.percentiles(
spinner_visible_ms,
[.05, .25, .5, .75, .95]
)
)
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/71472/source"><code>STMO#71472</code></a></p>
<p>As an implementation note, observe that we don't use <code>mozfun.hist.merge</code> here as we do above: doing so would require using <a href="https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators#array_agg"><code>ARRAY_AGG</code></a>, which can break down when processing large amounts of data. Instead we create an intermediate result (the <code>per_build_day</code> <code>WITH</code> clause) and then reprocess it into a structured representation. If you're curious what the version using <code>mozfun.hist.merge</code> would look like, see <a href="https://sql.telemetry.mozilla.org/queries/71413/source"><code>STMO#71413</code></a>.</p>
<p>In any case, rendering the data this query returns, we get a chart like this:</p>
<p><img src="cookbooks/../assets/exponential_histograms_tutorial/example_visualization_of_histogram_percentiles.png" alt="example visualization of histogram percentiles" /></p>
<p>You'll note that the 75th and the 95th percentiles are often the same. Which is to say: in 25% of cases, the value was somewhere between <code>698ms</code> and <code>1000ms</code>. Does this mean that 25% of the time people are seeing a <em>very</em> long-running tab spinner? <em>No!</em> It actually points to a flaw in our methodology, which GLAM was explicitly designed to address. For the last part of our tutorial, let's look into how it does that, and how to reproduce its approach.</p>
<h2 id="percentiles-from-client-normalized-histograms"><a class="header" href="#percentiles-from-client-normalized-histograms">Percentiles from client-normalized histograms</a></h2>
<p>The example above basically created one <em>giant</em> histogram per build and then gathered the percentiles out of each one. But histograms are not created equal! At the extreme end of things for the tab spinner, consider a user on an extremely old computer with various kinds of malware installed, constantly interacting with complex and slow web sites. Such a condition is going to trigger the tab spinner frequently, and for long periods. But it is not representative of the overall population, and probably shouldn't <em>unduly</em> influence our decision-making process.</p>
<p>A solution used by GLAM is to give each client &quot;one vote&quot;: that is, the aggregate histogram for a client over a day must sum up to one. Even in the extreme case where all tab spinner measurements fall between <code>658ms</code> and <code>1000ms</code> (the range of the highest bucket), the <em>maximum</em> number for that bucket is just &quot;1&quot;.</p>
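<p>To make the arithmetic concrete, here is a small illustrative sketch of that normalization for a single client, using a hard-coded (made-up) histogram and plain SQL rather than the UDF:</p>
<pre><code class="language-sql">-- A made-up per-client histogram: 10 tab switches in total for this client.
WITH client_histogram AS (
  SELECT * FROM UNNEST([
    STRUCT(0 AS key, 2 AS value),
    STRUCT(49 AS key, 6 AS value),
    STRUCT(1000 AS key, 2 AS value)
  ])
)
SELECT
  key,
  value / SUM(value) OVER () AS normalized_value -- this client's buckets now sum to exactly 1
FROM
  client_histogram;
</code></pre>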
<p>We can reproduce this approach by using the <a href="https://mozilla.github.io/bigquery-etl/mozfun/hist/#normalize-udf"><code>mozfun.hist.normalize</code></a> UDF, which explicitly takes a set of histograms and makes sure that the values for each one sum up to exactly one:</p>
<pre><code class="language-sql">WITH per_build_client_day AS (
SELECT
PARSE_DATETIME(&quot;%Y%m%d%H%M%S&quot;, application.build_id) AS build_id,
client_id,
mozfun.hist.normalize(
mozfun.hist.merge(
ARRAY_AGG(
mozfun.hist.extract(
payload.histograms.FX_TAB_SWITCH_SPINNER_VISIBLE_MS
)
)
)
) AS tab_switch_visible_ms
FROM
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample across channels
WHERE
normalized_channel = 'nightly' -- Only technically necessary if using telemetry.main or telemetry.main_1pct (see above)
AND normalized_os = 'Windows'
AND application.build_id &gt; FORMAT_DATE(&quot;%Y%m%d&quot;, DATE_SUB(CURRENT_DATE, INTERVAL 14 DAY))
AND application.build_id &lt;= FORMAT_DATE(&quot;%Y%m%d&quot;, CURRENT_DATE)
AND DATE(submission_timestamp) &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 14 DAY)
AND DATE(submission_timestamp) &lt;= CURRENT_DATE
GROUP BY
build_id,
client_id
),
merged_histograms AS (
SELECT
build_id,
KEY,
SUM(value) AS value,
FROM
per_build_client_day,
UNNEST(per_build_client_day.tab_switch_visible_ms.VALUES)
GROUP BY
KEY,
build_id
),
as_struct AS (
SELECT
build_id,
STRUCT(ARRAY_AGG(STRUCT(KEY, value)) AS VALUES) AS spinner_visible_long_ms
FROM
merged_histograms
GROUP BY
build_id
),
percentiles AS (
SELECT
build_id,
mozfun.hist.percentiles(
spinner_visible_long_ms,
[.05, .25, .5, .75, .95]
) AS percentile_nested
FROM
as_struct
)
SELECT
build_id,
percentile,
value
FROM
percentiles
CROSS JOIN
UNNEST(percentiles.percentile_nested);
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/71489/source"><code>STMO#71489</code></a></p>
<p>You'll notice this query groups by <code>client_id</code> in addition to <code>build_id</code> before applying <code>mozfun.hist.normalize</code>. Grouping by <code>client_id</code> gives each user equal representation and prevents &quot;power users&quot; from skewing the result.</p>
<p>In any case, the result of this query is this graph:</p>
<p><img src="cookbooks/../assets/exponential_histograms_tutorial/example_visualization_of_normalized_histogram_percentiles.png" alt="example visualization of normalized histogram percentiles" /></p>
<p>Things are looking much better! The 95th percentile is still capped out at 1000, but the other percentiles are much lower.</p>
<h2 id="slicing-along-arbitrary-dimensions"><a class="header" href="#slicing-along-arbitrary-dimensions">Slicing along arbitrary dimensions</a></h2>
<p>OK, so we've reproduced GLAM, but that isn't particularly exciting in and of itself: if you just wanted to see a GLAM-like view of things, GLAM by itself is going to do a better job. The power of writing SQL comes when you want to see how things look along an arbitrary set of dimensions. Let's look at an example question: what do the tab spinner percentiles look like for Windows 7? These are likely to be much older machines, so we'd expect things to look worse. But how much?</p>
<p>We can filter our query to <em>just</em> that group of users by adding an <code>AND normalized_os_version = &quot;6.1&quot;</code> clause to our query above:</p>
<pre><code class="language-sql">WITH per_build_client_day AS (
SELECT
PARSE_DATETIME(&quot;%Y%m%d%H%M%S&quot;, application.build_id) AS build_id,
client_id,
mozfun.hist.normalize(
mozfun.hist.merge(
ARRAY_AGG(
mozfun.hist.extract(
payload.histograms.FX_TAB_SWITCH_SPINNER_VISIBLE_MS
)
)
)
) AS tab_switch_visible_ms
FROM
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample across channels
WHERE
normalized_channel = 'nightly' -- Only technically necessary if using telemetry.main or telemetry.main_1pct (see above)
AND normalized_os = 'Windows'
AND normalized_os_version = &quot;6.1&quot;
AND application.build_id &gt; FORMAT_DATE(&quot;%Y%m%d&quot;, DATE_SUB(CURRENT_DATE, INTERVAL 14 DAY))
AND application.build_id &lt;= FORMAT_DATE(&quot;%Y%m%d&quot;, CURRENT_DATE)
AND DATE(submission_timestamp) &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 14 DAY)
AND DATE(submission_timestamp) &lt;= CURRENT_DATE
GROUP BY
build_id,
client_id
),
merged_histograms AS (
SELECT
build_id,
KEY,
SUM(value) AS value,
FROM
per_build_client_day,
UNNEST(per_build_client_day.tab_switch_visible_ms.VALUES)
GROUP BY
KEY,
build_id
),
as_struct AS (
SELECT
build_id,
STRUCT(ARRAY_AGG(STRUCT(KEY, value)) AS VALUES) AS spinner_visible_long_ms
FROM
merged_histograms
GROUP BY
build_id
),
percentiles AS (
SELECT
build_id,
mozfun.hist.percentiles(
spinner_visible_long_ms,
[.05, .25, .5, .75, .95]
) AS percentile_nested
FROM
as_struct
)
SELECT
build_id,
percentile,
value
FROM
percentiles
CROSS JOIN
UNNEST(percentiles.percentile_nested);
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/71437/source"><code>STMO#71437</code></a></p>
<p>If we do this, we see this chart:</p>
<p><img src="cookbooks/../assets/exponential_histograms_tutorial/example_visualization_of_normalized_histogram_percentiles_for_Windows_7.png" alt="example visualization of normalized histogram percentiles for Windows 7" /></p>
<p>As you can see, both the 75th and 95th percentiles are now in the highest bucket, and the 50th percentile is much higher as well. From this we can intuit that the user experience for these users is likely considerably worse, which is exactly what we would have expected.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/main_ping_exponential_histograms.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="real-time"><a class="header" href="#real-time">Real-time</a></h1>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/realtime.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="working-with-live-data"><a class="header" href="#working-with-live-data">Working with Live Data</a></h1>
<p>Live ping tables are the final destination for the telemetry ingestion pipeline. Incoming ping data is loaded into these tables approximately every 10 minutes, though a delay of up to 30 minutes is normal. Data in these tables is set to expire after 30 days.</p>
<p>Data from live ping tables is expected to be accessed through user-facing views. The names of all views accessing live data should include the <code>_live</code> suffix (for example <code>monitoring.topsites_click_rate_live</code>). Live tables are generally clustered and partitioned by <code>submission_timestamp</code>, which allows for writing more efficient queries that filter over short time windows. When repeatedly running the same query against live data, ensure that query caching is turned off; otherwise, newly arrived data might not appear in the results.</p>
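<p>For example, a minimal sketch of a query that takes advantage of that partitioning and clustering by restricting the scan to the last hour of data (using the <code>main</code> ping live table referenced later in this documentation):</p>
<pre><code class="language-sql">-- Count main pings received in the last hour; filtering on
-- submission_timestamp keeps the scan limited to recent partitions.
SELECT
  TIMESTAMP_TRUNC(submission_timestamp, MINUTE) AS submission_minute,
  COUNT(*) AS ping_count
FROM
  `moz-fx-data-shared-prod.telemetry_live.main_v4`
WHERE
  submission_timestamp &gt; TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 HOUR)
GROUP BY
  submission_minute
ORDER BY
  submission_minute
</code></pre>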
<p>Of note, user-facing views for live ping tables, unlike the ones for historical ping tables, are not automatically provisioned. Instead, live views are expected to be more curated, with each view being tied to a specific use case.</p>
<p>Live tables in <code>_derived</code> datasets can be queried from Redash and Looker; however, it is best practice to set up user-facing views for accessing the data.</p>
<h2 id="using-materialized-views"><a class="header" href="#using-materialized-views">Using Materialized Views</a></h2>
<p>When creating a view on live data, engineers should consider the specific use cases for the view and do some analysis of the cost associated with query patterns. In particular, cost will likely be an issue if the view is going to be used for some type of monitoring that involves frequent queries on Firefox-scale data. <a href="https://cloud.google.com/bigquery/docs/materialized-views-intro">BigQuery offers support for materialized views</a>. Materialized views are precomputed views that periodically cache the results of a query for increased performance and efficiency. Creating a materialized view that provides the relevant aggregates can yield significant cost savings when working with a large volume of live data.</p>
<p>When a materialized view is appropriate, it should be created in a relevant <code>_derived</code> dataset with a version suffix (following the same conventions as we do for derived tables populated via scheduled queries). It is then made user-facing via a simple <code>SELECT * FROM my_materialized_view</code> virtual view in a relevant user-facing dataset. More complex cases may need to union together multiple materialized views at this user-facing view level.</p>
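<p>As an illustration, a minimal sketch of that convention (the dataset and view names here are hypothetical):</p>
<pre><code class="language-sql">-- Hypothetical user-facing view that simply exposes a materialized view
-- living in a _derived dataset.
CREATE OR REPLACE VIEW
  `moz-fx-data-shared-prod.monitoring.my_aggregates_live`
AS
SELECT
  *
FROM
  `moz-fx-data-shared-prod.monitoring_derived.my_aggregates_live_v1`
</code></pre>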
<p>It is generally recommended that materialized views over live data have <code>enable_refresh=true</code> and <code>refresh_interval_minutes=10</code>. The <code>refresh_interval_minutes</code> parameter determines the minimum time between refreshes.</p>
<p>Although partition expiration of a materialized view is inherited from the underlying table, each materialized view should include an explicit start date in a <code>WHERE</code> clause to limit the amount of data scanned for an initial backfill, and that date should be advanced any time the view is recreated. Be aware that any change to the schema of the base table will invalidate the entire materialized view, meaning that the next scheduled refresh will incur a full backfill; keep this in mind when estimating the cost of a materialized view.</p>
<p>Definitions for materialized views live in the <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl repository</a> and usually have the following structure:</p>
<pre><code class="language-sql">CREATE MATERIALIZED VIEW
IF NOT EXISTS my_dataset_derived.my_new_materialized_view_live_v1
OPTIONS
(enable_refresh = TRUE, refresh_interval_minutes = 10)
AS
SELECT
submission_timestamp,
some_interesing_columns
FROM
`moz-fx-data-shared-prod.dataset_live.events_v1`
submission_timestamp &gt; DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAYS) -- limit amount of data to be backfilled ($$$)
</code></pre>
<h2 id="tools-for-visualizing-live-data"><a class="header" href="#tools-for-visualizing-live-data">Tools for Visualizing Live Data</a></h2>
<p>Currently, live data in BigQuery could be considered &quot;near real time&quot; or nearline (as opposed to online), with latency from ingestion (handled by our ingestion-sink code base) to each live table usually below 1 hour.</p>
<p>For near real time product monitoring use cases, it is recommended to use Looker for hosting dashboards that refresh on a regular interval such as every 10 minutes.</p>
<p>For real time operational monitoring use cases, it is recommended to rely on centralized SRE monitoring infrastructure, namely the InfluxCloud-hosted Grafana instance or the Stackdriver UI in the GCP console. For dashboards mixing monitoring data and (Mozilla Confidential) BigQuery data, it should be possible to serve that need through the SRE Grafana infrastructure.</p>
<h2 id="access-controls-for-services-accessing-live-data-in-bigquery"><a class="header" href="#access-controls-for-services-accessing-live-data-in-bigquery">Access Controls for Services Accessing Live Data in BigQuery</a></h2>
<p>So far, giving services access to the full data warehouse has been avoided; instead, it is preferred to grant access only to the specific tables needed to support a documented use case. This reduces the possibility of data warehouse changes inadvertently breaking a consuming service.
When provisioning BigQuery access to a Mozilla service, it is recommended to create a use case-specific authorized view and grant the service access only to that specific view. It is also acceptable to grant the service access to a user-facing virtual view and all underlying tables or materialized views being referenced.</p>
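<p>A minimal sketch of what such a use case-specific view might look like (the view name and selected columns here are hypothetical; the underlying table is the <code>main</code> ping live table referenced elsewhere in this documentation):</p>
<pre><code class="language-sql">-- Hypothetical narrow view exposing only the aggregates a consuming
-- service actually needs, rather than the full underlying table.
CREATE OR REPLACE VIEW
  `moz-fx-data-shared-prod.monitoring.my_service_ping_counts_live`
AS
SELECT
  TIMESTAMP_TRUNC(submission_timestamp, HOUR) AS submission_hour,
  normalized_channel,
  COUNT(*) AS ping_count
FROM
  `moz-fx-data-shared-prod.telemetry_live.main_v4`
WHERE
  submission_timestamp &gt; TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)
GROUP BY
  submission_hour,
  normalized_channel
</code></pre>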
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/live_data.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="see-my-pings"><a class="header" href="#see-my-pings">See My Pings</a></h1>
<p>So you want to see what you're sending to the telemetry pipeline, huh?
Well, follow these steps and we'll have you perusing your own data in no time.</p>
<h2 id="steps-to-viewing-your-pings"><a class="header" href="#steps-to-viewing-your-pings">Steps to Viewing Your Pings</a></h2>
<ol>
<li>
<p>Get your <code>clientId</code> from whatever product you're using. For desktop, it's available in <code>about:telemetry</code>.</p>
</li>
<li>
<p>Go to <a href="https://sql.telemetry.mozilla.org">STMO</a>.</p>
</li>
<li>
<p>Enter the following query:</p>
</li>
</ol>
<pre><code class="language-sql">SELECT
submission_timestamp,
document_id
FROM
`moz-fx-data-shared-prod.telemetry_live.main_v4` -- or crash, event, core, etc
WHERE
submission_timestamp &gt; TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 3 HOUR)
AND client_id = '&lt;your_client_id&gt;'
ORDER BY
submission_timestamp DESC
LIMIT 10
</code></pre>
<p>This will show you the timestamp and document id of the ten most recent
<code>main</code> pings you've sent in the last 3 hours.
You may include any other fields here that might be of interest to you.</p>
<p>The tables in the <code>telemetry_live</code> dataset have only a few minutes of
latency, so you can query those tables for pings from your <code>client_id</code>
with minimal additional waiting.</p>
<p>One thing to note is that BigQuery has its own query cache, so if you
run the same query several times in a row, it may fetch results from
its cache. You can make any change at all (such as adding a comment)
to force the query to run again and fetch updated results.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/view_pings_cep.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="search-metrics"><a class="header" href="#search-metrics">Search metrics</a></h1>
<p>Search metrics are delivered in main pings and often reflect usage over the course of many previous hours, so there is inherent delay. The delay makes it more reasonable to call these <code>intraday</code> search metrics rather than <code>almost real-time</code> search metrics.</p>
<p>Below is a query that gives the hourly <code>sap</code> search metrics for each major search engine in <code>CA</code> since the start of the current day, using <code>telemetry_live</code>.</p>
<pre><code class="language-sql">SELECT
DATE_TRUNC(submission_timestamp, HOUR) AS submission_hour,
`moz-fx-data-shared-prod.udf.normalize_search_engine`(split(key,&quot;.&quot;)[offset(0)]) as normalized_engine,
sum(mozfun.hist.`extract`(value).`sum`) AS searches
FROM
`moz-fx-data-shared-prod.telemetry_live.main_v4`,
UNNEST(payload.keyed_histograms.search_counts) AS sc
WHERE
DATE(submission_timestamp) &gt;= date_trunc(current_date(), DAY)
AND submission_timestamp &lt; TIMESTAMP_TRUNC(CURRENT_TIMESTAMP(), HOUR)
AND metadata.geo.country = 'CA'
GROUP BY 1,2
ORDER BY 1,2
</code></pre>
<p>You can include any other fields that might be of interest to you.</p>
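<p>For example, a sketch (not a maintained query) that additionally breaks the counts down by release channel just adds <code>normalized_channel</code> to the select list and the grouping:</p>
<pre><code class="language-sql">SELECT
  DATE_TRUNC(submission_timestamp, HOUR) AS submission_hour,
  normalized_channel,
  `moz-fx-data-shared-prod.udf.normalize_search_engine`(SPLIT(key, &quot;.&quot;)[OFFSET(0)]) AS normalized_engine,
  SUM(mozfun.hist.`extract`(value).`sum`) AS searches
FROM
  `moz-fx-data-shared-prod.telemetry_live.main_v4`,
  UNNEST(payload.keyed_histograms.search_counts) AS sc
WHERE
  DATE(submission_timestamp) &gt;= DATE_TRUNC(CURRENT_DATE(), DAY)
  AND submission_timestamp &lt; TIMESTAMP_TRUNC(CURRENT_TIMESTAMP(), HOUR)
  AND metadata.geo.country = 'CA'
GROUP BY 1, 2, 3
ORDER BY 1, 2, 3
</code></pre>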
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/real_time_search.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="metrics"><a class="header" href="#metrics">Metrics</a></h1>
<p>The most accurate and up-to-date metric information can be found on the <a href="https://mozilla-hub.atlassian.net/wiki/spaces/DATA/pages/620494911/Metrics+Inventory">Metrics Inventory</a> page in Confluence (Mozilla LDAP required).</p>
<p>The official source code definitions for the metrics can be found in <a href="https://github.com/mozilla/metric-hub/tree/main/definitions">metric-hub</a> in GitHub.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/metrics.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="operational"><a class="header" href="#operational">Operational</a></h1>
<p>This section contains tutorials on operational tasks that a data practitioner might want to perform. It is intended for a broad audience.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/operational/index.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="creating-a-prototype-data-project-on-google-cloud-platform"><a class="header" href="#creating-a-prototype-data-project-on-google-cloud-platform">Creating a Prototype Data Project on Google Cloud Platform</a></h1>
<p>If you are working on a more complex project (as opposed to ad-hoc or one-off analysis) which you intend to be run in production at some point, it may be worthwhile provisioning a separate <em>prototype</em> GCP project for it with access to our datasets. From the <a href="https://console.cloud.google.com/">Google Cloud Console</a>, you may then:</p>
<ul>
<li>Provision service accounts for querying BigQuery (including our production tables) or accessing other GCP resources from the command-line or inside Docker containers</li>
<li>Write and query data to private BigQuery tables, without worrying about interfering with what we have in production</li>
<li>Make Docker images available via the Google Container Registry (see <a href="cookbooks/deploying-containers.html">the cookbook on deploying containers</a>)</li>
<li>Create <a href="https://cloud.google.com/storage/">Google Cloud Storage</a> buckets for storing temporary data</li>
<li>Create <a href="https://cloud.google.com/compute/docs/instances">Google Compute Instances</a> for test-running software in the cloud</li>
<li>Create a temporary Kubernetes cluster for test-running a scheduled job with <a href="https://github.com/mozilla/telemetry-airflow">telemetry-airflow</a></li>
<li>Create static dashboards with protosaur (see <a href="cookbooks/./operational/protosaur.html">Creating Static Dashboards with Protosaur</a>)</li>
<li>Track the costs for all of the above using the Google Cost Dashboard feature of the GCP console</li>
</ul>
<p>This has a number of advantages over our traditional approach of creating bulk &quot;sandbox&quot; projects for larger teams:</p>
<ul>
<li>Easy to track costs of individual components</li>
<li>Can self-serve short-lived administrative credentials which exist only for the lifespan of the project.</li>
<li>Can easily spin down projects and resources which have run their course</li>
</ul>
<p>Note that these prototype GCP projects are not intended to be used for projects which are already in <em>production</em> -- those should be maintained on operations-supported projects, presumably after a development phase. Nor are they meant for ad-hoc analysis or experimentation -- for that, just file a request as outlined in the <a href="cookbooks/bigquery/access.html#access-request">Accessing BigQuery</a> cookbook.</p>
<p>Each prototype project has a data engineering contact associated with it: this is the person who will create the project for you. Additionally, they are meant to be a resource you can freely ask for advice on how to query or use GCP, and how to build software that lends itself to productionization. If you are a data engineer, the data engineering contact may be yourself, but you should still follow the procedure below for tracking purposes in any case.</p>
<p>To request the creation of a prototype GCP project, <a href="https://bugzilla.mozilla.org/enter_bug.cgi?assigned_to=nobody%40mozilla.org&amp;bug_ignored=0&amp;bug_severity=normal&amp;bug_status=NEW&amp;bug_type=task&amp;cf_fx_iteration=---&amp;cf_fx_points=---&amp;comment=%2A%2A%20Please%20fill%20out%20the%20following%20information%20and%20needinfo%20the%20data%20engineering%20contact%20you%20specified%20below%20%28unless%20the%20contact%20is%20yourself%29%2C%20don%27t%20forget%20to%20change%20the%20title%20to%20use%20your%20project%20name%21%20%2A%2A%0D%0A%0D%0AGCP-compatible%20project%20name%20%28e.g.%20missioncontrol-v2-dev%2C%20adi-forecasting-dev%29%3A%0D%0ALDAP%20of%20people%20who%20require%20administrative%20privileges%20for%20this%20project%3A%20%0D%0AProject%20timeline%20%28maximum%206%20months%2C%20projects%20may%20be%20renewed%20if%20development%20is%20still%20ongoing%20at%20the%20end%20of%20that%20period%29%3A%0D%0AApproximate%20budget%20for%20this%20project%20%28if%20expected%20to%20be%20greater%20than%20%241000%29%3A%0D%0AWhether%20this%20project%20will%20be%20used%20to%20import%20external%20data%20into%20GCP%2C%20and%20if%20so%2C%20from%20where%20%28if%20the%20answer%20is%20yes%2C%20needinfo%20a%20member%20of%20Data%20SRE%20for%20an%20ops%20evaluation%29%3A%0D%0AData%20Engineering%20contact%20for%20this%20project%3A%0D%0A%0D%0AFor%20more%20information%2C%20please%20see%20%5Bthe%20gcp%20project%20cookbook%5D%28https%3A%2F%2Fdocs.telemetry.mozilla.org%2Fcookbooks%2Fgcp-projects.html%29%20on%20docs.telemetry.mozilla.org.&amp;component=General&amp;contenttypemethod=list&amp;contenttypeselection=text%2Fplain&amp;defined_groups=1&amp;filed_via=standard_form&amp;flag_type-4=X&amp;flag_type-607=X&amp;flag_type-800=X&amp;flag_type-803=X&amp;flag_type-936=X&amp;form_name=enter_bug&amp;maketemplate=Remember%20values%20as%20bookmarkable%20template&amp;op_sys=Unspecified&amp;priority=--&amp;product=Data%20Platform%20and%20Tools&amp;rep_platform=Unspecified&amp;short_desc=New%20GCP%20Project%20Request%3A%20name-of-project&amp;status_whiteboard=%5Bgcp-project-request%5D&amp;target_milestone=---&amp;version=unspecified">file a bug</a> using the provided template.
Not sure if you need a project like this? Don't know who to specify as a Data Engineering contact? Not sure what your project budget might be? <a href="cookbooks/../concepts/getting_help.html">Get in touch with the data platform team</a>.</p>
<p>We are currently <a href="https://mana.mozilla.org/wiki/display/DATA/Active+GCP+Prototype+Projects">tracking these projects on mana</a> (link requires Mozilla LDAP).</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/gcp-projects.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="creating-static-dashboards-with-protosaur"><a class="header" href="#creating-static-dashboards-with-protosaur">Creating Static Dashboards with Protosaur</a></h1>
<p><a href="https://protosaur.dev"><code>protosaur.dev</code></a> allows data practitioners at Mozilla to create <em>prototype</em> static dashboards behind Mozilla SSO (single-sign-on).
As the name implies, protosaur is intended for prototypes -- dashboards created using this system are not monitored or supported by Data Operations or Data Engineering.
Protosaur is a simple static hosting service: it does not provide form handling, databases, or any other kind of server-side operation.
However, for presenting dashboards and other types of data visualization, a static website is often all you need (see, for example, the galaxy of sites produced using <a href="https://pages.github.com/">GitHub Pages</a>).</p>
<p>Protosaur's architecture is simple: it serves files in a <a href="https://cloud.google.com/storage/">Google Cloud Storage</a> (GCS) bucket under the <a href="https://protosaur.dev"><code>protosaur.dev</code></a> domain. How you get the files into the bucket is entirely up to you: you can use CircleCI, Airflow, or any other method that you might choose.</p>
<p>The current procedure for creating a new protosaur dashboard is as follows:</p>
<ul>
<li>Find a GCP project to use for the GCS bucket. This could be a team project (if you have one already) or a <a href="cookbooks/operational/../gcp-projects.html">GCP prototype project</a>. Using a prototype project is preferred for larger efforts with multiple components in addition to a front-end dashboard.</li>
<li>Create a GCS bucket in said project.</li>
<li>Upload content into the bucket (e.g. via <a href="https://cloud.google.com/storage/docs/gsutil"><code>gsutil</code></a>).</li>
<li>Add the project to Protosaur's configuration, which currently lives in the <a href="https://github.com/mozilla/protodash">protodash repository</a> on GitHub. For up to date instructions on how to do this, <a href="https://github.com/mozilla/protodash/blob/master/README.md">see the project's README</a>.</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/operational/protosaur.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="scheduling-queries"><a class="header" href="#scheduling-queries">Scheduling Queries</a></h1>
<p>Want to refresh a dashboard or other query against our data store automatically? There are a few options available, depending on what you want to do.</p>
<ul>
<li><a href="cookbooks/scheduling_queries.html#sqltelemetrymozillaorg">sql.telemetry.mozilla.org</a></li>
<li><a href="cookbooks/scheduling_queries.html#bigquery-etl">bigquery-etl</a></li>
<li><a href="cookbooks/scheduling_queries.html#scheduling-queries-using-gcp">Scheduling Queries using GCP</a></li>
</ul>
<h2 id="sqltelemetrymozillaorg"><a class="header" href="#sqltelemetrymozillaorg">sql.telemetry.mozilla.org</a></h2>
<p>This is by far the easiest option. Any query on <a href="cookbooks/../tools/stmo.html">STMO</a> can be made to refresh itself automatically. This can be used to power both dashboards inside sql.telemetry.mozilla.org and web sites like the <a href="https://mikeconley.github.io/bug1310250/">Tab Spinner Dashboard</a> and <a href="https://github.com/mozilla/funnel">Numbers that Matter</a> via Redash's CSV and JSON APIs.</p>
<p>However, there are disadvantages to this approach:</p>
<ul>
<li>Every time it is refreshed, STMO re-runs the query from scratch. This is fine for smaller or less frequently updated dashboards but when querying a large amount of data, <a href="cookbooks/./bigquery/optimization.html">this can get expensive</a>.</li>
<li>Aside from hand-rolled JavaScript or Python scripts, there isn't a way to run a query over the results of an STMO query.</li>
<li>Queries on STMO are not peer reviewed and are not supported by Data SRE (read: no one will be notified if your query breaks).</li>
</ul>
<h2 id="bigquery-etl"><a class="header" href="#bigquery-etl">bigquery-etl</a></h2>
<p>Mozilla's Data Engineering maintains a repository called <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a> which can incrementally create datasets based on an SQL query using our Airflow infrastructure. This requires a little bit more work to set up than scheduling a query on STMO, but has the advantage of being more cost effective, reliable, and amenable to further exploration.</p>
<p>Also, since the procedure for scheduling a query in this way is submitting a pull request against the bigquery-etl repository, this is an easy route to getting peer review from the extended Data Team.</p>
<p>For more information on how to use this approach, see <a href="https://mozilla.github.io/bigquery-etl/cookbooks/creating_a_derived_dataset/">A quick guide to creating a derived dataset with bigquery-etl</a>.</p>
<h2 id="scheduling-queries-using-gcp"><a class="header" href="#scheduling-queries-using-gcp">Scheduling Queries using GCP</a></h2>
<p>Finally, <a href="https://cloud.google.com/bigquery/docs/scheduling-queries">you can schedule queries using GCP</a>. This is generally not recommended; <a href="cookbooks/../concepts/getting_help.html">reach out to the Data team</a> if you think you need to do this.</p>
<p>GCP scheduled queries are to be used only for short-lived queries: queries that are active for more than 30 days will be automatically unscheduled.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/scheduling_queries.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="building-and-deploying-containers-to-google-container-registry-gcr-with-circleci"><a class="header" href="#building-and-deploying-containers-to-google-container-registry-gcr-with-circleci">Building and Deploying Containers to Google Container Registry (GCR) with CircleCI</a></h1>
<p>The following cookbook describes how to set up automated build and deployment for containers with CircleCI, a useful pattern for scheduling custom jobs in Google Kubernetes Engine.</p>
<p>Note that this method is intended for rapid prototyping rather than for production workloads.
If you need to transition a prototype to a production deployment,
<a href="https://bugzilla.mozilla.org/enter_bug.cgi?product=Data+Platform+and+Tools&amp;component=Operations">file a Data Platform and Tools &gt; Operations bug</a>
to start the conversation.</p>
<ul>
<li><a href="cookbooks/deploying-containers.html#assumptions">Assumptions</a></li>
<li><a href="cookbooks/deploying-containers.html#steps">Steps</a>
<ul>
<li><a href="cookbooks/deploying-containers.html#on-gcp">On GCP</a></li>
<li><a href="cookbooks/deploying-containers.html#on-circleci">On CircleCI</a></li>
<li><a href="cookbooks/deploying-containers.html#in-your-github-repo">In your GitHub Repo</a></li>
<li><a href="cookbooks/deploying-containers.html#optional">Optional</a></li>
</ul>
</li>
</ul>
<h2 id="assumptions"><a class="header" href="#assumptions">Assumptions</a></h2>
<ul>
<li>Your GitHub project's repository has a working Dockerfile at its root
<ul>
<li>If your file is not named <code>Dockerfile</code> or not located in the repo root, see the docs for the <a href="https://circleci.com/orbs/registry/orb/circleci/gcp-gcr">CircleCI GCP-GCR orb</a> for additional configuration</li>
</ul>
</li>
<li>The repository is in the <a href="https://github.com/mozilla"><code>mozilla</code> GitHub org</a> (or another org with a paid CircleCI account)</li>
</ul>
<h2 id="steps"><a class="header" href="#steps">Steps</a></h2>
<h3 id="on-gcp"><a class="header" href="#on-gcp">On GCP</a></h3>
<ul>
<li>Make sure &quot;Container Registry&quot; (https://console.cloud.google.com/gcr/images/&lt;your-project-id&gt;) is enabled for your GCP project</li>
<li>Create a service account, give it the &quot;Storage Admin&quot; role and create a key
<ul>
<li>Console Link: <a href="https://console.cloud.google.com/iam-admin/serviceaccounts">Google Cloud Platform</a></li>
<li>Additional documentation: <a href="https://cloud.google.com/container-registry/docs/access-control?hl=en_US">Configuring access control  |  Container Registry Documentation</a></li>
</ul>
</li>
</ul>
<h3 id="on-circleci"><a class="header" href="#on-circleci">On CircleCI</a></h3>
<ul>
<li><strong>IMPORTANT SECURITY STEP</strong>
<ul>
<li>Go to your project's CircleCI Advanced Settings Page (e.g. https://circleci.com/gh/mozilla/pensieve/edit#advanced-settings) and make sure that the &quot;<em>Pass secrets to builds from forked pull requests</em>&quot; option is TURNED OFF
<ul>
<li>This prevents a bad actor from creating a PR with a CI job that spits out your environment variables to the console, for instance</li>
</ul>
</li>
<li>If you can't access your project settings page, make sure you're logged into CircleCI via your Mozilla GitHub account and that you are a project administrator</li>
</ul>
</li>
<li>On the CircleCI Environment Variables page (e.g. https://circleci.com/gh/mozilla/pensieve/edit#env-vars), add:
<ul>
<li><code>GOOGLE_PROJECT_ID</code>: the project ID that you created in step 1</li>
<li><code>GOOGLE_COMPUTE_ZONE</code>: any compute zone will do, apparently -- try <code>us-west1</code> if you're agnostic</li>
<li><code>GCLOUD_SERVICE_KEY</code>: paste in the entire text of the service account key that you generated in step 2</li>
<li>Check out the docs for the <a href="https://circleci.com/orbs/registry/orb/circleci/gcp-gcr">CircleCI GCP-GCR orb</a> for other environment variables that you may set</li>
</ul>
</li>
</ul>
<h3 id="in-your-github-repo"><a class="header" href="#in-your-github-repo">In your GitHub Repo</a></h3>
<ul>
<li>In your CircleCI config file add a changeset like this:
<ul>
<li><a href="https://github.com/mozilla/pensieve/commit/b56f6f78b16d5893ff1cbf1ba895fa5bc85266c0">Add automated deployment of docker image to google container registry…</a></li>
<li>The <code>orb</code> directive allows the use of the <a href="https://circleci.com/orbs/registry/orb/circleci/gcp-gcr">CircleCI GCP-GCR orb</a> build-and-push-image job</li>
<li>In your <code>workflows</code> section, add <code>gcp-gcr/build-and-push-image</code> as a job and require any dependencies you'd like to pass before pushing a new image. Assuming you only want this deployment to occur on new commits to main, add a filter for only the main branch (as in the changeset above)</li>
</ul>
</li>
<li>Create and merge a pull request for this changeset and your newly built image should be in your project's container registry in a few moments!</li>
</ul>
<h3 id="optional"><a class="header" href="#optional">Optional</a></h3>
<p>If your repository is public, you may want to make its container registry publicly readable as well. Go to the GCP container registry's Settings tab, and in the &quot;Public access&quot; section, change the visibility for <code>gcr.io</code> (the default host if you followed these instructions) to <code>Public</code>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/deploying-containers.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="making-datasets-publicly-available"><a class="header" href="#making-datasets-publicly-available">Making Datasets Publicly Available</a></h1>
<p>Currently, only datasets and query results that are available in BigQuery and
defined in the <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a> repository can be made publicly available.
See the <a href="https://mozilla.github.io/bigquery-etl/cookbooks/creating_a_derived_dataset/">bigquery-etl documentation</a>
for information on how to create and schedule datasets. Before data can be published, a data review is
required.</p>
<p>To make query results publicly available, a <a href="https://github.com/mozilla/bigquery-etl#query-metadata"><code>metadata.yaml</code> file</a>
must be added alongside the query in bigquery-etl. For example:</p>
<pre><code class="language-yaml">friendly_name: SSL Ratios
description: &gt;-
Percentages of page loads Firefox users have performed that were
conducted over SSL broken down by country.
owners:
- example@mozilla.com
labels:
application: firefox
incremental: true # incremental queries add data to existing tables
schedule: daily # scheduled in Airflow to run daily
public_json: true
public_bigquery: true
review_bug: 1414839 # Bugzilla bug ID of data review
incremental_export: false # non-incremental JSON export writes all data to a single location
</code></pre>
<p>The following options define how data is published:</p>
<ul>
<li><code>public_json</code>: data is available through the <a href="https://public-data.telemetry.mozilla.org">public HTTP endpoint</a></li>
<li><code>public_bigquery</code>: data is publicly available on BigQuery
<ul>
<li>tables will get published in the <code>mozilla-public-data</code> GCP project, which is accessible
to everyone, including external users</li>
</ul>
</li>
<li><code>incremental_export</code>: determines how data gets split up
<ul>
<li><code>true</code>: data for each <code>submission_date</code> gets exported into separate directories (e.g.
<code>files/2020-04-15</code>, <code>files/2020-04-16</code>, ...)</li>
<li><code>false</code>: all data gets exported into one <code>files/</code> directory</li>
</ul>
</li>
<li><code>incremental</code>: indicates how data gets updated based on the query and Airflow configuration
<ul>
<li><code>true</code>: data gets incrementally updated</li>
<li><code>false</code>: the entire table data gets updated</li>
</ul>
</li>
<li><code>review_bug</code>: Bugzilla bug number of the data review</li>
</ul>
<p>Data will get published when the query is executed in Airflow. Metadata of available public
data on Cloud Storage is updated daily through a separate Airflow task.</p>
<p>More information about accessing public data can be found in
<a href="cookbooks/../cookbooks/public_data.html">Accessing Public Data</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/publishing_datasets.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="connecting-sheets-and-external-data-to-bigquery"><a class="header" href="#connecting-sheets-and-external-data-to-bigquery">Connecting Sheets and External Data to BigQuery</a></h1>
<p>Google Sheets and other external data sources can be connected to our Data Warehouse and made available as tables via <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a>. Updates made to the connected data source will be instantly available in the BigQuery table. The created tables can be <a href="https://github.com/mozilla/lookml-generator/blob/main/custom-namespaces.yaml">made available in Looker</a> and restricted in access using <a href="https://mozilla-hub.atlassian.net/wiki/spaces/SRE/pages/27924789/Data+Access+Workgroups">Data Access Workgroups</a>.</p>
<h2 id="connecting-sheets"><a class="header" href="#connecting-sheets">Connecting Sheets</a></h2>
<p>To connect a Google Sheet to BigQuery, the following steps need to be followed:</p>
<ol>
<li>Clone the <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a> repository locally. Setting up the <code>bqetl</code> CLI tooling is optional; all the steps here can be done manually.</li>
<li>Create a new destination table configuration under the BigQuery project and dataset in which the table should be created:
<ul>
<li>If the spreadsheet should be accessible through the table <code>moz-fx-data-shared-prod.telemetry_derived.insightful_spreadsheet_v1</code> then create a new folder <code>insightful_spreadsheet_v1</code> under <code>sql/moz-fx-data-shared-prod/telemetry_derived</code>.</li>
</ul>
</li>
<li>Create a <code>metadata.yaml</code> file.
<ul>
<li>For <code>insightful_spreadsheet_v1</code> the file would need to be created under <code>sql/moz-fx-data-shared-prod/telemetry_derived/insightful_spreadsheet_v1/metadata.yaml</code></li>
</ul>
</li>
<li>Open the <code>metadata.yaml</code> file and specify the configuration similar to the following:</li>
</ol>
<pre><code class="language-yaml">friendly_name: Insightful Spreadsheet
description: &gt;
A description of what the data represents
owners:
- example@mozilla.com
external_data:
format: google_sheets
source_uris:
- https://docs.google.com/spreadsheets/d/Avakdiasl341kdasdf # URL to the spreadsheet
options:
skip_leading_rows: 1 # number of rows that should be skipped, e.g if there are header rows
workgroup_access: # the workgroup_access is optional, used for restricting data access
- role: roles/bigquery.dataViewer
members:
- workgroup:secret/gp
</code></pre>
<ul>
<li>It is possible to provide multiple URLs to spreadsheets. These spreadsheets need to have the same structure (same columns and types) and will be combined (aka <code>UNION</code>ed) in the BigQuery table.</li>
<li><code>workgroup_access</code> is optional and does not need to be specified for data accessible by Mozilla employees. It only needs to be specified if a subset of people, belonging to a specific workgroup, should have access.</li>
</ul>
<ol start="5">
<li>Create a <code>schema.yaml</code> file.
<ul>
<li>For <code>insightful_spreadsheet_v1</code> the file would need to be created under <code>sql/moz-fx-data-shared-prod/telemetry_derived/insightful_spreadsheet_v1/schema.yaml</code></li>
</ul>
</li>
<li>Open the <code>schema.yaml</code> file and specify the structure of the spreadsheet (aka schema) similar to the following:</li>
</ol>
<pre><code class="language-yaml">fields:
- mode: NULLABLE
name: first_column_name # this will be the column name in the BigQuery table for the first spreadsheet column
type: STRING # this will be the data type used for this column in BigQuery
- mode: NULLABLE
name: second_column_name
type: DATE
- mode: NULLABLE
name: third_column_name
type: FLOAT64
</code></pre>
<ol start="7">
<li>Go to the spreadsheet, click on &quot;Share&quot; and add the following service account as <em>Editor</em>: <code>jenkins-node-default@moz-fx-data-terraform-admin.iam.gserviceaccount.com</code>
<ul>
<li>This is necessary to ensure the correct access permissions get applied to the spreadsheet</li>
</ul>
</li>
<li>Open a pull-request against <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a> and tag someone for review.</li>
<li>Once the PR has been reviewed and merged, the table will be available the next day in BigQuery.
<ul>
<li>If the table should be made available immediately, then go to the <a href="https://workflow.telemetry.mozilla.org/dags/bqetl_artifact_deployment/grid"><code>bqetl_artifact_deployment</code> Airflow DAG</a> and clear the <code>publish_new_tables</code> task. This might need to be done by a data engineer or someone who has permissions to trigger and clear tasks in Airflow. The table will be available as soon as the task finishes.</li>
</ul>
</li>
</ol>
<p>For confidential data it is generally recommended to add these configurations to <a href="https://github.com/mozilla/private-bigquery-etl">private-bigquery-etl</a>. The process and configurations are the same; the only difference is that the repository is not publicly accessible.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/operational/connecting_external_data_bigquery.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="collecting-new-data"><a class="header" href="#collecting-new-data">Collecting New Data</a></h1>
<h2 id="guidelines"><a class="header" href="#guidelines">Guidelines</a></h2>
<p>For information about what sorts of data may be collected,
and for information on getting a data collection request reviewed,
please read the <a href="https://wiki.mozilla.org/Firefox/Data_Collection">Data Collection Guidelines.</a></p>
<h2 id="mechanics"><a class="header" href="#mechanics">Mechanics</a></h2>
<p>The mechanics of how to instrument new data collection in Firefox are covered in
<a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/start/adding-a-new-probe.html">Adding a new Telemetry probe</a>.</p>
<p>For non-Telemetry data collection, we have a mechanism for streamlining
ingestion of structured (JSON) data that utilizes the same underlying
infrastructure. See <a href="datasets/../cookbooks/new_ping.html">this cookbook</a> for details on using it.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/new_data.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="client-implementation-guidelines-for-experiments"><a class="header" href="#client-implementation-guidelines-for-experiments">Client Implementation Guidelines for Experiments</a></h1>
<p><strong>Note</strong>: This guidance is useful for implementing Normandy experiments. To ship experiments with the Nimbus platform, please see the guidance for engineers at <a href="https://experimenter.info/">https://experimenter.info/</a>.</p>
<p>There are three supported approaches for enabling experimental features for Firefox:</p>
<ul>
<li><a href="cookbooks/client_guidelines.html#prefs">Firefox Prefs</a>
<ul>
<li>Prefs can be used to control features that <strong>land in-tree</strong>.
<a href="cookbooks/client_guidelines.html#feature-gates">Feature Gates</a> provide a wrapper around prefs that can be used from JavaScript.</li>
</ul>
</li>
<li><a href="cookbooks/client_guidelines.html#extensions">Firefox Extensions</a> AKA &quot;<strong>Add-ons</strong>&quot;.
<ul>
<li>If the feature being tested should not land in the tree, or if it will ultimately ship as an extension, then an extension should be used.</li>
</ul>
</li>
</ul>
<p>New features go through the standard Firefox review, testing, and deployment processes, and are then enabled experimentally in the field using <a href="https://github.com/mozilla/normandy">Normandy</a>.</p>
<h2 id="prefs"><a class="header" href="#prefs">Prefs</a></h2>
<p>Firefox Preferences (AKA &quot;prefs&quot;) are commonly used to enable and disable features. However, prefs are more complex to implement correctly than <a href="cookbooks/client_guidelines.html#feature-gates">feature gates</a>.</p>
<p><strong>Each pref should represent a different experimental treatment</strong>. Normandy does not currently support experimental features that require multiple prefs (though it will soon); in the meantime, an <a href="cookbooks/client_guidelines.html#extensions">extension</a> such as <a href="https://github.com/nhnt11/multipreffer">multipreffer</a> may be used.</p>
<p>There are three types of Prefs:</p>
<ol>
<li>Built-in prefs - shipped with Firefox, in <code>firefox.js</code>.</li>
<li><code>user branch</code> - set by the user, overriding built-in prefs.</li>
<li><code>default branch</code> - overrides built-in prefs (but not <code>user branch</code> prefs). Only persists until the browser session ends; the next restart will revert to either the built-in or <code>user branch</code> value (if set).</li>
</ol>
<p><a href="https://github.com/mozilla/normandy">Normandy</a> supports overriding both the <code>user</code> and <code>default</code> branches, although the latter is preferred as it does not permanently override user settings. <code>default</code> branch prefs are simple to reset since they do not persist past a restart.</p>
<p><strong>In order for features to be activated experimentally using <code>default branch</code> prefs</strong>:</p>
<ul>
<li>The feature must not start up before <code>final-ui-startup</code> is observed.</li>
</ul>
<p>For instance, to set an observer:</p>
<pre><code class="language-js">Services.obs.addObserver(this, &quot;final-ui-startup&quot;, true);
</code></pre>
<p>In this example, <code>this</code> would implement an <code>observe(subject, topic, data)</code> function which will be called when <code>final-ui-startup</code> is observed. See the <a href="https://searchfox.org/mozilla-central/rev/59e797b66f5ce8a27ede0e7677688931be7aed20/xpcom/ds/nsIObserverService.idl#24-39">Observer documentation</a> for more information.</p>
<ul>
<li>It must be possible to enable/disable the feature at runtime, via a pref change.</li>
</ul>
<p>This is similar to the observer pattern above:</p>
<pre><code class="language-js">Services.prefs.addObserver(&quot;pref_name&quot;, this);
</code></pre>
<p>More information is available in the <a href="https://searchfox.org/mozilla-central/source/modules/libpref/nsIPrefService.idl">Preference service documentation</a>.</p>
<ul>
<li>
<p>Never use <code>Services.prefs.prefHasUserValue()</code>, or any other function specific to <code>user branch</code> prefs.</p>
</li>
<li>
<p>Prefs should be set by default in <code>firefox.js</code></p>
</li>
</ul>
<p>If your feature cannot abide by one or more of these rules (for instance, it needs to run at startup and/or cannot be toggled at runtime), then experimental preferences can be set on the <code>user branch</code>. This is more complex than using the methods described above; user branch prefs override the user's choice, which is a really complex thing to try to support when flipping prefs experimentally. We also need to be careful to back up and reset the pref, and then figure out how to resolve conflicts if the user has changed the pref in the meantime.</p>
<h2 id="feature-gates"><a class="header" href="#feature-gates">Feature Gates</a></h2>
<p>A new Feature Gate library for Firefox Desktop is now available.</p>
<p><strong>Each feature gate should represent a different experimental treatment</strong>. If your experimental feature requires multiple flags, then Normandy will not be able to support this directly and an <a href="cookbooks/client_guidelines.html#extensions">extension</a> may be used.</p>
<h3 id="feature-gate-caveats"><a class="header" href="#feature-gate-caveats">Feature Gate caveats</a></h3>
<p>The current Feature Gate library comes with a few caveats, and may not be appropriate for your situation:</p>
<ul>
<li>Only JS is supported.</li>
<li>Always asynchronous.</li>
</ul>
<p>Future versions of the Feature Gate API will include C++/Rust support and a synchronous API.</p>
<h3 id="using-the-feature-gate-library"><a class="header" href="#using-the-feature-gate-library">Using the Feature Gate library</a></h3>
<p>Read <a href="https://firefox-source-docs.mozilla.org/toolkit/components/featuregates/featuregates/index.html">the documentation</a> to get started.</p>
<h2 id="extensions"><a class="header" href="#extensions">Extensions</a></h2>
<p>Firefox currently supports the <a href="https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions">Web Extensions API</a>.</p>
<p><strong>If new WebExtension APIs are needed, they should land in-tree</strong>. Extensions which are signed by Mozilla can load privileged code using the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/extensions/webextensions/index.html">WebExtension Experiments</a>, but this is not preferred.</p>
<p>WebExtensions go through the same correctness and performance tests as other features. This is possible using the Mozilla tryserver by dropping your XPI into <code>testing/profiles/common/extensions</code> in <code>mozilla-central</code> and pushing to Tryserver - see the <a href="cookbooks/client_guidelines.html#testing-extensions">Testing Extensions</a> section below.</p>
<p>NOTE - it is ideal to test against the version of Firefox which the extension will be released against, but there is a <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1435403">bug related to artifact builds on release channels</a> which must be worked around. The workaround is pretty simple (modify an <code>artifacts.py</code> file), but resolving this bug will make the process much simpler.</p>
<p><strong>Each extension can represent a different experimental treatment (preferred), or the extension can choose the branch internally</strong>.</p>
<h3 id="shield-studies"><a class="header" href="#shield-studies">SHIELD studies</a></h3>
<p>The previous version of the experiments program, SHIELD, always bundled privileged code with extensions and would do things such as mock UI features in Firefox.</p>
<p>This sort of approach is discouraged for new features - land these (or the necessary WebExtension APIs) in-tree instead.</p>
<p>For the moment, the <a href="https://github.com/mozilla/shield-studies-addon-utils/">SHIELD Study Add-on Utilities</a> may be used if the extension needs to control the lifecycle of the study, but using one extension per experimental treatment makes this unnecessary and is preferred. The APIs provided by the SHIELD Study Add-on Utilities will be available as privileged APIs shipped with Firefox soon.</p>
<h1 id="development-and-testing"><a class="header" href="#development-and-testing">Development and Testing</a></h1>
<h2 id="testing-built-in-features"><a class="header" href="#testing-built-in-features">Testing Built-in Features</a></h2>
<p>Firefox features go through standard development and testing processes. See the <a href="https://firefox-source-docs.mozilla.org/">Firefox developer guide</a> for more information.</p>
<h2 id="testing-extensions"><a class="header" href="#testing-extensions">Testing Extensions</a></h2>
<p>Extensions do not need to go through the same process, but should take advantage of Mozilla CI and bug tracking systems:</p>
<ol>
<li>Use the Mozilla CI to test changes (tryserver).</li>
<li>Performance tests (<strong>this step is required</strong>) - extension XPI files should be placed in <code>testing/profiles/common/extensions/</code>, which will cause test harnesses to load the XPI.</li>
<li>Custom unit/functional tests (AKA <code>xpcshell</code>/<code>mochitest</code>) may be placed in <code>testing/extensions</code>, although running these tests outside Mozilla CI is acceptable so these are <strong>optional</strong>.</li>
<li>Receive reviewer approval. A Firefox peer <strong>must sign off</strong> if this extension contains privileged code, aka WebExtension Experiments.</li>
</ol>
<ul>
<li>Any <a href="https://wiki.mozilla.org/Modules/All#Firefox">Firefox Peer</a> should be able to do the review, or point you to someone who can.</li>
</ul>
<ol start="5">
<li>Extension is signed.</li>
<li>An email to <code>pi-request@mozilla.com</code> is sent to request QA.</li>
<li>QA approval signed off in Bugzilla.</li>
<li>Extension is shipped via <a href="https://github.com/mozilla/normandy">Normandy</a>.</li>
</ol>
<h2 id="example-extensions-testing-workflow"><a class="header" href="#example-extensions-testing-workflow">Example Extensions Testing Workflow</a></h2>
<p>Note that for the below to work you only need <a href="https://www.mercurial-scm.org/">Mercurial</a> installed, but if you want to do local testing you must be set up to <a href="https://firefox-source-docs.mozilla.org/setup/index.html">build Firefox</a>. You don't need to build Firefox from source; <a href="https://firefox-source-docs.mozilla.org/contributing/build/artifact_builds.html">artifact builds</a> are sufficient.</p>
<p>In order to use Mozilla CI (AKA &quot;<a href="https://firefox-source-docs.mozilla.org/tools/try/">Tryserver</a>&quot;), you must have a full clone of the <code>mozilla-central</code> repository:</p>
<pre><code class="language-bash">hg clone https://hg.mozilla.org/mozilla-central
cd mozilla-central
</code></pre>
<p>Copy in unsigned XPI, and commit it to your local Mercurial repo:</p>
<pre><code class="language-bash">cp ~/src/my-extension.xpi testing/profiles/common/extensions/
hg add testing/profiles/common/extensions/my-extension.xpi
hg commit -m &quot;Bug nnn - Testing my extension&quot; testing/profiles/common/extensions/my-extension.xpi
</code></pre>
<p>Push to Try:</p>
<pre><code class="language-bash">./mach try -p linux64,macosx64,win64 -b do -u none -t all --artifact
</code></pre>
<p>This will run Mozilla CI tests on all platforms.</p>
<p>Note that you must have Level 1 commit access to use tryserver. If you are interested in interacting with Mozilla CI from Github (which only requires users to be in the Mozilla GitHub org), check out the <a href="https://github.com/biancadanforth/taskcluster-integration-poc/">Taskcluster Integration proof-of-concept</a>.</p>
<p>Also note that this requires an investment of time to set up, just as CircleCI or Travis-CI would, so it's not really appropriate for short-term projects. Use tryserver directly instead.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/client_guidelines.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="telemetry-events-best-practices"><a class="header" href="#telemetry-events-best-practices">Telemetry Events Best Practices</a></h1>
<h2 id="overview"><a class="header" href="#overview">Overview:</a></h2>
<p><a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/events.html">The Telemetry Events API</a> allows users to define and record events in the browser.</p>
<p>Events are defined in <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/events.html#the-yaml-definition-file"><code>Events.yaml</code></a> and each event creates records with the following properties:</p>
<ul>
<li>timestamp</li>
<li>category</li>
<li>method</li>
<li>object</li>
<li>value</li>
<li>extra</li>
</ul>
<p>With the following restrictions and features:</p>
<ul>
<li>The category, method, and object properties of any record produced by an event must have a value.</li>
<li>All combinations of values from the category, method, and object properties must be unique to that particular event (no other event can produce events with the same combination).</li>
<li>Events can be 'turned on' or 'turned off' by their category value, i.e. we can instruct the browser to &quot;stop sending us events from the <code>devtools</code> category.&quot;</li>
</ul>
<p>These records are then stored in <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/data/event-ping.html">event pings</a> and available in the <a href="https://docs.telemetry.mozilla.org/datasets/batch_view/events/reference.html">events dataset</a>.</p>
<h2 id="identifying-events"><a class="header" href="#identifying-events">Identifying Events</a></h2>
<p>One challenge with this data is that it can be difficult to identify all the records from a particular event.
Unlike Scalars and Histograms, which keep data in individual locations (like <code>scalar_parent_browser_engagement_total_uri_count</code> for <a href="https://searchfox.org/mozilla-central/rev/501eb4718d73870892d28f31a99b46f4783efaa0/toolkit/components/telemetry/Scalars.yaml#204"><code>total_uri_count</code></a>), all event records are stored together, regardless of which event generated them. The records themselves don't have a field identifying which event produced them[1].</p>
<p>Take, for example, the <a href="https://searchfox.org/mozilla-central/rev/501eb4718d73870892d28f31a99b46f4783efaa0/toolkit/components/telemetry/Events.yaml#151"><code>manage</code></a>
event in the <code>addonsManager</code> category.</p>
<pre><code>addonsManager: # category
manage: # event name
description: &gt;
...
objects: [&quot;extension&quot;, &quot;theme&quot;, &quot;locale&quot;, &quot;dictionary&quot;, &quot;other&quot;] # object values
methods: [&quot;disable&quot;, &quot;enable&quot;, &quot;sideload_prompt&quot;, &quot;uninstall&quot;] # method values
extra_keys: # extra values
...
notification_emails: ...
expiry_version: ...
record_in_processes: ...
bug_numbers: ...
release_channel_collection: ...
</code></pre>
<p>This event will produce records that look like:</p>
<div class="table-wrapper"><table><thead><tr><th>timestamp</th><th>category</th><th>method</th><th>object</th><th>value</th><th>extra</th></tr></thead><tbody>
<tr><td>...</td><td><code>addonsManager</code></td><td><code>disable</code></td><td><code>extension</code></td><td></td><td>...</td></tr>
<tr><td>...</td><td><code>addonsManager</code></td><td><code>enable</code></td><td><code>locale</code></td><td></td><td>...</td></tr>
<tr><td>...</td><td><code>addonsManager</code></td><td><code>sideload_prompt</code></td><td><code>other</code></td><td></td><td>...</td></tr>
</tbody></table>
</div>
<p>But none of these records will indicate that they were produced by the <code>manage</code> event. To find all records produced by <code>manage</code>, one would have to query all records where</p>
<pre><code>category = ...
AND method in [...,]
AND object in [...,]
</code></pre>
<p>which is not ideal.</p>
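<p>As a concrete illustration, a query for all <code>manage</code> records against the events dataset might look like the following sketch. The table name (<code>moz-fx-data-shared-prod.telemetry.events</code>) and the <code>event_category</code>/<code>event_method</code>/<code>event_object</code> column names are assumptions here; check the events dataset reference linked above for the exact schema.</p>
<pre><code class="language-sql">SELECT
  *
FROM
  `moz-fx-data-shared-prod.telemetry.events`
WHERE
  event_category = 'addonsManager'
  AND event_method IN ('disable', 'enable', 'sideload_prompt', 'uninstall')
  AND event_object IN ('extension', 'theme', 'locale', 'dictionary', 'other')
</code></pre>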
<p>Furthermore, if one encounters this data without knowledge of how the <code>manage</code> event works, they need to look up the event definition based on the category, method, and object values, and then query the data again to find all the related records. It's not immediately clear from the data if this record:</p>
<div class="table-wrapper"><table><thead><tr><th>timestamp</th><th>category</th><th>method</th><th>object</th><th>value</th><th>extra</th></tr></thead><tbody>
<tr><td>...</td><td><code>addonsManager</code></td><td><code>update</code></td><td><code>locale</code></td><td></td><td>...</td></tr>
</tbody></table>
</div>
<p>and this record:</p>
<div class="table-wrapper"><table><thead><tr><th>timestamp</th><th>category</th><th>method</th><th>object</th><th>value</th><th>extra</th></tr></thead><tbody>
<tr><td>...</td><td><code>addonsManager</code></td><td><code>install</code></td><td><code>extension</code></td><td></td><td>...</td></tr>
</tbody></table>
</div>
<p>are related or not.</p>
<p>Another factor that can add to confusion is that other events can share similar values for methods or objects (or even the same combination of method and object). For example:</p>
<div class="table-wrapper"><table><thead><tr><th>timestamp</th><th>category</th><th>method</th><th>object</th><th>value</th><th>extra</th></tr></thead><tbody>
<tr><td>...</td><td><code>normandy</code></td><td><code>update</code></td><td><code>preference_rollout</code></td><td></td><td>...</td></tr>
</tbody></table>
</div>
<p>which can further confuse users.</p>
<p>[1]: Events do have name fields, but they aren't included in the event records and thus are not present in the resulting dataset. Also, if a user defines an event in <code>Events.yaml</code> without specifying a list of acceptable methods, the method will default to the name of the event for records created by that event.</p>
<h4 id="suggested-convention"><a class="header" href="#suggested-convention">Suggested Convention:</a></h4>
<p>To simplify things in the future, we suggest adding the event name to the category field using dot notation when designing new events:</p>
<pre><code>&quot;category.event_name&quot;
</code></pre>
<p>For example:</p>
<ul>
<li><code>&quot;navigation.search&quot;</code></li>
<li><code>&quot;addonsManager.manage&quot;</code></li>
<li><code>&quot;frame.tab&quot;</code></li>
</ul>
<p>This provides 3 advantages:</p>
<ol>
<li>Records produced by this event will be easily identifiable. Also, the event which produced the record will be easier to locate in the code.</li>
<li>Events can be controlled more easily. The category field is what we use to &quot;turn on&quot; and &quot;turn off&quot; events, so a one-to-one mapping between categories and events lets us control events on an individual level.</li>
<li>With the category field acting as the event identifier, it is easier to pass events on to Amplitude and other platforms.</li>
</ol>
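<p>With this convention in place, identifying every record produced by an event collapses to a single equality filter on the category column. Continuing with the (assumed) table and column names from the sketch above:</p>
<pre><code class="language-sql">SELECT
  *
FROM
  `moz-fx-data-shared-prod.telemetry.events`
WHERE
  event_category = 'addonsManager.manage'
</code></pre>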
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/events_best_practices.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="sending-a-custom-ping"><a class="header" href="#sending-a-custom-ping">Sending a Custom Ping</a></h1>
<p>Got some new data you want to send to us? How in the world do you send a new ping? Follow this guide
to find out.</p>
<p><strong>Note</strong>: Most new data collection in Firefox via Telemetry or Glean does not require creating a new
ping document type. To add a histogram, scalar, or event collection to Firefox, please see the
documentation on <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/start/adding-a-new-probe.html">adding a new
probe</a>.</p>
<h2 id="write-your-questions"><a class="header" href="#write-your-questions">Write Your Questions</a></h2>
<p>Do not try to implement new pings unless you know specifically what questions you're trying to
answer. General questions about &quot;How do users use our product?&quot; won't cut it - these need to be
specific, concrete asks that can be translated to data points. This will also make it easier down
the line as you start data review.</p>
<p>More detail on how to design and implement new pings for Firefox Desktop <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/custom-pings.html">can be found
here</a>.</p>
<h2 id="choose-a-namespace-and-doctype"><a class="header" href="#choose-a-namespace-and-doctype">Choose a Namespace and DocType</a></h2>
<p>Choose a namespace that uniquely identifies the product that will be generating the data. The
<code>telemetry</code> namespace is reserved for pings added by the Firefox Desktop Telemetry team.</p>
<p>The DocType is used to differentiate pings within a namespace. It can be as simple as <code>event</code>, but
should generally be descriptive of the data being collected.</p>
<p>Both namespace and DocType are limited to the pattern <code>[a-z-]</code>. In other words, hyphens and
lowercase letters from the <a href="https://en.wikipedia.org/wiki/ISO_basic_Latin_alphabet">ISO basic Latin alphabet</a>.</p>
<h2 id="create-a-schema"><a class="header" href="#create-a-schema">Create a Schema</a></h2>
<p>Write a JSON Schema. See the <a href="https://github.com/mozilla-services/mozilla-pipeline-schemas#adding-a-new-schema">&quot;Adding a new schema&quot;
documentation</a> and
example schemas in the <a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/">Mozilla Pipeline Schemas
repo</a>. This schema is used to
validate the incoming data; any ping that doesn't match the schema will be removed. This schema will
also be transformed into a BigQuery table schema via the <a href="https://github.com/mozilla/mozilla-schema-generator">Mozilla Schema
Generator</a>. Note that parquet schemas are no
longer necessary because of the generated schemas. Validate your JSON Schema using a <a href="https://jsonschemalint.com/#/version/draft-04/markup/json">validation
tool</a>.</p>
<h2 id="start-a-data-review"><a class="header" href="#start-a-data-review">Start a Data Review</a></h2>
<p>Data review for new pings is often more complicated than adding new probes. See <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1347266">Data Review for
Focus-Event Ping</a> as an example.
Consider where the data falls under the
<a href="https://wiki.mozilla.org/Firefox/Data_Collection">Data Collection Categories</a>.</p>
<h2 id="submit-schema-to-mozilla-servicesmozilla-pipeline-schemas"><a class="header" href="#submit-schema-to-mozilla-servicesmozilla-pipeline-schemas">Submit Schema to <code>mozilla-services/mozilla-pipeline-schemas</code></a></h2>
<p>Create a pull request including both a template and rendered schema to <code>mozilla-pipeline-schemas</code>.
Add at least one validation ping that exercises the structure of the schema as a test.
These pings are validated during the build and help catch mistakes during the writing process.</p>
<h3 id="example-a-rendered-schema-for-response-times"><a class="header" href="#example-a-rendered-schema-for-response-times">Example: A rendered schema for response times</a></h3>
<p>Imagine we want to collect a set of response measurements in milliseconds on a per-client basis.
The pings take on the following shape:</p>
<pre><code class="language-json">{&quot;id&quot;: &quot;08317b11-85f7-4688-9b35-48af10c3ccdf&quot;, &quot;clientId&quot;: &quot;1d5ce2fc-a554-42f0-ab21-2ad8ada9bb88&quot;, &quot;payload&quot;: {&quot;response_ms&quot;: 324}}
{&quot;id&quot;: &quot;a97108ac-483b-40be-9c64-3419326f5113&quot;, &quot;clientId&quot;: &quot;3f1b2e1c-c241-464f-aa46-576f5795e488&quot;, &quot;payload&quot;: {&quot;response_ms&quot;: 221}}
{&quot;id&quot;: &quot;b8a7e3f9-38c0-4a13-b42a-c969feb454f6&quot;, &quot;clientId&quot;: &quot;14f27409-5f6f-46e0-9f9d-da5cd716ee42&quot;, &quot;payload&quot;: {&quot;response_ms&quot;: 549}}
</code></pre>
<p>This document can be described in the following way:</p>
<pre><code class="language-json">{
&quot;$schema&quot;: &quot;http://json-schema.org/draft-04/schema#&quot;,
&quot;type&quot;: &quot;object&quot;,
&quot;properties&quot;: {
&quot;id&quot;: {
&quot;type&quot;: &quot;string&quot;,
&quot;description&quot;: &quot;The document identifier&quot;
},
&quot;clientId&quot;: {
&quot;type&quot;: &quot;string&quot;,
&quot;description&quot;: &quot;The client identifier&quot;
},
&quot;payload&quot;: {
&quot;type&quot;: &quot;object&quot;,
&quot;properties&quot;: {
&quot;response_ms&quot;: {
&quot;type&quot;: &quot;integer&quot;,
&quot;minimum&quot;: 0,
&quot;description&quot;: &quot;Response time of the client, in milliseconds&quot;
}
}
}
}
}
</code></pre>
<p>Fields like <code>id</code> and <code>clientId</code> have template components as part of the build-system. These would be
included as <code>@TELEMETRY_ID_1_JSON@</code> and <code>@TELEMETRY_CLIENTID_1_JSON@</code> respectively. The best way to
become familiar with template schemas is to browse the repository; the
<a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/blob/master/templates/telemetry/main/main.4.schema.json"><code>telemetry/main/main.4.schema.json</code>
document</a>
is a good starting place.</p>
<p>As part of the automated deployment process, the JSON schemas are translated into a table schema
used by BigQuery. These schemas closely reflect the schemas used for data validation.</p>
<pre><code class="language-json">[
{
&quot;mode&quot;: &quot;NULLABLE&quot;,
&quot;name&quot;: &quot;clientId&quot;,
&quot;type&quot;: &quot;STRING&quot;
},
{
&quot;mode&quot;: &quot;NULLABLE&quot;,
&quot;name&quot;: &quot;id&quot;,
&quot;type&quot;: &quot;STRING&quot;
},
{
&quot;fields&quot;: [
{
&quot;mode&quot;: &quot;NULLABLE&quot;,
&quot;name&quot;: &quot;response_ms&quot;,
&quot;type&quot;: &quot;INT64&quot;
}
],
&quot;mode&quot;: &quot;NULLABLE&quot;,
&quot;name&quot;: &quot;payload&quot;,
&quot;type&quot;: &quot;RECORD&quot;
}
]
</code></pre>
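<p>Once the schemas are deployed, the resulting table can be queried like any other ping table. The query below is a sketch only, assuming a hypothetical namespace of <code>my-namespace</code> and DocType of <code>response-times</code>, which would produce a stable table named <code>my_namespace_stable.response_times_v1</code>:</p>
<pre><code class="language-sql">SELECT
  COUNT(*) AS n_pings,
  AVG(payload.response_ms) AS mean_response_ms
FROM
  `moz-fx-data-shared-prod.my_namespace_stable.response_times_v1`
WHERE
  DATE(submission_timestamp) = '2019-12-01'
</code></pre>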
<h3 id="ingestion-metadata"><a class="header" href="#ingestion-metadata">Ingestion Metadata</a></h3>
<p>The generated schemas contain metadata added to the schema before deployment to the ingestion
service. These are fields added to the ping at ingestion time; they might come from the URL
submitted to the edge server, or the IP Address used to make the request. <a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/blob/master/schemas/metadata/telemetry-ingestion/telemetry-ingestion.1.schema.json">This
document</a>
lists available metadata fields for the telemetry-ingestion pings, which are largely shared across
all namespaces.</p>
<p>A list of metadata fields is included here for reference, but refer to the above document or the
schema explorer for an up-to-date list of metadata fields.</p>
<!-- table generated via `scripts/new_ping_metadata_table.py > src/cookbooks/new_ping_metadata_table.md` -->
<div class="table-wrapper"><table><thead><tr><th>field</th><th>description</th></tr></thead><tbody>
<tr><td><code>additional_properties</code></td><td>A JSON string containing any payload properties not present in the schema</td></tr>
<tr><td><code>document_id</code></td><td>The document ID specified in the URI when the client sent this message</td></tr>
<tr><td><code>normalized_app_name</code></td><td>Set to &quot;Other&quot; if this message contained an unrecognized app name</td></tr>
<tr><td><code>normalized_channel</code></td><td>Set to &quot;Other&quot; if this message contained an unrecognized channel name</td></tr>
<tr><td><code>normalized_country_code</code></td><td>An ISO 3166-1 alpha-2 country code</td></tr>
<tr><td><code>normalized_os</code></td><td>Set to &quot;Other&quot; if this message contained an unrecognized OS name</td></tr>
<tr><td><code>normalized_os_version</code></td><td>N/A</td></tr>
<tr><td><code>sample_id</code></td><td>Hashed version of client_id (if present) useful for partitioning; ranges from 0 to 99</td></tr>
<tr><td><code>submission_timestamp</code></td><td>Time when the ingestion edge server accepted this message</td></tr>
<tr><td><code>metadata.user_agent.browser</code></td><td>N/A</td></tr>
<tr><td><code>metadata.user_agent.os</code></td><td>N/A</td></tr>
<tr><td><code>metadata.user_agent.version</code></td><td>N/A</td></tr>
<tr><td><code>metadata.uri.app_build_id</code></td><td>N/A</td></tr>
<tr><td><code>metadata.uri.app_name</code></td><td>N/A</td></tr>
<tr><td><code>metadata.uri.app_update_channel</code></td><td>N/A</td></tr>
<tr><td><code>metadata.uri.app_version</code></td><td>N/A</td></tr>
<tr><td><code>metadata.header.date</code></td><td>Date HTTP header</td></tr>
<tr><td><code>metadata.header.dnt</code></td><td>DNT (Do Not Track) HTTP header</td></tr>
<tr><td><code>metadata.header.x_debug_id</code></td><td>X-Debug-Id HTTP header</td></tr>
<tr><td><code>metadata.header.x_pingsender_version</code></td><td>X-PingSender-Version HTTP header</td></tr>
<tr><td><code>metadata.geo.city</code></td><td>City name</td></tr>
<tr><td><code>metadata.geo.country</code></td><td>An ISO 3166-1 alpha-2 country code</td></tr>
<tr><td><code>metadata.geo.db_version</code></td><td>The specific <a href="https://dev.maxmind.com/geoip/geoip2/geoip2-city-country-csv-databases/">Geo database</a> version used for this lookup</td></tr>
<tr><td><code>metadata.geo.subdivision1</code></td><td>First major country subdivision, typically a state, province, or county</td></tr>
<tr><td><code>metadata.geo.subdivision2</code></td><td>Second major country subdivision; not applicable for most countries</td></tr>
<tr><td><code>metadata.isp.db_version</code></td><td>The specific <a href="https://dev.maxmind.com/geoip/geoip2/geoip2-isp-csv-database/">ISP database</a> version used for this lookup</td></tr>
<tr><td><code>metadata.isp.name</code></td><td>The name of the Internet Service Provider</td></tr>
<tr><td><code>metadata.isp.organization</code></td><td>The name of a specific business entity when available; otherwise the ISP name</td></tr>
</tbody></table>
</div>
<h3 id="testing-the-schema"><a class="header" href="#testing-the-schema">Testing The Schema</a></h3>
<p>For new data, use the <a href="https://github.com/mozilla-services/edge-validator">edge validator</a> to test
your schema.</p>
<h2 id="deployment"><a class="header" href="#deployment">Deployment</a></h2>
<p>Schemas are automatically deployed once a day around 00:00 UTC, scheduled after the probe scraper in
the following <a href="https://github.com/mozilla/telemetry-airflow/blob/master/dags/probe_scraper.py">Airflow
DAG</a>. The latest
schemas can be viewed at
<a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/tree/generated-schemas"><code>mozilla-pipeline-schemas/generated-schemas</code></a>.</p>
<h2 id="start-sending-data"><a class="header" href="#start-sending-data">Start Sending Data</a></h2>
<p>Use the built-in Telemetry APIs when possible. A few examples are the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/custom-pings.html">Gecko Telemetry
APIs</a>,
or the <a href="https://github.com/mozilla-mobile/telemetry-ios">iOS Telemetry APIs</a>.</p>
<p><strong>Users on Android should use <a href="cookbooks/../concepts/glean/glean.html">Glean</a></strong>, which does not require building out custom pings.</p>
<p>For all other use-cases, send documents to the ingestion endpoint:</p>
<pre><code class="language-text">https://incoming.telemetry.mozilla.org
</code></pre>
<p>See <a href="cookbooks/../concepts/pipeline/http_edge_spec.html">the HTTP edge server specification</a> for documentation
about the expected format.</p>
<h2 id="access-your-data"><a class="header" href="#access-your-data">Access Your Data</a></h2>
<p>First confirm with the reviewers of <a href="cookbooks/new_ping.html#submit-schema-to-mozilla-servicesmozilla-pipeline-schemas">your schema pull
request</a> that your schemas have been
deployed. You may also check the diff of the latest commit to <a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/tree/generated-schemas"><code>mozilla-pipeline-schemas/generated-schemas</code></a>.</p>
<p>In the following links, replace <code>&lt;namespace&gt;</code>, <code>&lt;doctype&gt;</code>, and <code>&lt;docversion&gt;</code> with appropriate
values. Also replace <code>-</code> with <code>_</code> in <code>&lt;namespace&gt;</code> if your namespace contains <code>-</code> characters.</p>
<h3 id="stmo--bigquery"><a class="header" href="#stmo--bigquery">STMO / BigQuery</a></h3>
<p>In the <code>Telemetry (BigQuery)</code> data source, several new tables will be created for your data.</p>
<p>The first table is the <code>live</code> table found under
<code>moz-fx-data-shared-prod.&lt;namespace&gt;_live.&lt;doctype&gt;_v&lt;docversion&gt;</code>. This table is updated on a 5
minute interval, partitioned on <code>submission_timestamp</code>, and may contain partial days of data.</p>
<pre><code class="language-sql">SELECT
count(*) AS n_rows
FROM
`moz-fx-data-shared-prod.telemetry_live.main_v4`
WHERE
submission_timestamp &gt; TIMESTAMP_SUB(current_timestamp, INTERVAL 30 minute)
</code></pre>
<p>The second is the <code>stable</code> clustered table, found under
<code>moz-fx-data-shared-prod.&lt;namespace&gt;_stable.&lt;doctype&gt;_v&lt;docversion&gt;</code>, along with a corresponding
user-facing view at <code>moz-fx-data-shared-prod.&lt;namespace&gt;.&lt;doctype&gt;</code>. The stable table and its view only
contain complete days of submissions. The data is clustered by <code>normalized_channel</code> and <code>sample_id</code> to improve the
efficiency of queries.</p>
<pre><code class="language-sql">SELECT
COUNT(DISTINCT client_id)*100 AS dau
FROM
`moz-fx-data-shared-prod.telemetry.main`
WHERE
submission_timestamp &gt; TIMESTAMP_SUB(current_timestamp, INTERVAL 1 day)
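  -- sample_id ranges from 0 to 99, so a single value is a ~1% sample; the count is scaled up by 100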
AND sample_id = 1
</code></pre>
<p>This table may take up to a day to appear in the BigQuery source; if you still don't see a table for
your new ping after 24 hours, <a href="https://mana.mozilla.org/wiki/display/SVCOPS/Contacting+Data+Operations">contact Data
Operations</a> so that they
can investigate. Once the table is available, it should contain all the pings sent during that first
day, regardless of how long it takes for the table to appear.</p>
<h3 id="spark-1"><a class="header" href="#spark-1">Spark</a></h3>
<p>Refer to the <a href="cookbooks/../cookbooks/bigquery/access.html#from-spark">Spark notes</a> for details on accessing the data
via Spark.</p>
<h2 id="build-dashboards-using-spark-or-stmo"><a class="header" href="#build-dashboards-using-spark-or-stmo">Build Dashboards Using Spark or STMO</a></h2>
<p>Last steps! What are you using this data for anyway?</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/cookbooks/new_ping.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h3 id="data-platform-reference-1"><a class="header" href="#data-platform-reference-1">Data Platform Reference</a></h3>
<p>This section contains detailed reference material on the Mozilla data platform, including links to other resources where appropriate.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/reference/index.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="data-stack-overview"><a class="header" href="#data-stack-overview">Data Stack Overview</a></h1>
<p>This is a quick overview of the tooling and components in our data stack:</p>
<table>
<tbody>
<tr>
<th></th>
<th>Data Platform</th>
</tr>
<tr>
<td>Collection</td>
<td>
<ul>
<li><a href="https://github.com/mozilla/gcp-ingestion">gcp-ingestion</a> - Mozilla's telemetry ingestion system deployed to Google Cloud Platform (GCP)
<ul>
<li><a href="https://docs.telemetry.mozilla.org/concepts/pipeline/gcp_data_pipeline.html">Architecture Overview</a></li>
</ul>
</li>
<li>Data Sources:
<ul>
<li><a href="https://docs.telemetry.mozilla.org/concepts/glean/glean.html">Glean</a> apps (including apps using glean.js)</li>
<li>Firefox legacy telemetry clients (Firefox desktop)</li>
<li><a href="https://docs.telemetry.mozilla.org/concepts/external_data_integration_using_fivetran.html">Fivetran</a></li>
<li>Custom integrations</li>
<li>Server side data</li>
</ul>
</li>
</ul>
</td>
</tr>
<tr>
<td>Data Warehouse</td>
<td>BigQuery</td>
</tr>
<tr>
<td>ETL</td>
<td>
<ul>
<li><a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a>
<ul>
<li>Internally developed tooling to create derived datasets</li>
</ul>
</li>
<li><a href="https://docs.telemetry.mozilla.org/concepts/external_data_integration_using_fivetran.html">Fivetran</a>
<ul>
<li><a href="https://github.com/mozilla/fivetran-connectors">Custom developed connectors</a></li>
<li>Use of <a href="https://www.fivetran.com/connectors">default connectors</a></li>
</ul>
</li>
</ul>
</td>
</tr>
<tr>
<td>Orchestration</td>
<td>
<ul>
<li><a href="https://github.com/mozilla/telemetry-airflow">Airflow</a>
<ul>
<li>Uses <a href="https://github.com/mozilla/bigquery-etl/tree/main/dags">DAGs generated via bigquery-etl</a></li>
<li>Manually defined DAGs in <a href="https://github.com/mozilla/telemetry-airflow">telemetry-airflow</a></li>
</ul>
</li>
</ul>
</td>
</tr>
<tr>
<td>Observability</td>
<td>
Custom tooling for data validation/data checks as part of bigquery-etl.
</td>
</tr>
<tr>
<td>Analysis and Business Intelligence</td>
<td>
<ul>
<li><a href="https://docs.telemetry.mozilla.org/introduction/tools.html#looker">Looker</a>
<ul>
<li>For most reporting, summaries, and ad-hoc data exploration by people who are not full-time data practitioners</li>
</ul>
</li>
<li><a href="https://docs.telemetry.mozilla.org/introduction/tools.html#sqltelemetrymozillaorg-stmo">Redash</a>
<ul>
<li>For running ad-hoc SQL queries</li>
</ul>
</li>
</ul>
</td>
</tr>
<tr>
<td>Reverse ETL</td>
<td>None; we don't send a lot of data out, but when we do, it's been with custom integrations, using APIs, etc.</td>
</tr>
<tr>
<td>Experimentation</td>
<td><a href="https://experimenter.info/">Nimbus/Experimenter</a></td>
</tr>
<tr>
<td>Governance</td>
<td>
<ul>
<li>Firefox Data Governance: Complex and designed for a specific use case that may not be generally applicable. Currently being revisited.</li>
<li>Access Control: Support for relatively fine-grained access control at the BigQuery and Looker level, access management and approval process exists.</li>
<li>Transparency and Docs: Automated inventory+docs: <a href="https://dictionary.telemetry.mozilla.org/">Glean Dictionary</a></li>
</ul>
</td>
</tr>
<tr>
<td>Data Catalog</td>
<td>
<a href="https://mozilla.acryl.io/">Acryl</a> - for data lineage and finding data sets
</td>
</tr>
</tbody>
</table>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/tools/data_stack_overview.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="guiding-principles-for-data-infrastructure"><a class="header" href="#guiding-principles-for-data-infrastructure">Guiding Principles for Data Infrastructure</a></h1>
<p>So you want to build a data lake... Where do you start? What building blocks are
available? How can you integrate your data with the rest of the organization?</p>
<p>This document is intended for a few different audiences. Data consumers within
Mozilla will gain a better understanding of the data they interact with by
learning how the Firefox telemetry pipeline functions. Mozilla teams outside of
Firefox will get some concrete guidance about how to provision and lay out data
in a way that will let them integrate with the rest of Mozilla. Technical
audiences outside Mozilla will learn some general principles and come away with
links to concrete examples of code implementing those principles.</p>
<p>Considering that Mozilla has chosen GCP as its major cloud provider, BigQuery
stands out as the clear integration point for data at rest among GCP's portfolio
of products. BigQuery has proven to be a best-in-class data warehouse with
impressive performance and a familiar SQL interface. Beyond that, it provides
many conveniences that become important when scaling across an organization such
as automated retention policies and well-defined access controls that can be
provisioned across projects to allow different teams to have control over their
own data.</p>
<p>Data can be loaded into BigQuery or presented as external tables through a
growing list of Google-provided integrations (objects in GCS, logs in
Stackdriver, etc.). Users within Mozilla can also take advantage of
purpose-built infrastructure that other teams within the company have used for
loading data to BigQuery. The major example is our telemetry ingestion system
which accepts payloads from Firefox clients across the world but also provides a
generic interface for defining custom schemas and accepting payloads from any
system capable of making an HTTP request. We also have tooling available for
transforming data (ETL) once it's in BigQuery.</p>
<p>Once data is accessible through BigQuery, users within Mozilla also get the
benefit of leveraging common tools for data access. Beyond the Google-provided
BigQuery console, Mozilla provides access to instances of Redash,
Looker, and other tools either with connections to BigQuery already available
or with concrete instructions for provisioning connections.</p>
<p>Some near real-time use cases can be handled via BigQuery as well, with BigQuery
supporting dozens of batch loads per table per hour and even streaming inserts.
For true latency-sensitive applications, however, we pass data via Cloud
Pub/Sub, GCP's hosted messaging backend. Pub/Sub integrates very closely with
Cloud Dataflow to provide auto-scaling pipelines with relatively little setup
needed. We can also easily provision topics for subsets of data flowing through
the telemetry pipeline as input for custom streaming applications.</p>
<p>To avoid getting too abstract, we'll next dive into a case study of what it
looked like for a team within Mozilla to migrate from a custom pipeline to the
main GCP-based pipeline that supports telemetry data. From there, we'll discuss
specific best practice recommendations for use of each of the major GCP services
in use at Mozilla.</p>
<h2 id="integrating-with-the-data-pipeline-a-case-study"><a class="header" href="#integrating-with-the-data-pipeline-a-case-study">Integrating with the Data Pipeline: A Case Study</a></h2>
<p>Mozilla's core data platform has been built to support <em>structured ingestion</em> of
arbitrary JSON payloads whether they come from browser products on client
devices or from server-side applications that have nothing to do with Firefox;
any team at Mozilla can hook into structured ingestion by defining a schema and
registering it with the pipeline. Once a schema is registered, everything else is
automatically provisioned, from an HTTPS endpoint for accepting payloads to a
set of tables in BigQuery for holding the processed data.</p>
<p>Over the course of 2019, the Activity Stream team migrated analytics for Firefox
Desktop's New Tab page from a custom service to the core data platform. The old
system already relied on sending JSON data over HTTP, so the team wanted to
minimize client-side development effort by maintaining the existing payload
structure. They registered the structure of these payloads by sending pull
requests to our schema repository with relevant <a href="https://json-schema.org/">JSON
Schema</a> definitions. As an example,
<a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/pull/228"><code>mozilla-pipeline-schemas#228</code></a>
adds a new document namespace <code>activity-stream</code> and under that a document type
<code>impression-stats</code> with version specified as <code>1</code>. These changes are picked up by
an automated job that translates them into relevant BigQuery schemas and
provisions tables for each unique schema (see
<a href="tools/guiding_principles.html#defining-tables">Defining Tables</a> below).</p>
<p>With the schema now registered with the pipeline, clients can send payloads to
an endpoint corresponding to the new namespace, document type, and version:</p>
<pre><code>https://incoming.telemetry.mozilla.org/submit/activity-stream/impression-stats/1/&lt;document_id&gt;
</code></pre>
<p>where <code>&lt;document_id&gt;</code> should be a UUID that uniquely identifies the payload;
<code>document_id</code> is used within the pipeline for deduplication of repeated
documents. The payload is processed by a small <a href="https://github.com/mozilla/gcp-ingestion/tree/main/ingestion-edge">edge service</a>
that returns a 200
response to the client and publishes the message to a <em>raw</em> Pub/Sub topic. A
<a href="https://github.com/mozilla/gcp-ingestion/tree/main/ingestion-beam"><em>decoder</em> Dataflow</a>
job reads from this topic with low latency, validates that
the payload matches the schema registered for the endpoint, does some additional
metadata processing, and then emits the message back to Pub/Sub in a <em>decoded</em>
topic. A final job reads the <em>decoded</em> topic, batches together records
destined for the same table, and loads the records into the relevant <em>live ping
table</em> in BigQuery (<code>activity_stream_live.impression_stats_v1</code> in this case). A
nightly job reads all records for the previous day from the live ping table,
deduplicates the records based on <code>document_id</code> values, and loads the final
deduplicated day to the relevant <em>historical ping table</em>
(<code>activity_stream_stable.impression_stats_v1</code>). The results are automatically
presented to users through a view (<code>activity_stream.impression_stats</code>).</p>
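<p>Most consumers only ever touch that final view. A minimal sketch of such a query follows; the project prefix and the date filter are illustrative, and the available payload fields depend on the registered schema:</p>
<pre><code class="language-sql">SELECT
  COUNT(*) AS n_messages
FROM
  `moz-fx-data-shared-prod.activity_stream.impression_stats`
WHERE
  DATE(submission_timestamp) = '2019-12-01'
</code></pre>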
<p>While most analysis use cases for this data are served via queries on the
user-facing BigQuery view, the Pocket team also needed to build an application
with access to <code>activity-stream</code> messages in real-time. To serve that need,
Pocket provisioned a Pub/Sub topic in a separate GCP project and worked with
Data Operations to provide write access to a relevant service account within the
telemetry pipeline. The pipeline is now configured to republish all messages
associated with the <code>activity-stream</code> namespace to Pocket's topic, and this has
been able to serve their real-time needs.</p>
<h2 id="glean-1"><a class="header" href="#glean-1">Glean</a></h2>
<p>While the Activity Stream case study above serves as an encouraging example of
the flexibility of the pipeline to accept custom payloads, we hope to insulate
most data producers from having to interact directly with HTTP
requests and JSON Schema definitions at all. The state of the art for analytics
at Mozilla is
<a href="tools/../concepts/glean/glean.html">Glean</a>, a set of
projects that reimagines the end-to-end experience of reporting and consuming
analytics data.</p>
<p>Glean sits on top of structured ingestion, but provides helpful
abstractions — instead of building JSON payloads and making HTTP requests, your
application declares logical metrics and makes calls to a generated SDK
idiomatic to your application's language. Support exists not only for a
wide range of language SDKs but also for a variety of prebuilt reporting tools
that understand Glean schemas such that your application's metrics are
automatically processed and presented.</p>
<p>All new use cases for producing analytics payloads within Mozilla should
consider Glean first. If a mature Glean SDK is available for your project's
language, building on top of Glean promises maintainable reporting code for your
application and data that can be more richly understood by the full ecosystem of
analytics tools at Mozilla.</p>
<h2 id="structured-ingestion"><a class="header" href="#structured-ingestion">Structured Ingestion</a></h2>
<p>We discussed the overall shape of Mozilla's structured ingestion system and how
to integrate with it in the case study earlier in this article, so this section
will be brief.</p>
<p>When you choose to build on top of structured ingestion, whether
using the Glean SDK or by registering custom named schemas, consider the
following concerns which are automatically handled for you:</p>
<ul>
<li>Validation of payloads against a JSON schema; messages failing validation are
routed to an errors table in BigQuery where they can be monitored and
backfilled if necessary.</li>
<li>Geo lookup using a GeoIP database; geo-city information is
presented as metadata, allowing the pipeline to discard source IP addresses to
protect user privacy.</li>
<li>User agent parsing; major user agent features are extracted and presented as
metadata, allowing the pipeline to discard the raw user agent string to
protect user privacy.</li>
<li>Extraction of client-level identifiers as metadata to use for generating a
<a href="https://docs.telemetry.mozilla.org/concepts/sample_id.html"><code>sample_id</code></a> field
and to support automated deletion of data upon user request.</li>
<li>Deduplication of messages; we provide best-effort deduplication for output
Pub/Sub topics and full deduplication within each UTC day in the historical
ping tables in BigQuery.</li>
</ul>
<p>If you have doubts about whether structured ingestion is appropriate for your
use case, <a href="https://docs.telemetry.mozilla.org/concepts/getting_help.html">please reach out to the Data Platform
team</a> and we can
consult on current and planned features for the pipeline.</p>
<h2 id="bigquery-1"><a class="header" href="#bigquery-1">BigQuery</a></h2>
<p><a href="https://cloud.google.com/bigquery/docs/">BigQuery</a> is the standard choice within
Mozilla's environment for storing structured data for non-real time analysis. It
is especially well suited to large and diverse organizations because of its access
controls and full separation between storage and compute infrastructure. Different
teams within Mozilla can provision BigQuery tables in separate GCP projects,
retaining full control over how they ingest data and how they grant access to
other teams. Once access is granted, though, it becomes trivial to write queries
that join data across projects.</p>
<h3 id="defining-tables"><a class="header" href="#defining-tables">Defining tables</a></h3>
<p>BigQuery tables can express complex nested structures via compound <code>STRUCT</code>
types and <code>REPEATED</code> fields. It's possible to model arbitrary JSON payloads as
BigQuery tables, but there are
<a href="https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-json#limitations">limitations to JSON modeling</a>
that are well-described in BigQuery's documentation.</p>
<p>We have developed tooling for translating JSON schemas into BigQuery table
schemas along with some conversion code to transform payloads to match the final
structure needed in BigQuery. One major example is map types; when the number of
possible keys is finite, they can be baked into the schema to present the map as
a BigQuery <code>STRUCT</code> type, but free-form maps have to be modeled in BigQuery as a
repeated <code>STRUCT</code> of keys and values. This is one case where we have chosen to
follow the same conventions that BigQuery itself uses for <a href="https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro#complex_types">converting complex
Avro types to BigQuery
fields</a>,
which requires modifying the JSON payload to convert</p>
<pre><code class="language-json">{
&quot;key1&quot;: &quot;value1&quot;,
&quot;key2&quot;: &quot;value2&quot;
}
</code></pre>
<p>into</p>
<pre><code class="language-json">[
{
&quot;key&quot;: &quot;key1&quot;,
&quot;value&quot;: &quot;value1&quot;
},
{
&quot;key&quot;: &quot;key2&quot;,
&quot;value&quot;: &quot;value2&quot;
}
]
</code></pre>
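<p>A practical consequence of this modeling is that individual keys are retrieved by unnesting the repeated field at query time. A hypothetical example, assuming a table <code>my_table</code> with such a map field named <code>my_map</code>:</p>
<pre><code class="language-sql">SELECT
  kv.value
FROM
  my_table
CROSS JOIN
  UNNEST(my_map) AS kv
WHERE
  kv.key = 'key1'
</code></pre>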
<p>For more detail on how the data pipeline prepares schemas and translates
payloads, see the
<a href="https://github.com/mozilla/jsonschema-transpiler"><code>jsonschema-transpiler</code></a>
project which is used by
<a href="https://github.com/mozilla/mozilla-schema-generator/"><code>mozilla-schema-generator</code></a>.</p>
<h3 id="how-to-get-data-into-bigquery"><a class="header" href="#how-to-get-data-into-bigquery">How to get data into BigQuery</a></h3>
<p>Google provides a variety of methods for loading data into BigQuery as discussed
in their <a href="https://cloud.google.com/bigquery/docs/loading-data">Introduction to Loading
Data</a>. The traditional path
for loading data is a custom application that uses a Google Cloud SDK to stage
objects in GCS and then issue BigQuery load jobs, but there is also a growing
list of more fully managed integrations for loading data. It is also possible to
present views into data stored in other Google services without loading via
external tables.</p>
<p>If you already have well-structured data being produced to Stackdriver or GCS,
it may be minimal effort to set up BigQuery Transfer Service to import that data
or even to modify your existing server application to additionally issue
BigQuery load jobs. For relational data in Cloud SQL instances, we can pull
data into BigQuery via <a href="https://cloud.google.com/bigquery/docs/federated-queries-intro">federated queries</a>.</p>
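<p>A federated query runs SQL against the Cloud SQL instance from within BigQuery via a connection resource. A minimal sketch, assuming a connection named <code>my-project.us.my-cloudsql-connection</code> has already been provisioned; the connection ID and the inner query are placeholders:</p>
<pre><code class="language-sql">SELECT
  *
FROM
  EXTERNAL_QUERY(
    'my-project.us.my-cloudsql-connection',
    'SELECT id, name, created_at FROM accounts;'
  )
</code></pre>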
<p>And don't forget about the possibility of hooking into the core telemetry
pipeline through <em>structured ingestion</em> as discussed earlier.</p>
<p>If you have a more complex processing need that doesn't fit into an existing
server application, you may want to consider building your application as a
Dataflow pipeline (discussed further down in this document). Dataflow provides a
unified model for batch and streaming processing and includes a variety of
high-level I/O modules for reading from and writing to Google services such as
BigQuery.</p>
<p>For getting data from third-party services into BigQuery, consider using <a href="https://docs.telemetry.mozilla.org/concepts/external_data_integration_using_fivetran.html">Fivetran</a>.
Fivetran offers some pre-defined connectors to import data from external APIs,
but also allows for custom connectors to be created.</p>
<h3 id="time-based-partitioning-and-data-retention-in-bigquery"><a class="header" href="#time-based-partitioning-and-data-retention-in-bigquery">Time-based partitioning and data retention in BigQuery</a></h3>
<p>BigQuery provides built-in support for <a href="https://cloud.google.com/bigquery/docs/best-practices-storage">rolling time-based retention at the
dataset and table
level</a>. For the
telemetry pipeline, we have chosen to partition nearly all of our tables based
on the date we receive the payloads at our edge servers. Most tables will
contain a field named <code>submission_timestamp</code> or <code>submission_date</code> that
BigQuery automatically uses to control the assignment of rows to partitions as
they are loaded.</p>
<p>Full day partitions are the fundamental unit we use for all backfill and ETL
activities and BigQuery provides convenient support for operating on discrete
partitions. In particular, BigQuery jobs can be configured to specify an
individual partition as the destination for output (using a partition decorator
that looks like <code>telemetry_stable.main_v4$20191201</code>), allowing processing to be
idempotent.</p>
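<p>On the read side, a full-day partition is selected with a simple filter on the partitioning column, which lets BigQuery scan only that day's data. An illustrative example against one of the tables mentioned above:</p>
<pre><code class="language-sql">SELECT
  COUNT(*) AS n_rows
FROM
  `moz-fx-data-shared-prod.telemetry_stable.main_v4`
WHERE
  DATE(submission_timestamp) = '2019-12-01'
</code></pre>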
<p>Partitions can also be used as the unit for data retention. For the telemetry
pipeline, we have long retention periods only for the <em>historical ping tables</em>
(e.g. <code>telemetry_stable.main_v4</code>) and downstream derived tables (e.g.
<code>telemetry_derived.clients_daily_v6</code>). Storing intermediate data for long periods
can be expensive and expose risk, so all of the intermediate tables including the
<em>live ping tables</em> (e.g. <code>telemetry_live.main_v4</code>) have partition-based expiration
such that partitions older than 30 days are automatically cleaned up. This
policy balances cost efficiency with the need for a window where we can recover
from errors in the pipeline.</p>
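<p>For reference, partition expiration of this kind is a standard BigQuery table option. The DDL below is a hedged sketch of an equivalent setting; the pipeline may configure this elsewhere, for example at dataset- or infrastructure-provisioning time, so treat it as illustrative rather than how it is actually set:</p>
<pre><code class="language-sql">ALTER TABLE
  `moz-fx-data-shared-prod.telemetry_live.main_v4`
SET OPTIONS (
  partition_expiration_days = 30
)
</code></pre>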
<p>The telemetry pipeline is building support for accepting <code>deletion-request</code> pings
from users and purging rows associated with those users via scheduled jobs. Such
a mechanism can be helpful in addressing policy and business requirements, so
the same considerations should be applied to custom applications storing
messages that contain user identifiers.</p>
<h3 id="access-controls-in-bigquery"><a class="header" href="#access-controls-in-bigquery">Access controls in BigQuery</a></h3>
<p>BigQuery's main access controls can be specified at the dataset or individual
table level.</p>
<p>You can also publish SQL views which are essentially prebuilt queries that are
presented alongside tables in BigQuery. View logic is executed at query time, so
views take up no space and users are subject to the same access controls when
querying a view as they would be querying the underlying tables themselves; a
query will fail if the user does not have read access to all of the datasets
accessed in the view.</p>
<p>Views, however, can also be <em>authorized</em> so that specific groups of users who would not
normally be allowed to read the underlying tables can still run queries against them.
This allows view authors to provide finer-grained controls and to hide specific
columns or rows.
Note that BigQuery also more recently supports column-level access controls for
tables, so this is another option for more granular control.</p>
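<p>As a sketch of the authorized-view pattern (project, dataset, table, and column names here are hypothetical): the view selects only the columns and rows that should be visible, and is then authorized in the source dataset so that readers of the view never need direct access to the underlying table.</p>
<pre><code class="language-sql">CREATE OR REPLACE VIEW
  `my-project.public_dataset.events_redacted` AS
SELECT
  * EXCEPT (client_id)
FROM
  `my-project.restricted_dataset.events`
WHERE
  normalized_channel = 'release'
</code></pre>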
<h2 id="pubsub"><a class="header" href="#pubsub">Pub/Sub</a></h2>
<p><a href="https://cloud.google.com/pubsub/docs/">Google Cloud Pub/Sub</a> is the standard
choice within Mozilla's environment for transferring data between systems in
real-time. It shares many of the same benefits as BigQuery in terms of being
fully hosted, scalable, and well-integrated with the rest of the GCP environment,
particularly when it comes to access controls.</p>
<p>We use Pub/Sub as the messaging backbone for the telemetry ingestion system and
we can easily provision new Pub/Sub topics containing republished subsets of the
telemetry data for other systems to hook into. We have support for either
producing messages into an external topic controlled by a different team or for
provisioning a new topic within the telemetry infrastructure and granting read
access to individual service accounts as needed.</p>
<p>Pub/Sub is the clear integration point with the telemetry system for any
application that is concerned with up-to-the-minute latency. For applications
that only need to see periodic recent views of telemetry data, be aware that
<em>live ping tables</em> (e.g. <code>telemetry_live.main_v4</code>) in BigQuery are also an option.
New data is loaded into those tables throughout the day either on a 10 minute
cadence or as they arrive via streaming inserts to BigQuery. Please contact us
if there's a subset of data you'd like us to consider opting in for streaming
inserts.</p>
<h2 id="dataflow"><a class="header" href="#dataflow">Dataflow</a></h2>
<p><a href="https://cloud.google.com/dataflow/docs/">Google Cloud Dataflow</a> is a service
for running data processing applications using the Apache Beam SDKs in both
batch and streaming modes. Understanding the Beam programming model requires
a certain amount of developer investment, but Beam provides powerful abstractions
for data transformations like windowed joins that are difficult to implement
reliably by hand.</p>
<p>The Dataflow jobs in use by the data platform actually don't require complex
joins or windowing features, but we have found Beam's I/O abstractions useful
for being able to adapt a single code base to handle reading from and writing to
a variety of data stores. Dataflow also provides good built-in support for
auto-scaling streaming jobs based on latency and observed throughput, especially
when interacting with Pub/Sub. That said, the I/O abstractions allow only
limited control over performance and we have found the need to replace some of
our Dataflow jobs with custom applications running on GKE — particularly
the jobs focused on batching together messages from Pub/Sub and sinking to GCS
or BigQuery.</p>
<p>Beam's <code>BigQueryIO</code> module requires shuffling data several times when writing,
checkpointing the intermediate state to local disk. This incurs expense for
provisioning local solid state drives to handle the checkpointing throughput and
introduces the possibility of data loss on unclean shutdown since messages have
to be acknowledged back to Pub/Sub at the time data is first checkpointed rather
than when it is finally written to BigQuery. We were able to achieve lower cost
and more straightforward delivery guarantees by writing a custom application
using the Google Cloud Java SDK. We still use a streaming Dataflow job for the
<em>decoder</em> step of the pipeline since no checkpointing is needed for a simple job
that both reads from and writes to Pub/Sub. We also rely on Dataflow batch jobs
for all backfill activities.</p>
<p>If your team has streaming needs where Dataflow makes sense, be aware that the
Data Operations team can provide operational support to help you launch and
manage pipelines.</p>
<h2 id="derived-data-and-airflow"><a class="header" href="#derived-data-and-airflow">Derived Data and Airflow</a></h2>
<p>The structure in which data is ingested to BigQuery is often not the most
convenient or efficient structure for analysis queries, so it is often necessary
to provide logical views of the data to support users. Our interface for
defining such views is the
<a href="https://github.com/mozilla/bigquery-etl"><code>bigquery-etl</code></a> repository which
provides instructions for how to propose new tables and views by sending pull
requests containing SQL queries.</p>
<p>We use BigQuery <em>views</em> heavily to improve the usability of raw data and we
recommend that you do too! As discussed in the
<a href="tools/guiding_principles.html#access-controls-in-bigquery">BigQuery Access Controls</a> section above,
views take up no storage resources and are essentially reusable snippets
that appear like tables, but the underlying logic is executed every time a user
queries a view. For simple cases like renaming fields or unnesting array
columns, a view is often the right choice as it can be defined once and requires
no ongoing scheduling or maintenance.</p>
<p>If, however, you want to provide users with a view that involves joins or
aggregations that hit a great deal of data, you may find that queries slow down
and become expensive. In those cases, it may be better to materialize the
results of the view into a derived table. See the <a href="https://docs.telemetry.mozilla.org/cookbooks/bigquery-airflow.html">Scheduling BigQuery Queries
in Airflow</a>
cookbook for a walk-through of how to define a query in <code>bigquery-etl</code> and
get it scheduled to run nightly on the data platform's Airflow instance.</p>
<h2 id="final-thoughts"><a class="header" href="#final-thoughts">Final Thoughts</a></h2>
<p>While no single data architecture can meet all needs, the core pipeline at
Mozilla has been built with flexibility in mind. We have a growing list of
success cases and some major projects in the works to migrate legacy pipelines
to the system — these are good indicators that we are meeting a broad set of
needs for the majority of data warehousing use cases and that we provide a stable
ingestion system for streaming applications as well.</p>
<p>GCP's roster of services is fairly well-focused compared to other cloud
providers, but it can still be overwhelming to sort through the available
options, particularly where multiple services seem to occupy the same space.
Consult the unofficial <a href="https://grumpygrace.dev/posts/gcp-flowcharts/">GCP
flowcharts</a> for a broad view of
how to sort through the features of services that apply to different problem
domains. We encourage the use of BigQuery, Pub/Sub, and Dataflow as core
building blocks for custom data applications across Mozilla for ease of access
control across projects and for leveraging shared knowledge about how to operate
and integrate with those services. Possibilities in the cloud can seem endless,
but the more we can standardize architectural approaches across the company, the
better prepared we will be to collaborate across product teams and ultimately
the better positioned we will be to realize our mission. Let's work together to
keep individuals empowered, safe, and independent on the Internet.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/tools/guiding_principles.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="glean-2"><a class="header" href="#glean-2">Glean</a></h1>
<p>For Mozilla, getting reliable data from our products is critical to inform our decision making. Glean is our new product analytics &amp; telemetry solution that provides a consistent experience and behavior across all of our products.</p>
<p>The list of supported platforms and implementations is <a href="https://mozilla.github.io/glean/dev/core/internal/implementations.html">available in the Glean SDK Book</a>.</p>
<blockquote>
<p>Note that this is different from <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/index.html">Telemetry for Firefox Desktop</a> although it provides similar capabilities.</p>
</blockquote>
<p>Contents:</p>
<ul>
<li><a href="concepts/glean/glean.html#overview">Overview</a></li>
<li><a href="concepts/glean/glean.html#the-glean-design-principles">The Glean design principles</a></li>
<li><a href="concepts/glean/glean.html#how-to-use-glean">How to use Glean</a></li>
<li><a href="concepts/glean/glean.html#contact">Contact</a></li>
<li><a href="concepts/glean/glean.html#references">References</a></li>
</ul>
<h1 id="overview-1"><a class="header" href="#overview-1">Overview</a></h1>
<p><img src="concepts/glean/../../assets/glean-overview.png" alt="An overview of the Glean project: On the left the product core records data using the Glean SDK. The Glean SDK then sends out this data to the Data Platform. The analysis tools (redash, Looker, GLAM, Debug View) to the right receive the data from the data platform" /></p>
<p>The <strong>Glean SDK</strong> performs measurements and sends data from our products.
It provides a set of <strong><a href="https://mozilla.github.io/glean/book/reference/metrics/index.html">metric types</a></strong> for individual measurements that are carefully designed to avoid common pitfalls with measurement.
Metrics are then rolled up into <strong><a href="https://mozilla.github.io/glean/book/user/pings">pings</a></strong> to send over the network.
There are a number of built-in pings that are sent on predefined schedules, but it is also possible to send custom pings at any desired cadence.</p>
<p>The <strong>Data Platform</strong> validates and stores these pings in database tables.
A fault-tolerant design allows data to be retained in the event of problems such as traffic spikes or invalid data.
See <a href="concepts/glean/../pipeline/gcp_data_pipeline.html">An overview of Mozilla's Data Pipeline</a> for details.
Derived and cleaned data can also be automatically created at this stage.</p>
<p>The <strong>Analysis Tools</strong> are used to query and visualize the data.
This includes <a href="https://sql.telemetry.mozilla.org/">Redash</a>, <a href="https://mozilla.cloud.looker.com/">Looker</a>, <a href="https://glam.telemetry.mozilla.org/">GLAM</a> and the <a href="https://debug-ping-preview.firebaseapp.com/">Debug Ping View</a>.
Because Glean knows more about the individual data, such as its type and the ranges of acceptable values, it can in many cases provide the most appropriate visualization automatically.</p>
<h1 id="the-glean-design-principles"><a class="header" href="#the-glean-design-principles">The Glean design principles</a></h1>
<p><strong>Provide a consistent base of telemetry</strong></p>
<p>A baseline of analysis is important for all our products, from counting active users to retention and session times. This is supported out-of-the-box by the SDK, and funnels directly into visualization tools like the <a href="https://gud.telemetry.mozilla.org/">Growth and Usage Dashboard (GUD)</a>.</p>
<p>Metrics that are common to all products, such as the operating system and architecture, are provided automatically in a consistent way.</p>
<p>Any issues found with these base metrics only need to be fixed in Glean to benefit all SDK-using products.</p>
<p><strong>Encourage specificity</strong></p>
<p>Rather than just treating metrics as generic data points, Glean wants to know as much as possible about the things being measured, and be opinionated about how data is measured and aggregated.</p>
<p>From this information, it can:</p>
<ul>
<li>Provide a well-designed API to perform specific types of measurements, which is consistent and avoids common pitfalls</li>
<li>Reject invalid data and report it as errors</li>
<li>Store the data in a consistent way, rather than custom, ad hoc data structures</li>
<li>Provide the most appropriate visualization and analysis automatically</li>
</ul>
<p>A side-effect of this design is that Glean telemetry is write-only: it would be impossible to enforce all of these constraints and achieve all of these benefits if client code could read, modify and update data.</p>
<p><strong>Follow <a href="https://leandatapractices.com/">lean data practices</a></strong></p>
<p>The Glean system enforces that all measurements receive <a href="https://wiki.mozilla.org/Firefox/Data_Collection">data review</a>, and it is impossible to collect measurements that haven't been declared.
It also makes it easy to limit data collection to only what's necessary:</p>
<ul>
<li>Enforced expiration dates for every metric</li>
<li>Some metric types can automatically limit resolution</li>
<li>It's easy to send data that isn't associated with the client id</li>
</ul>
<p>Glean also supports data transparency by automatically generating documentation for all of the metrics sent by an application.</p>
<p><strong>Provide a self-serve experience</strong></p>
<p>Adding new metrics is designed to be as easy as possible.
By adding a few lines of configuration, everything needed to make them work across the entire suite of tools happens automatically.
This includes previously manual and error-prone steps such as updating the ping payload and database schemas.</p>
<h1 id="how-to-use-glean"><a class="header" href="#how-to-use-glean">How to use Glean</a></h1>
<ul>
<li>
<p><a href="https://mozilla.github.io/glean/book/user/adding-glean-to-your-project/index.html">Integrate the Glean SDK</a> into your product.</p>
</li>
<li>
<p><a href="https://mozilla.cloud.looker.com/">Use Looker</a> to build Explores and Dashboards using your product's datasets.</p>
</li>
<li>
<p>If Looker does not provide the necessary Explores, you can resort to <a href="https://sql.telemetry.mozilla.org/">using Redash</a> to write SQL queries &amp; build dashboards using your product's datasets (see the example query after this list), e.g.:</p>
<ul>
<li><code>org_mozilla_fenix.baseline</code></li>
<li><code>org_mozilla_fenix.events</code></li>
<li><code>org_mozilla_fenix.metrics</code></li>
<li>There is <a href="concepts/glean/../../cookbooks/accessing_glean_data.html">more documentation about accessing Glean data</a>.</li>
</ul>
</li>
<li>
<p>For experimentation, you can use <a href="https://github.com/mozilla/application-services/blob/main/components/nimbus/README.md">Nimbus SDK</a>, which integrates with Glean.</p>
</li>
</ul>
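<p>As a starting point, the query below is a minimal sketch of working with one of the tables listed above. It counts the clients that sent a baseline ping yesterday; the project qualification and the exact table are assumptions, so adjust them for your product:</p>
<pre><code class="language-sql">-- Sketch only: see the Accessing Glean Data cookbook for authoritative guidance.
SELECT
  COUNT(DISTINCT client_info.client_id) AS client_count
FROM
  `moz-fx-data-shared-prod`.org_mozilla_fenix.baseline
WHERE
  DATE(submission_timestamp) = DATE_SUB(CURRENT_DATE, INTERVAL 1 DAY)
</code></pre>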
<h1 id="contact"><a class="header" href="#contact">Contact</a></h1>
<ul>
<li><code>#glean</code> on slack</li>
<li><a href="https://chat.mozilla.org/#/room/#glean:mozilla.org">#glean:mozilla.org</a> on matrix</li>
<li><a href="mailto:glean-team@mozilla.com"><code>glean-team@mozilla.com</code></a> to reach out</li>
</ul>
<h1 id="references"><a class="header" href="#references">References</a></h1>
<ul>
<li>The <a href="https://github.com/mozilla/glean/">Glean SDK</a> implementation.</li>
<li><a href="https://bugzilla.mozilla.org/enter_bug.cgi?product=Data%20Platform%20and%20Tools&amp;component=Glean%3A%20SDK">Reporting issues &amp; bugs for the Glean SDK</a>.</li>
<li>Datasets documentation (TBD)</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/glean/glean.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="an-overview-of-mozillas-data-pipeline"><a class="header" href="#an-overview-of-mozillas-data-pipeline">An overview of Mozillas Data Pipeline</a></h1>
<p>This post describes the architecture of Mozillas data pipeline,
which is used to collect Telemetry data from our products and logs from various services.</p>
<p>The bulk of the data handled by this pipeline is Firefox Telemetry data, but the
same tool-chain is used to collect, store, and analyze data coming from many
sources, including <a href="concepts/pipeline/../glean/glean.html">Glean</a> applications.</p>
<p>Here is a simplified diagram of how data is ingested into the data warehouse.</p>
<p>The code for the ingestion pipeline lives in the <a href="https://github.com/mozilla/gcp-ingestion"><code>gcp-ingestion</code></a> repository.</p>
<pre class="mermaid">graph TD
f1(fa:fa-firefox Firefox) --&gt;|HTTP Post| d0(fa:fa-filter Ingestion Edge)
d0 --&gt; p1(fa:fa-stream Raw Topic)
p1 --&gt; d1(fa:fa-exchange-alt Landfill Sink)
d1 --&gt; b1(fa:fa-database Landfill BQ)
p1 --&gt; d2(fa:fa-exchange-alt Decoder)
d2 --&gt;|success| p2(fa:fa-stream Decoded Topic)
d2 -.-&gt;|fail| p3(fa:fa-stream Errors Topic)
p3 --&gt; d4(fa:fa-exchange-alt Errors Sink)
p2 --&gt; d3(fa:fa-exchange-alt BigQuery Sink)
d3 --&gt; b2(fa:fa-database Live Tables BQ)
d4 --&gt; b3(fa:fa-database Error Tables BQ)
classDef pubsub fill:#eff,stroke:#099;
classDef exec fill:#efe,stroke:#090;
classDef producers fill:#fee,stroke:#f90;
classDef bq fill:#ececff,stroke:#9370db;
class p1,p2,p3 pubsub
class d0,d1,d2,d3,d4 exec
class f1 producers
class b1,b2,b3 bq
</pre>
<h2 id="firefox"><a class="header" href="#firefox">Firefox</a></h2>
<p>There are different APIs and formats to <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/index.html">collect data</a> in Firefox, all suiting different use cases:</p>
<ul>
<li><a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/histograms.html">histograms</a> – for recording multiple data points;</li>
<li><a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/scalars.html">scalars</a> – for recording single values;</li>
<li><a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/measuring-time.html">timings</a> – for measuring how long operations take;</li>
<li><a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/events.html">events</a> – for recording time-stamped events.</li>
</ul>
<p>These are commonly referred to as <em><a href="concepts/pipeline/../../datasets/new_data.html">probes</a></em>.
Each probe must declare the <a href="https://wiki.mozilla.org/Firefox/Data_Collection">collection policy</a> it conforms to: either <em>release</em> or <em>prerelease</em>.
When adding a new measurement, data reviewers carefully inspect the probe and ultimately approve the requested collection policy:</p>
<ul>
<li>Release data is collected from all Firefox users.</li>
<li>Prerelease data is collected from users on Firefox Nightly and Beta channels.</li>
</ul>
<p>Users may choose to turn the data collection off in preferences.</p>
<p>A <em>session</em> begins when Firefox starts up and ends when it shuts down.
As a session could be long-running and last weeks, it gets sliced into
smaller logical units called <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/concepts/sessions.html#subsessions">subsessions</a>.
Each subsession generates a batch of data containing the current state
of all probes collected so far, in the form of a <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/main-ping.html"><code>main</code> ping</a>, which is
sent to our servers.
The <code>main</code> ping is just one of the many <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/concepts/pings.html#ping-types">ping types</a> we support.
Developers can <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/custom-pings.html">create their own ping types</a> if needed.</p>
<p><em>Pings</em> are submitted via an <a href="https://searchfox.org/mozilla-central/rev/501eb4718d73870892d28f31a99b46f4783efaa0/toolkit/components/telemetry/app/TelemetryController.jsm#231">API</a> that performs an HTTP POST request to our edge servers.
If a ping fails to <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/concepts/submission.html#submission">submit</a> successfully (e.g. because of a missing internet connection),
Firefox will store the ping on disk and retry sending it until the maximum ping age is exceeded.</p>
<h2 id="ingestion-1"><a class="header" href="#ingestion-1">Ingestion</a></h2>
<p>Submissions coming in from the wild hit a load balancer and then an
HTTP Server that <a href="concepts/pipeline/http_edge_spec.html">accepts POST requests</a> containing a
message body of optionally-gzipped JSON.</p>
<p>These messages are forwarded to a PubSub message queue with minimal processing,
and made available in a <strong>Raw</strong> topic.</p>
<p>A <a href="https://cloud.google.com/dataflow/docs/">Dataflow</a> job reads this topic and writes the raw messages to a BigQuery <strong>Landfill</strong> sink.
This Landfill data is not used for analysis, but is stored in its raw form for
recovery and backfill purposes.</p>
<p>This is an important fail-safe in case of a processing error or data loss downstream in the pipeline.</p>
<h2 id="decoding"><a class="header" href="#decoding">Decoding</a></h2>
<p>Once the raw data has been added to the PubSub queue, it's time to process it.</p>
<p>The decoder is implemented as a <a href="https://cloud.google.com/dataflow/docs/">Dataflow</a> job, and is written in Java.</p>
<p>The decoding process tackles decompression, parsing, validation, deduplication,
and enrichment of incoming messages.</p>
<p>After a message is decompressed and parsed as JSON, we apply <a href="https://json-schema.org/understanding-json-schema/">JSONSchema validation</a>
to ensure that submissions are well-formed.</p>
<p>Sometimes duplicate submissions are sent to the pipeline, either due to normal
networking failures or <a href="https://chuttenblog.wordpress.com/2017/05/02/data-science-is-hard-anomalies-part-2/">weird behaviour</a> out there in the world.
We watch for duplicate submissions, and discard any subsequent occurrences of
already-seen records.</p>
<p>Submissions are also enriched with some metadata about the request itself,
including things like HTTP headers, GeoIP information, and submission timestamp.</p>
<p>Messages that pass <em>successfully</em> through all these steps are written to another
PubSub <strong>Decoded</strong> topic.</p>
<p>A failure in any of these steps results in messages being sent to the <strong>Errors</strong> sink.
This separates invalid data from valid data, while still making it available for
monitoring and debugging.
This is a good way to keep an eye on the health of the pipeline and the data
flowing through.</p>
<h2 id="data-warehouse"><a class="header" href="#data-warehouse">Data Warehouse</a></h2>
<p>Decoded data is ultimately written out to BigQuery, which acts as the data warehouse.</p>
<p>By this time, incoming data has already been validated against the corresponding
JSONSchema specification for each document type.
Part of the decoding process above transforms this JSON structure into something
more easily represented in BigQuery.
One important transformation here is to convert all incoming fields from
<code>UPPER CASE</code> or <code>camelCase</code> to <code>snake_case</code>.
Another important transformation is to incorporate metadata about known probes
and metrics to generate more complete schemas.</p>
<p>This is handled by a combination of the decoder above, the <a href="https://github.com/mozilla/jsonschema-transpiler">schema transpiler</a>
and the <a href="https://github.com/mozilla/mozilla-schema-generator">schema generator</a>.
The result is a set of tables that contain SQL-friendly field names for all known
measures, as implemented in the <a href="https://github.com/mozilla/probe-scraper">probe scraper</a>.</p>
<p>A <a href="https://cloud.google.com/dataflow/docs/">Dataflow</a> job reads from the Decoded topic and writes out to
<strong><a href="concepts/pipeline/../../cookbooks/bigquery/querying.html#table-layout-and-naming">live ping tables</a></strong>.
These tables are updated frequently, and typically reflect data within a few
minutes of it being ingested. They are optimized for accessing recent data,
but are only guaranteed to contain a few days of history.</p>
<p>Historical raw ping data is stored in <strong><a href="concepts/pipeline/../../cookbooks/bigquery/querying.html#table-layout-and-naming">historical ping tables</a></strong>,
also known as <strong>stable tables</strong>.
These tables include only completed days of data and are populated once a day
shortly after midnight UTC.
Data in the Stable tables is partitioned by day, and optimized for accessing
larger time periods. It is also optimized for limiting analysis to a fraction
of the data using the <a href="concepts/pipeline/../channels/channel_normalization.html"><code>normalized_channel</code></a> and <a href="concepts/pipeline/../sample_id.html"><code>sample_id</code></a> fields.</p>
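<p>For example, a typical query against a stable table filters on the daily partition and can use <code>sample_id</code> to read only a fraction of clients. A minimal sketch, assuming the <code>telemetry.main</code> stable table in the shared project:</p>
<pre><code class="language-sql">-- Read one day of main pings for a 1% sample of clients (sample_id is 0-99).
SELECT
  submission_timestamp,
  normalized_channel
FROM
  `moz-fx-data-shared-prod`.telemetry.main
WHERE
  DATE(submission_timestamp) = DATE_SUB(CURRENT_DATE, INTERVAL 1 DAY)
  AND sample_id = 42
LIMIT 10
</code></pre>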
<h1 id="beyond-the-data-warehouse"><a class="header" href="#beyond-the-data-warehouse">Beyond the Data Warehouse</a></h1>
<p>The diagram above shows the path data takes to get into the data warehouse.
After that, we have to start using it!</p>
<h2 id="workflow-management-and-etl"><a class="header" href="#workflow-management-and-etl">Workflow Management and ETL</a></h2>
<p>We use <a href="https://github.com/mozilla/telemetry-airflow/">Airflow</a> for workflow management.</p>
<p>It orchestrates the daily creation of the Stable tables described above,
as well as many other derived datasets.</p>
<p>The ETL code to create derived datasets is commonly implemented using queries in BigQuery.</p>
<p>Many examples can be found in the <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a> repository.</p>
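<p>As an illustration, a derived dataset is often just the materialized result of a scheduled query along these lines (a hypothetical aggregation, not an actual dataset definition; it assumes the <code>client_id</code> and <code>normalized_channel</code> columns on the main ping table):</p>
<pre><code class="language-sql">-- Hypothetical daily rollup: distinct clients per channel for yesterday.
SELECT
  DATE(submission_timestamp) AS submission_date,
  normalized_channel,
  COUNT(DISTINCT client_id) AS client_count
FROM
  `moz-fx-data-shared-prod`.telemetry.main
WHERE
  DATE(submission_timestamp) = DATE_SUB(CURRENT_DATE, INTERVAL 1 DAY)
GROUP BY
  submission_date,
  normalized_channel
</code></pre>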
<p>Data in BigQuery is also accessible via Spark, and several ETL jobs also run via Dataproc.</p>
<p>These jobs produce data sets that are used for downstream analysis and data
applications (such as <a href="https://telemetry.mozilla.org/">measurement</a>,
<a href="https://github.com/mozilla/taar">addon recommendation</a>, and other <a href="concepts/pipeline/../../tools/projects.html#data-applications">data products</a>).</p>
<h2 id="data-analysis"><a class="header" href="#data-analysis">Data Analysis</a></h2>
<p>Once the data reaches our data warehouse in BigQuery it can be processed
in a number of ways as described in the <a href="concepts/pipeline/../../cookbooks/bigquery.html">Accessing BigQuery</a> article.</p>
<p>Data analysis is most commonly done using <a href="https://mozilla.cloud.looker.com">Looker</a> or using <a href="https://sql.telemetry.mozilla.org/">SQL queries</a>.</p>
<p>In summary, the following diagram illustrates how Recording, Ingestion, Transformation, Scheduling and Analytics
fit together:</p>
<p><img src="concepts/pipeline/../../assets/analytics_and_ingestion.jpg" alt="" /></p>
<h2 id="handling-sensitive-data"><a class="header" href="#handling-sensitive-data">Handling Sensitive Data</a></h2>
<p>Some data is more sensitive than others and introduces more risk around how it
is handled. We characterize sensitivity of data broadly into four categories
as described on the <a href="https://wiki.mozilla.org/Data_Collection#Data_Collection_Categories">Data Collection wiki page</a>.</p>
<p>Sensitive data — by which we mean category 3 and 4 data — increases the
level of risk and so warrants extra care. For such data, we apply three main
mitigation measures:</p>
<p><strong>Limited access</strong>
Access to sensitive data is limited to individuals or groups with clearly
articulated need. Access is also limited in time, so access expires by default.</p>
<p><strong>Limited retention</strong>
Sensitive data is kept for a finite period of time after which it is
automatically deleted. For example, data containing search terms is only
kept for a short period of time.</p>
<p><strong>Sanitization and aggregation of data</strong>
Where data has the potential to be personally identifiable, we have best
effort (and improving over time) ETL jobs that scrub incoming data of
potentially identifying information, which reduces risk. When possible, we
build and work with aggregate datasets rather than individual-level data.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/pipeline/gcp_data_pipeline.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="http-edge-server-specification"><a class="header" href="#http-edge-server-specification">HTTP Edge Server Specification</a></h1>
<p>This document specifies the behavior of the server that accepts submissions from
any HTTP client, e.g. Firefox telemetry.</p>
<p>The original implementation of the HTTP Edge Server was tracked in
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1129222">Bug 1129222</a>.</p>
<h2 id="general-data-flow"><a class="header" href="#general-data-flow">General Data Flow</a></h2>
<p>HTTP submissions come in from the wild, hit a load balancer,
then the HTTP Edge Server described in this document.
Data is accepted via a POST/PUT request from clients,
and forwarded to the <a href="concepts/pipeline/gcp_data_pipeline.html">Data Pipeline</a>, where
any further processing, analysis, and storage will be handled.</p>
<p>Submission payloads are expected to be optionally-gzipped JSON
documents described by a <a href="https://json-schema.org/">JSONSchema</a>.</p>
<h2 id="server-requestresponse"><a class="header" href="#server-requestresponse">Server Request/Response</a></h2>
<h3 id="get-request"><a class="header" href="#get-request">GET Request</a></h3>
<p>Accept GET on <code>/status</code>, returning <code>OK</code> if all is well. This can be used to
check the health of web servers.</p>
<h3 id="get-response-codes"><a class="header" href="#get-response-codes">GET Response codes</a></h3>
<ul>
<li><em>200</em> - OK. <code>/status</code> and all's well</li>
<li><em>404</em> - Any GET other than <code>/status</code></li>
<li><em>500</em> - All is not well</li>
</ul>
<h3 id="postput-request"><a class="header" href="#postput-request">POST/PUT Request</a></h3>
<p>Treat POST and PUT the same. Accept POST or PUT to URLs of the form:</p>
<p><code>/submit/&lt;namespace&gt;/&lt;docType&gt;/&lt;docVersion&gt;/&lt;docId&gt;</code></p>
<p>A specific example submission URL looks like:</p>
<p><code>/submit/eng-workflow/hgpush/1/2c3a0767-d84a-4d02-8a92-fa54a3376049</code></p>
<p>With the following components:</p>
<ul>
<li><code>namespace</code> - an identifier used for grouping a set of related document types. Typically this represents an application that produces data.</li>
<li><code>docType</code> - a short descriptive name of the document type. Examples include <code>event</code>, <code>crash</code>, or <code>baseline</code></li>
<li><code>docVersion</code> - a numeric value indicating the version of the schema for this <code>docType</code></li>
<li><code>docId</code> - a UUID identifying the exact submission. If the same <code>docId</code> is seen more than once, it will be discarded as a duplicate.</li>
</ul>
<p>The combination of <code>namespace</code>, <code>docType</code> and <code>docVersion</code> together identify a specific schema to be used for validating submissions to the above endpoint.</p>
<p>If a schema is not present in the <a href="https://github.com/mozilla-services/mozilla-pipeline-schemas">schemas repository</a> corresponding to this combination, the submission
will be considered an error and will not proceed to the data lake.</p>
<h4 id="special-handling-for-firefox-desktop-telemetry"><a class="header" href="#special-handling-for-firefox-desktop-telemetry">Special handling for Firefox Desktop Telemetry</a></h4>
<p>Firefox Desktop Telemetry uses a slightly different URL scheme:</p>
<p><code>/submit/telemetry/docId/docType/appName/appVersion/appUpdateChannel/appBuildID?v=4</code></p>
<p>A specific example:</p>
<p><code>/submit/telemetry/ce39b608-f595-4c69-b6a6-f7a436604648/main/Firefox/61.0a1/nightly/20180328030202?v=4</code></p>
<p>Here the <code>namespace</code> is fixed as &quot;telemetry&quot;, and there is no <code>docVersion</code> in the URL.
This means that incoming JSON documents must be parsed to determine the schema version
to apply for validation. This logic is part of the downstream decoder job.
Also note the required query parameter suffix <code>?v=4</code>.
Documents sent under <code>/submit/telemetry</code> without <code>v=4</code> will be rejected at the edge.</p>
<h3 id="postput-response-codes"><a class="header" href="#postput-response-codes">POST/PUT Response codes</a></h3>
<ul>
<li><em>200</em> - OK. Request accepted into the pipeline.</li>
<li><em>400</em> - Bad request, for example an un-encoded space in the URL.</li>
<li><em>404</em> - not found - POST/PUT to an unknown namespace</li>
<li><em>405</em> - wrong request type (anything other than POST/PUT)</li>
<li><em>411</em> - missing content-length header</li>
<li><em>413</em> - request body too large (Note that if we have badly-behaved clients that retry on <code>4XX</code>, we may opt to send back 202 on body/path too long).</li>
<li><em>414</em> - request path too long (See above)</li>
<li><em>500</em> - internal error</li>
</ul>
<h3 id="supported-http-headers"><a class="header" href="#supported-http-headers">Supported HTTP Headers</a></h3>
<p>The following headers will be passed through the pipeline and made available as metadata.</p>
<ul>
<li><code>Date</code> - The client-supplied timestamp of the incoming request.
Used for computing client clock skew.</li>
<li><code>DNT</code> - The &quot;Do Not Track&quot; header.</li>
<li><code>X-PingSender-Version</code> - The version of <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/internals/pingsender.html">Pingsender</a> used to send this ping (if applicable).</li>
<li><code>X-Debug-ID</code> - An optional tag used to make data available to the <a href="https://debug-ping-preview.firebaseapp.com/">Glean Debug View</a>.</li>
<li><code>X-Source-Tags</code> - An optional comma-separated list of tags related to the client source; pings sent from automated testing should include the &quot;automation&quot; tag so that they are not included in analyses</li>
</ul>
<h2 id="other-considerations"><a class="header" href="#other-considerations">Other Considerations</a></h2>
<h3 id="compression"><a class="header" href="#compression">Compression</a></h3>
<p>Compression of submission payloads is optional but recommended.</p>
<p>The supported compression scheme is <code>gzip</code>.</p>
<p>We do not decompress or validate the content of submissions at the edge;
the server will reply with a success code even if a message is badly formed.</p>
<p>Badly formed data is still accepted and made available for monitoring, recovery,
and analysis purposes.</p>
<h3 id="bad-messages"><a class="header" href="#bad-messages">Bad Messages</a></h3>
<p>Since the actual message is not examined by the edge server the only failures
that occur are defined by the response status codes above. Messages are only
forwarded to the pipeline when a response code of <code>200</code> is returned to the client.</p>
<h3 id="geoip-lookups"><a class="header" href="#geoip-lookups">GeoIP Lookups</a></h3>
<p>No GeoIP lookup is performed by the edge server. If a client IP is available,
the decoder performs the lookup and then discards the IP before the message reaches
long-term storage.</p>
<h3 id="data-retention"><a class="header" href="#data-retention">Data Retention</a></h3>
<p>The edge server only stores data while waiting for it to be accepted to
PubSub, spilling to local disk in the case of a PubSub outage.</p>
<p>This means that in the normal case, data is not retained on the edge at all.
In the case of errors writing to PubSub, data is retained until the service
is restored and messages can be flushed to the queue.
Based on <a href="https://status.cloud.google.com/incident/cloud-pubsub">past outages</a>, this is typically a few hours or less.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/pipeline/http_edge_spec.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="event-data-pipeline"><a class="header" href="#event-data-pipeline">Event Data Pipeline</a></h1>
<p>We collect event-oriented data from different sources. This data is collected and processed in a
specific path through our data pipeline, which we will detail here.</p>
<pre class="mermaid">graph TD
subgraph Products
fx_code(fa:fa-cog Firefox code) --&gt; firefox(fa:fa-firefox Firefox Telemetry)
fx_extensions(fa:fa-cog Mozilla extensions) --&gt; firefox
mobile(fa:fa-cog Mobile products) --&gt; mobile_telemetry(fa:fa-firefox Glean)
end
subgraph Data Platform
firefox -.-&gt;|main ping, Firefox &lt;62| pipeline((fa:fa-database Firefox Data Pipeline))
firefox --&gt;|event ping, Firefox 62+| pipeline
mobile_telemetry --&gt; |events ping| pipeline
pipeline --&gt;|Firefox &lt;62 events| main_summary[fa:fa-bars main summary table]
pipeline --&gt;|Firefox 62+ events| mobile_events_table[fa:fa-bars events table]
main_summary --&gt; events_table
pipeline --&gt;|Glean events| events_table[fa:fa-bars events table]
end
subgraph Data Tools
events_table --&gt; looker
main_summary --&gt; looker(fa:fa-bar-chart Looker)
end
style fx_code fill:#f94,stroke-width:0px
style fx_extensions fill:#f94,stroke-width:0px
style mobile fill:#f94,stroke-width:0px
style firefox fill:#f61,stroke-width:0px
style mobile_telemetry fill:#f61,stroke-width:0px
style pipeline fill:#79d,stroke-width:0px
style main_summary fill:lightblue,stroke-width:0px
style events_table fill:lightblue,stroke-width:0px
style mobile_events_table fill:lightblue,stroke-width:0px
style looker fill:salmon,stroke-width:0px
</pre>
<h1 id="overview-2"><a class="header" href="#overview-2">Overview</a></h1>
<p>Across the different Firefox teams there is a common need for a more fine-grained understanding of
product usage, such as the order of interactions or how they occur over time.
To address that, our data pipeline needs to support working with event-oriented data.</p>
<p>We specify a common event data format, which allows for broader, shared usage of data processing tools.
To make working with event data feasible, we provide different mechanisms to get the event data
from products to our data pipeline and make the data available in tools for analysis.</p>
<h1 id="the-event-format"><a class="header" href="#the-event-format">The event format</a></h1>
<p>Events are submitted as an array, e.g.:</p>
<pre><code class="language-javascript">[
[2147, &quot;ui&quot;, &quot;click&quot;, &quot;back_button&quot;],
[2213, &quot;ui&quot;, &quot;search&quot;, &quot;search_bar&quot;, &quot;google&quot;],
[
2892,
&quot;ui&quot;,
&quot;completion&quot;,
&quot;search_bar&quot;,
&quot;yahoo&quot;,
{ querylen: &quot;7&quot;, results: &quot;23&quot; },
],
[5434, &quot;dom&quot;, &quot;load&quot;, &quot;frame&quot;, null, { prot: &quot;https&quot;, src: &quot;script&quot; }],
// ...
];
</code></pre>
<p>Each event is of the form:</p>
<pre><code class="language-javascript">[timestamp, category, method, object, value, extra];
</code></pre>
<p>Where the individual fields are:</p>
<ul>
<li><code>timestamp</code>: <code>Number</code>, positive integer. This is the time in ms when the event was recorded, relative to the main process start time.</li>
<li><code>category</code>: <code>String</code>, identifier. The category is a group name for events and helps to avoid name conflicts.</li>
<li><code>method</code>: <code>String</code>, identifier. This describes the type of event that occurred, e.g. <code>click</code>, <code>keydown</code> or <code>focus</code>.</li>
<li><code>object</code>: <code>String</code>, identifier. This is the object the event occurred on, e.g. <code>reload_button</code> or <code>urlbar</code>.</li>
<li><code>value</code>: <code>String</code>, optional, may be null. This is a user defined value, providing context for the event.</li>
<li><code>extra</code>: <code>Object</code>, optional, may be null. This is an object of the form <code>{&quot;key&quot;: &quot;value&quot;, ...}</code>, both keys and values need to be strings. This is used for events when additional richer context is needed.</li>
</ul>
<p>See also the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/events.html#serialization-format">Firefox Telemetry documentation</a>.</p>
<h1 id="event-data-collection"><a class="header" href="#event-data-collection">Event data collection</a></h1>
<h2 id="firefox-event-collection"><a class="header" href="#firefox-event-collection">Firefox event collection</a></h2>
<p>To collect this event data, there are different APIs in Firefox, all addressing different
use cases:</p>
<ul>
<li>The <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/events.html"><em>Telemetry event API</em></a>
allows easy recording of events from Firefox code.</li>
<li>The <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/events.html#registerevents"><em>dynamic event API</em></a>
allows code from Mozilla addons to record new events into Telemetry without shipping Firefox
code.</li>
<li>The <em><a href="https://searchfox.org/mozilla-central/rev/55da592d85c2baf8d8818010c41d9738c97013d2/toolkit/components/extensions/schemas/telemetry.json#87">Telemetry WebExtension API</a></em> (<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1280234">introduced here</a>)
which allows Mozilla extensions to record new events into Telemetry.</li>
</ul>
<p>For all these APIs, events will get sent to the pipeline through the
<a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/event-ping.html">event ping</a>, which gets sent hourly, if any pings were recorded, or up to every 10 minutes whenever 1000 events were recorded.
Before Firefox 62, events were sent through the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/main-ping.html">main ping</a> instead, with a hard limit of 500 events per ping.
From Firefox 61, all events recorded through these APIs are <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1440673">automatically counted in scalars</a>.</p>
<p>Finally, <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/custom-pings.html"><em>custom pings</em></a>
can follow the event data format and potentially connect to the existing tooling with some integration work.</p>
<h2 id="mobile-event-collection"><a class="header" href="#mobile-event-collection">Mobile event collection</a></h2>
<p>Mobile data collection is done through <a href="concepts/pipeline/glean_data.html">Glean</a>. <a href="https://mozilla.github.io/glean/book/reference/metrics/event.html">Glean events</a> are recorded for our mobile applications.</p>
<h1 id="datasets"><a class="header" href="#datasets">Datasets</a></h1>
<p>On the pipeline side, the event data is made available in different datasets:</p>
<ul>
<li><a href="concepts/pipeline/../../datasets/batch_view/main_summary/reference.html"><code>main_summary</code></a> has a row for each main ping and includes
its event payload for Firefox versions before 62.</li>
<li><a href="concepts/pipeline/../../datasets/batch_view/events/reference.html"><code>events</code></a> contains a row for each event received from main pings and event pings. See <a href="https://sql.telemetry.mozilla.org/queries/52582/source"><code>STMO#52582</code></a>.</li>
<li>For applications that collect events through Glean, each application has a separate <code>events</code> dataset.</li>
</ul>
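<p>For example, Glean events can be pulled out of an application's <code>events</code> dataset by unnesting the repeated <code>events</code> column. A minimal sketch for Fenix (the project qualification is an assumption; the <code>category</code> and <code>name</code> fields are described in the <a href="concepts/pipeline/glean_data.html">Glean Data</a> reference):</p>
<pre><code class="language-sql">-- Count Glean events by category and name for one day of Fenix data.
SELECT
  e.category AS event_category,
  e.name AS event_name,
  COUNT(*) AS event_count
FROM
  `moz-fx-data-shared-prod`.org_mozilla_fenix.events,
  UNNEST(events) AS e
WHERE
  DATE(submission_timestamp) = DATE_SUB(CURRENT_DATE, INTERVAL 1 DAY)
GROUP BY
  event_category,
  event_name
ORDER BY
  event_count DESC
LIMIT 20
</code></pre>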
<h1 id="data-tooling"><a class="header" href="#data-tooling">Data tooling</a></h1>
<p>The above datasets are all accessible through <a href="concepts/pipeline/../../tools/stmo.html">STMO</a> and <a href="concepts/pipeline/../../cookbooks/looker/intro.html">Looker</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/pipeline/event_pipeline.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="generated-schemas"><a class="header" href="#generated-schemas">Generated Schemas</a></h1>
<ul>
<li><a href="concepts/pipeline/schemas.html#overview">Overview</a></li>
<li><a href="concepts/pipeline/schemas.html#schema-deploys-faq">Schema deploys FAQ</a>
<ul>
<li><a href="concepts/pipeline/schemas.html#how-do-i-make-changes-to-a-schema">How do I make changes to a schema?</a></li>
<li><a href="concepts/pipeline/schemas.html#when-will-i-see-new-changes-to-the-schema">When will I see new changes to the schema?</a></li>
<li><a href="concepts/pipeline/schemas.html#what-does-it-mean-when-a-schema-deploy-is-blocked">What does it mean when a schema deploy is blocked?</a></li>
</ul>
</li>
<li><a href="concepts/pipeline/schemas.html#schema-repository">Schema Repository</a>
<ul>
<li><a href="concepts/pipeline/schemas.html#schema-transpiler">Schema Transpiler</a></li>
<li><a href="concepts/pipeline/schemas.html#mozilla-schema-generator">Mozilla Schema Generator</a></li>
</ul>
</li>
<li><a href="concepts/pipeline/schemas.html#data-ingestion">Data Ingestion</a>
<ul>
<li><a href="concepts/pipeline/schemas.html#validation">Validation</a></li>
<li><a href="concepts/pipeline/schemas.html#decoding">Decoding</a>
<ul>
<li><a href="concepts/pipeline/schemas.html#name-normalization">Name Normalization</a></li>
<li><a href="concepts/pipeline/schemas.html#data-structure-normalization">Data Structure Normalization</a></li>
</ul>
</li>
</ul>
</li>
<li><a href="concepts/pipeline/schemas.html#deploying-to-bigquery">Deploying to BigQuery</a>
<ul>
<li><a href="concepts/pipeline/schemas.html#updating-generated-schemas">Updating generated-schemas</a></li>
<li><a href="concepts/pipeline/schemas.html#deploying-schemas-to-production">Deploying schemas to production</a></li>
</ul>
</li>
</ul>
<h2 id="overview-3"><a class="header" href="#overview-3">Overview</a></h2>
<p>Schemas describe the structure of ingested data. They are used in the pipeline to validate the types
and values of data, and to define a table schema in a data store. We use a repository of JSON
Schemas to sort incoming data into <a href="concepts/pipeline/../../cookbooks/bigquery/querying.html#projects-with-bigquery-datasets"><code>decoded</code> and <code>error</code> datasets</a>. We also generate
BigQuery table schemas on business days from the JSON Schemas: you can see the current status of
this job on the <a href="https://protosaur.dev/mps-deploys/"><code>mozilla-pipeline-schemas</code> deploy dashboard</a>.</p>
<pre class="mermaid">graph TD
%% Nodes
subgraph mozilla-pipeline-schemas
main
schemas(generated-schemas)
end
generator(mozilla-schema-generator)
transpiler(jsonschema-transpiler)
probe-info(probe-scraper)
airflow(telemetry-airflow)
ingestion(gcp-ingestion)
bigquery(BigQuery)
%% Node hyperlinks
click bigquery &quot;../../cookbooks/bigquery.html&quot;
click main &quot;https://github.com/mozilla-services/mozilla-pipeline-schemas&quot;
click schemas &quot;https://github.com/mozilla-services/mozilla-pipeline-schemas/tree/generated-schemas&quot;
click generator &quot;https://github.com/mozilla/mozilla-schema-generator&quot;
click transpiler &quot;https://github.com/mozilla/jsonschema-transpiler&quot;
click probe-info &quot;https://github.com/mozilla/probe-scraper&quot;
click ingestion &quot;https://mozilla.github.io/gcp-ingestion/ingestion-beam/&quot;
click airflow &quot;https://github.com/mozilla/telemetry-airflow&quot;
%% Edges
main --&gt; |git clone| generator
transpiler --&gt; |used by| generator
probe-info --&gt; |used by| generator
generator --&gt; |scheduled by| airflow
airflow --&gt; |run nightly| schemas
schemas --&gt; |defines table| bigquery
schemas --&gt; |defines is valid| ingestion
schemas --&gt; |defines normalization| ingestion
ingestion --&gt; |inserts into| bigquery
</pre>
<p><strong>Figure</strong>: <em>An overview of generated schemas. Click on a node to navigate to the relevant
repository or documentation.</em></p>
<h2 id="schema-deploys-faq"><a class="header" href="#schema-deploys-faq">Schema deploys FAQ</a></h2>
<p>This section answers some basic questions about the schema deployment pipeline.</p>
<h3 id="how-do-i-make-changes-to-a-schema"><a class="header" href="#how-do-i-make-changes-to-a-schema">How do I make changes to a schema?</a></h3>
<p>This depends on which application you are working on.</p>
<p>If you are working on Firefox Telemetry and are adding a new probe, then you don't have to do
anything. Changes are automatically picked up by the <a href="https://github.com/mozilla/probe-scraper"><code>probe-scraper</code></a> from the
<code>histograms.json</code> and <code>scalars.yaml</code> files in <code>mozilla-central</code>. Non-probe changes (for example,
modifications to the telemetry environment) will require you to make changes to
<a href="https://github.com/mozilla-services/mozilla-pipeline-schemas"><code>mozilla-pipeline-schemas</code></a>.</p>
<p>If you are working on an application using the <a href="concepts/pipeline/../glean/glean.html">Glean SDK</a>, then the
probe-scraper will automatically pick up changes from <code>metrics.yaml</code>.</p>
<h3 id="when-will-i-see-new-changes-to-the-schema"><a class="header" href="#when-will-i-see-new-changes-to-the-schema">When will I see new changes to the schema?</a></h3>
<p>Schema deploys happen on business days around UTC+04 when new changes are found in the
<a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/tree/generated-schema"><code>generated-schemas</code> branch of <code>mozilla-pipeline-schemas</code></a>. This means that any
changes merged after UTC+04 on Friday will not propagate until Monday UTC+04. See the
<a href="https://protosaur.dev/mps-deploys/"><code>mozilla-pipeline-schemas</code> deploy</a> dashboard for up-to-date information on the most
recent deploys.</p>
<h3 id="what-does-it-mean-when-a-schema-deploy-is-blocked"><a class="header" href="#what-does-it-mean-when-a-schema-deploy-is-blocked">What does it mean when a schema deploy is blocked?</a></h3>
<p>The schema deployment pipeline has a hard dependency on the <a href="https://github.com/mozilla/probe-scraper"><code>probe-scraper</code></a>, a service that scours
repositories for new metrics to include in generated schemas. When the probe-scraper fails, it will
prevent the <a href="https://github.com/mozilla/mozilla-schema-generator"><code>mozilla-schema-generator</code></a> from running. If there are new changes to the main
branch of <code>mozilla-pipeline-schemas</code>, then they will not be added to the <code>generated-schemas</code> branch
until the failure has been resolved. Similarly, new probes and pings in either Telemetry or Glean
will not be picked up until the <code>probe-scraper</code> failures are resolved.</p>
<p>If a new schema field is not registered in the schema repository before collection begins, it will
be available in the <code>additional_properties</code> field of the generated table. If a new schema for a ping
is not registered before collection begins, then it will be sorted into the error stream. Please
<a href="concepts/pipeline/../reporting_a_problem.html">file a bug</a> or <a href="concepts/pipeline/../getting_help.html">reach out</a> if you believe your data
may be affected by blocked schema deploys.</p>
<h2 id="schema-repository"><a class="header" href="#schema-repository">Schema Repository</a></h2>
<pre class="mermaid">graph LR
subgraph mozilla-pipeline-schemas
subgraph origin/main
templates --&gt;|cmake| schemas
end
subgraph origin/generated-schemas
schemas --&gt;|mozilla-schema-generator| artifact(schemas)
end
end
</pre>
<p><strong>Figure</strong>: <em>Template schemas are built locally to generate static JSON Schema. On a regular basis,
the Mozilla Schema Generator is run to generate BigQuery schemas.</em></p>
<p>Refer to <a href="concepts/pipeline/../../cookbooks/new_ping.html">Sending a Custom Ping</a> for an in-depth guide for adding new
schemas to the repository.</p>
<h3 id="schema-transpiler"><a class="header" href="#schema-transpiler">Schema Transpiler</a></h3>
<p>The structure validated in JSON Schema can be mapped to BigQuery columns.
This is done by the <code>jsonschema-transpiler</code>, a Rust application for translating between schema formats.
<a href="concepts/pipeline/schemas.html#decoding">Data normalization as part of decoding</a> is required before inserting into BigQuery e.g. snake casing and type casting.
These workarounds are based transformations that are done when importing Avro into BigQuery.</p>
<pre class="mermaid">graph LR
%% nodes
subgraph input
json(JSON Schemas)
end
subgraph output
avro(Avro schemas)
bigquery(BigQuery schemas)
end
transpiler[jsonschema-transpiler]
%% hyperlinks
click json &quot;https://json-schema.org/&quot;
click avro &quot;https://avro.apache.org/docs/current/spec.html&quot;
click bigquery &quot;https://cloud.google.com/bigquery/docs/schemas&quot;
click transpiler &quot;https://github.com/mozilla/jsonschema-transpiler&quot;
%% edges
json --&gt; transpiler
transpiler --&gt; avro
transpiler --&gt; bigquery
</pre>
<h3 id="mozilla-schema-generator"><a class="header" href="#mozilla-schema-generator">Mozilla Schema Generator</a></h3>
<p>The schema generator will populate schemas with metadata and insert generated sub-schemas at certain paths.
It generates JSON Schemas that are translated into BigQuery schemas, but <em>not</em> used for validation.
It uses the probe information service to enumerate map-type fields.
These fields are converted into a structured column that can be accessed in BigQuery with <a href="https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators">Standard SQL</a>.
Metadata includes fields added during <a href="concepts/pipeline/schemas.html#data-ingestion">data ingestion</a>, such as <code>submission_timestamp</code> and <code>sample_id</code>.</p>
<p>In addition to generating BigQuery schemas, schemas are aliased in several locations.
For example, the <code>first_shutdown</code> ping is a copy of the <code>main_ping</code>.
Schemas are also altered in the generator to accommodate various edge-cases in the data.
For example, a field that validates both boolean and integer types may be altered to assume a boolean type.</p>
<p>The main entry-point is a script that merges and generates <code>*.schema.json</code> under the <code>schemas</code> directory, then translates these to <code>*.bq</code>.
It commits the schema to the <code>generated-schemas</code> branch, with a change-log referencing commits in the <code>main</code> branch.</p>
<h2 id="data-ingestion"><a class="header" href="#data-ingestion">Data Ingestion</a></h2>
<h3 id="validation"><a class="header" href="#validation">Validation</a></h3>
<p>Data that fails validation is sent to the <code>payload_bytes_error</code> table. Each row contains
information about the error that caused it, as well as the name of the job associated with it.</p>
<pre><code class="language-sql">SELECT
document_namespace,
document_type,
document_version,
error_message,
error_type,
exception_class,
job_name
FROM
`moz-fx-data-shared-prod`.payload_bytes_error.telemetry
WHERE
submission_timestamp &gt; TIMESTAMP_SUB(current_timestamp, INTERVAL 1 hour)
AND exception_class = 'org.everit.json.schema.ValidationException'
LIMIT 5
</code></pre>
<div class="table-wrapper"><table><thead><tr><th>Column</th><th>Example Value</th><th>Notes</th></tr></thead><tbody>
<tr><td><code>document_namespace</code></td><td>telemetry</td><td></td></tr>
<tr><td><code>document_type</code></td><td>main</td><td></td></tr>
<tr><td><code>document_version</code></td><td>null</td><td>The version in the <code>telemetry</code> namespace is generated after validation</td></tr>
<tr><td><code>error_message</code></td><td><code>org.everit.json.schema.ValidationException: #/environment/system/os/version: #: no subschema matched out of the total 1 subschemas</code></td><td></td></tr>
<tr><td><code>error_type</code></td><td><code>ParsePayload</code></td><td>The <code>ParsePayload</code> type is associated with schema validation or corrupt data</td></tr>
<tr><td><code>exception_class</code></td><td><code>org.everit.json.schema.ValidationException </code></td><td>Java <a href="https://github.com/everit-org/json-schema">JSON Schema Validator</a> library</td></tr>
<tr><td><code>job_name</code></td><td><code>decoder-0-0121192636-9c56ac6a</code></td><td>Name of the Dataflow job that can be used to determine the version of the schema artifact</td></tr>
</tbody></table>
</div>
<h3 id="decoding-1"><a class="header" href="#decoding-1">Decoding</a></h3>
<p>The BigQuery schemas are used to normalize relevant payload data and determine additional
properties. Normalization involves renaming field names and transforming certain types of data.
Snake casing is employed across all schemas and ensures a consistent querying experience. Some data
must be transformed before insertion, such as map-types (a.k.a. dictionaries in Python), due to
limitations in BigQuery data representation. Other data may not be specified in the schema, and
instead placed into a specially constructed column named <code>additional_properties</code>.</p>
<h4 id="name-normalization"><a class="header" href="#name-normalization">Name Normalization</a></h4>
<p>A reference <a href="https://github.com/acmiyaguchi/test-casing/blob/master/src/main.py">Python
implementation</a> of the snake
casing algorithm is ensured to be compatible with the implementations in the decoder and transpiler using <a href="https://github.com/acmiyaguchi/test-casing/tree/master/test-cases">a shared
test-suite</a>. To illustrate the
transformation, consider the <a href="https://probes.telemetry.mozilla.org/?view=detail&amp;probeId=scalar%2Fa11y.theme"><code>a11y.theme</code> keyed
scalar</a> in the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/data/main-ping.html">main
ping</a>. In
the JSON document, as seen in <code>about:telemetry</code>, it is accessed as follows:</p>
<pre><code class="language-python"># Python/Javascript syntax
ping[&quot;payload&quot;][&quot;processes&quot;][&quot;parent&quot;][&quot;keyedScalars&quot;][&quot;a11y.theme&quot;]
</code></pre>
<p>The decoder will normalize the path with snake casing in BigQuery:</p>
<pre><code class="language-sql">SELECT
payload.processes.parent.keyed_scalars.a11y_theme
FROM `moz-fx-data-shared-prod`.telemetry.main
WHERE date(submission_timestamp) = date_sub(current_date, interval 1 day)
LIMIT 1
</code></pre>
<h4 id="data-structure-normalization"><a class="header" href="#data-structure-normalization">Data Structure Normalization</a></h4>
<p>The decoder is also responsible for transforming the data to
accommodate BigQuery limitations in data representation. All transformations are defined in
<a href="https://github.com/mozilla/gcp-ingestion/blob/main/ingestion-beam/src/main/java/com/mozilla/telemetry/transforms/PubsubMessageToTableRow.java"><code>ingestion-beam</code> under
<code>com.mozilla.telemetry.transforms.PubsubMessageToTableRow</code></a>.</p>
<p>The following transformations are currently applied:</p>
<div class="table-wrapper"><table><thead><tr><th>Transformation</th><th>Description</th></tr></thead><tbody>
<tr><td>Map Types</td><td>JSON objects that contain an unbounded number of keys with a shared value type are represented as a <a href="concepts/pipeline/../../cookbooks/bigquery/querying.html#accessing-map-like-fields">repeated structure containing a <code>key</code> and <code>value</code> column</a>.</td></tr>
<tr><td>Nested Arrays</td><td>Nested lists are represented using a structure containing a repeated <code>list</code> column.</td></tr>
<tr><td>Tuples to Anonymous Structures</td><td>A <a href="https://json-schema.org/understanding-json-schema/reference/array.html#tuple-validation">tuple of items</a> is represented as an anonymous structure with column names starting at <code>_0</code> up to <code>_{n}</code> where <code>n</code> is the length of the tuple.</td></tr>
<tr><td>JSON to String coercion</td><td>A sub-tree in a JSON document will be coerced to string if specified in the BigQuery schema. One example of this transformation is to <a href="concepts/pipeline/../../cookbooks/bigquery/querying.html#accessing-histograms">represent histograms in the main ping</a>.</td></tr>
<tr><td>Boolean to Integer coercion</td><td>A boolean may be cast into an integer type.</td></tr>
</tbody></table>
</div>
<p>Additional properties are fields within the ingested JSON document that are not found in the schema.
When all transformations are completed, any fields that were not traversed in the schema will be
reconstituted into the <a href="concepts/pipeline/../../cookbooks/bigquery/querying.html#structure-of-ping-tables-in-bigquery">top-level <code>additional_properties</code>
field</a>.</p>
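<p>A brief sketch of what these transformations look like on the query side, reusing the keyed scalar example from above (the <code>JSON_EXTRACT</code> path is a hypothetical placeholder for a field that was not in the schema):</p>
<pre><code class="language-sql">-- Map types become repeated key/value structs that can be unnested, while
-- fields missing from the schema survive as JSON text in additional_properties.
SELECT
  kv.key,
  kv.value,
  JSON_EXTRACT(additional_properties, '$.payload.someUnknownField') AS extra_field
FROM
  `moz-fx-data-shared-prod`.telemetry.main,
  UNNEST(payload.processes.parent.keyed_scalars.a11y_theme) AS kv
WHERE
  DATE(submission_timestamp) = DATE_SUB(CURRENT_DATE, INTERVAL 1 DAY)
LIMIT 10
</code></pre>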
<h2 id="deploying-to-bigquery"><a class="header" href="#deploying-to-bigquery">Deploying to BigQuery</a></h2>
<p>In this section, we discuss deployment of generated schemas to BigQuery. Refer to <a href="concepts/pipeline/../../cookbooks/bigquery/querying.html#table-layout-and-naming">Table Layout and
Naming</a> for details about the resulting
structure of the projects.</p>
<p>Tables are updated on every push to <code>generated-schemas</code>. The schemas must be backwards
compatible; otherwise, the checks in the staging Dataflow and BigQuery instances will fail. This must
be resolved by pushing a new tip to the <code>generated-schemas</code> branch in the schema repository. <a href="https://cloud.google.com/bigquery/docs/managing-table-schemas">Valid
changes to schemas</a> include relaxing
a column mode from <code>REQUIRED</code> to <code>NULLABLE</code> or adding new columns.</p>
<p>Each table is tagged with the revision of the schema repository it was built from. Consider the
<code>org_mozilla_fenix</code> namespace:</p>
<pre><code class="language-bash">$ bq ls --max_results=3 moz-fx-data-shared-prod:org_mozilla_fenix_stable
tableId Type Labels Time Partitioning Clustered Fields
------------------- ------- --------------------------------------- ----------------------------------- -------------------------------
activation_v1 TABLE schema_id:glean_ping_1 DAY (field: submission_timestamp) normalized_channel, sample_id
schemas_build_id:202001230145_be1f11e
baseline_v1 TABLE schema_id:glean_ping_1 DAY (field: submission_timestamp) normalized_channel, sample_id
schemas_build_id:202001230145_be1f11e
bookmarks_sync_v1 TABLE schema_id:glean_ping_1 DAY (field: submission_timestamp) normalized_channel, sample_id
</code></pre>
<p>The <code>schema_id</code> is derived from the value of the <code>$schema</code> property of each JSON Schema.
The <code>schemas_build_id</code> label contains an identifier that includes the timestamp of the generated schema.
This label may be used to trace the last deployed commit from <code>generated-schemas</code>.</p>
<h3 id="updating-generated-schemas"><a class="header" href="#updating-generated-schemas">Updating generated-schemas</a></h3>
<pre class="mermaid">graph TD
subgraph workflow.tmo
manual
scheduled
end
subgraph mozilla-pipeline-schemas
main
schemas(generated-schemas)
end
generator(mozilla-schema-generator)
%%
manual --&gt; |run now| generator
scheduled --&gt; |run at midnight UTC| generator
main --&gt;|git pull| generator
generator --&gt; |git push| schemas
</pre>
<p>A new push to the <code>generated-schemas</code> branch is made every time the <a href="https://github.com/mozilla/telemetry-airflow/blob/main/dags/probe_scraper.py"><code>probe-scraper.schema_generator</code></a> task is run by Airflow.
<code>mozilla-schema-generator</code> runs in a container that commits snapshots of generated schemas to the remote repository.
Generated schemas may change when <code>probe-scraper</code> finds new probes in defined repositories e.g. <code>hg.mozilla.org</code> or <a href="https://github.com/mozilla/probe-scraper/blob/main/repositories.yaml"><code>glean</code></a>.
It may also change when the <code>main</code> branch contains new or updated schemas under the <code>schemas/</code> directory.</p>
<p>To manually trigger a new push, clear the state of a single task in the workflow admin UI.
To update the schedule and dependencies, update the DAG definition.</p>
<h3 id="deploying-schemas-to-production"><a class="header" href="#deploying-schemas-to-production">Deploying schemas to production</a></h3>
<pre class="mermaid">graph TD
subgraph mozilla-pipeline-schemas
schemas(generated-schemas)
end
artifact[Generate schema artifact]
subgraph moz-fx-data-shar-nonprod-efed
bigquery[Update BigQuery tables]
views[Update BigQuery views]
ingestion[Redeploy Dataflow ingestion]
end
status{Deploy prod}
schemas --&gt; |labeled and archived| artifact
artifact --&gt; |run terraform| bigquery
bigquery --&gt; |run terraform| views
views --&gt; |drain and submit| ingestion
ingestion --&gt; status
</pre>
<p>Jenkins is used to automate deploys of the pipeline in the <code>nonprod</code> and <code>prod</code> projects.
Jenkins polls the <code>generated-schemas</code> branch for new commits.
The tip of the branch will be labeled and archived into an artifact that is used during deploys.
The artifact is first used to update the table schemas in the <code>nonprod</code> project.
This staging step will stop on schema-incompatible changes, such as removing a schema or a column in a schema.
Once the tables are up to date, the Dataflow job will be drained and redeployed so it is writing to the updated tables.
Once schemas have successfully deployed to the <code>nonprod</code> project, they may be manually promoted to production by an operator.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/pipeline/schemas.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="glean-data"><a class="header" href="#glean-data">Glean Data</a></h1>
<p>The following describes in detail how we structure Glean data in BigQuery. For information on
the actual software which does this, see the <a href="concepts/pipeline/schemas.html">Generated Schemas</a> reference.
This document is intended as a reference; if you want a tutorial on how best to access Glean data in BigQuery,
see <a href="concepts/pipeline/../../cookbooks/accessing_glean_data.html">Accessing Glean Data</a>.</p>
<h2 id="tables"><a class="header" href="#tables">Tables</a></h2>
<p>Each ping type is recorded in its own table, and these tables are named using <code>{application_id}.{ping_type}</code>.
For example, for Fenix, the application id is <code>org.mozilla.fenix</code>, so its <code>metrics</code> pings are available in the table <code>org_mozilla_fenix.metrics</code>.</p>
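<p>For example, a minimal query against that table might look like the following (a sketch, assuming access through the <code>moz-fx-data-shared-prod</code> project where these ping tables are exposed):</p>
<pre><code class="language-sql">-- Count Fenix metrics pings received on a given day.
SELECT
  COUNT(*) AS ping_count
FROM
  `moz-fx-data-shared-prod.org_mozilla_fenix.metrics`
WHERE
  DATE(submission_timestamp) = '2021-01-01'
</code></pre>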
<h2 id="columns"><a class="header" href="#columns">Columns</a></h2>
<p>Fields are nested inside BigQuery STRUCTs to organize them into groups, and we can use dot notation to specify individual subfields in a query.
For example, columns containing Glean's built-in client information are in the <code>client_info</code> struct, so accessing its columns involves using a <code>client_info.</code> prefix.</p>
<p>The top-level groups are:</p>
<ul>
<li><code>client_info</code>: <a href="https://mozilla.github.io/glean/book/user/pings/index.html#the-client_info-section">Client information provided by Glean</a>.</li>
<li><code>ping_info</code>: <a href="https://mozilla.github.io/glean/book/user/pings/index.html#the-ping_info-section">Ping information provided by Glean</a>.</li>
<li><code>metrics</code>: <a href="https://mozilla.github.io/glean/book/user/metrics/index.html">Custom metrics</a> defined by the application and its libraries.</li>
<li><code>events</code>: <a href="https://mozilla.github.io/glean/book/user/metrics/event.html">Custom events</a> defined by the application and its libraries.</li>
</ul>
<h3 id="ping-and-client-info-sections"><a class="header" href="#ping-and-client-info-sections">Ping and Client Info sections</a></h3>
<p><a href="https://mozilla.github.io/glean/book/user/pings/index.html#glean-pings">Core attributes sent with every ping</a> are mapped to the <a href="https://mozilla.github.io/glean/book/user/pings/index.html#the-client_info-section"><code>client_info</code></a> and <a href="https://mozilla.github.io/glean/book/user/pings/index.html#the-ping_info-section"><code>ping_info</code></a> sections.
For example, the client id is mapped to a column called <code>client_info.client_id</code>.</p>
<h3 id="the-metrics-group"><a class="header" href="#the-metrics-group">The <code>metrics</code> group</a></h3>
<p>Custom metrics in the <code>metrics</code> section have two additional levels of indirection in their column name: they are organized by the metric type, and then by their category: <code>metrics.{metric_type}.{category}_{name}</code>.</p>
<p>For example, suppose you had the following <code>boolean</code> metric defined in a <code>metrics.yaml</code> file (abridged for clarity):</p>
<pre><code class="language-yaml">browser:
is_default:
type: boolean
description: &gt;
Is this application the default browser?
send_in_pings:
- metrics
</code></pre>
<p>It would be available in the column <code>metrics.boolean.browser_is_default</code>.</p>
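<p>As an illustration, a query over that column might look like this (hypothetical: the column only exists if the application actually defines and sends such a metric):</p>
<pre><code class="language-sql">-- Inspect the boolean metric alongside the Glean client id.
SELECT
  m.client_info.client_id,
  m.metrics.boolean.browser_is_default
FROM
  `moz-fx-data-shared-prod.org_mozilla_fenix.metrics` AS m
WHERE
  DATE(m.submission_timestamp) = '2021-01-01'
LIMIT
  10
</code></pre>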
<h3 id="the-events-group"><a class="header" href="#the-events-group">The <code>events</code> group</a></h3>
<p>Events are stored as a set of records in a single column called &quot;events&quot;: there might be many events sent as part of a single ping.
Each record has the following fields which allow you to filter for the specific metrics of interest:</p>
<ul>
<li>category (maps to the metric category)</li>
<li>name (maps to the metric name)</li>
</ul>
<p>For example, suppose you had the following <code>event</code> metric defined in a <code>metrics.yaml</code> file (again, abridged for clarity):</p>
<pre><code class="language-yaml">engine_tab:
foreground_metrics:
type: event
description: |
Event collecting data about the state of tabs when the app comes back to
the foreground.
extra_keys:
background_active_tabs:
description: |
Number of active tabs (with an engine session assigned) when the app
went to the background.
...
</code></pre>
<p>In this case the event's <code>category</code> would be <code>engine_tab</code> and its name would be <code>foreground_metrics</code>.</p>
<p>You can use the record's <code>timestamp</code> and <code>extra</code> fields to get the event's timestamp and specifics related
to the event.
For a complete example, see <a href="concepts/pipeline/../../cookbooks/accessing_glean_data.html#event-metrics">&quot;event metrics&quot; under Accessing Glean Data</a>.</p>
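<p>As a shorter sketch, unnesting the <code>events</code> column and filtering on the hypothetical event above (which only exists in the table if the application actually defines and sends it) might look like this:</p>
<pre><code class="language-sql">-- Pull individual event records out of the repeated `events` column.
SELECT
  event.timestamp,
  event.extra
FROM
  `moz-fx-data-shared-prod.org_mozilla_fenix.events` AS e
CROSS JOIN
  UNNEST(e.events) AS event
WHERE
  DATE(e.submission_timestamp) = '2021-01-01'
  AND event.category = 'engine_tab'
  AND event.name = 'foreground_metrics'
LIMIT
  10
</code></pre>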
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/pipeline/glean_data.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="channel-normalization"><a class="header" href="#channel-normalization">Channel Normalization</a></h1>
<p>This document describes how the data pipeline normalizes channel information
sent by Firefox and makes it accessible to data consumers.</p>
<ul>
<li><a href="concepts/channels/channel_normalization.html#what-are-firefox-channels">What are Firefox channels?</a></li>
<li><a href="concepts/channels/channel_normalization.html#app-update-channel">App Update Channel</a></li>
<li><a href="concepts/channels/channel_normalization.html#normalized-channel">Normalized Channel</a></li>
</ul>
<h2 id="what-are-firefox-channels"><a class="header" href="#what-are-firefox-channels">What are Firefox channels?</a></h2>
<p>In addition to the <code>release</code> channel, which is what we ship to most users,
we also ship development versions of Firefox and an &quot;extended support release&quot; (<code>esr</code>).
The full list is:</p>
<ul>
<li><code>release</code></li>
<li><code>beta</code></li>
<li><code>aurora</code> (this is <code>dev-edition</code>, and <a href="https://www.mozilla.org/en-US/firefox/developer/">is just a beta repack</a>)</li>
<li><code>nightly</code></li>
<li><code>esr</code></li>
</ul>
<p>For more information on this topic, see the <a href="https://wiki.mozilla.org/Release_Management/Release_Process">Firefox Release Process page</a>.</p>
<h2 id="app-update-channel"><a class="header" href="#app-update-channel">App Update Channel</a></h2>
<p>This is the channel reported by Firefox.
This could really be anything, but is usually one of the expected release channels listed above.</p>
<p>For BigQuery tables corresponding to Telemetry Ping types, such as <code>main</code>, <code>crash</code> or <code>event</code>,
the field here is called <code>app_update_channel</code> and is found in <code>metadata.uri</code>. For example:</p>
<pre><code class="language-sql">SELECT
metadata.uri.app_update_channel
FROM
telemetry.main
WHERE
DATE(submission_timestamp) = '2019-09-01'
LIMIT
10
</code></pre>
<h2 id="normalized-channel"><a class="header" href="#normalized-channel">Normalized Channel</a></h2>
<p>This field is a normalization of the directly reported channel, and replaces unusual
and unexpected values with the string <code>Other</code>.
There are a couple of exceptions, notably that variations on <code>nightly-cck-*</code> become <code>nightly</code>.
<a href="https://github.com/mozilla/gcp-ingestion/blob/92ba503c4debc887e746d5f2ff5ee60becb8072f/ingestion-beam/src/main/java/com/mozilla/telemetry/transforms/NormalizeAttributes.java#L38">See the relevant code here</a>.</p>
<p>Normalized channel is available in the Telemetry Ping tables as a top-level field
called <code>normalized_channel</code>.
For example:</p>
<pre><code class="language-sql">SELECT
normalized_channel
FROM
telemetry.crash
WHERE
DATE(submission_timestamp) = '2019-09-01'
LIMIT
10
</code></pre>
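<p>As a rough cross-check, the normalization rule described above can also be approximated directly in SQL. This is illustrative only; the authoritative logic is the linked Java code running in the ingestion pipeline:</p>
<pre><code class="language-sql">-- Approximate channel normalization and compare it against the pipeline's value.
SELECT
  CASE
    WHEN metadata.uri.app_update_channel IN ('release', 'beta', 'aurora', 'nightly', 'esr')
      THEN metadata.uri.app_update_channel
    WHEN STARTS_WITH(metadata.uri.app_update_channel, 'nightly-cck-')
      THEN 'nightly'
    ELSE 'Other'
  END AS approx_normalized_channel,
  normalized_channel
FROM
  telemetry.main
WHERE
  DATE(submission_timestamp) = '2019-09-01'
LIMIT
  10
</code></pre>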
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/channels/channel_normalization.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="sampling-in-telemetry-data"><a class="header" href="#sampling-in-telemetry-data">Sampling in Telemetry data</a></h1>
<p>Since the early days of Telemetry, it has been desirable to have a quick and
simple way to do analysis on a sample of the full population of Firefox
clients.</p>
<p>The mechanism for doing that is encoded in the data itself, namely the
<code>sample_id</code> field.</p>
<p>This is a field that is computed from the telemetry <code>client_id</code> using
the <a href="https://en.wikipedia.org/wiki/Cyclic_redundancy_check">CRC</a> hash function.</p>
<p>This CRC hash is then bucketed into 100 possible values from 0 to 99,
each of which represents a roughly 1% uniform sample of the <code>client_id</code> space.</p>
<p>All ping tables that contain a client id, as well as many derived datasets,
include the <code>sample_id</code> field.</p>
<p>TL;DR <code>sample_id = crc32(client_id) % 100</code></p>
<p>An example python implementation:</p>
<pre><code class="language-python"># USAGE: python cid2sid.py 859c8a32-0b73-b547-a5e7-8ef4ed9c4c2d
# Prints
# Client ID b'859c8a32-0b73-b547-a5e7-8ef4ed9c4c2d' =&gt; Sample ID 55
import binascii
import sys
clientid = sys.argv[1].encode()
crc = binascii.crc32(clientid)
sampleid = (crc &amp; 0xFFFFFFFF) % 100
print(&quot;Client ID {} =&gt; Sample ID {}&quot;.format(clientid, sampleid))
</code></pre>
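<p>In practice this means that filtering on a single <code>sample_id</code> value yields a roughly 1% sample of clients, and counts can be scaled up by a factor of about 100 for a rough full-population estimate. A minimal sketch (assuming <code>telemetry.clients_daily</code>, where <code>sample_id</code> is stored as an integer; some older datasets store it as a string):</p>
<pre><code class="language-sql">-- Estimate the total number of distinct clients from a ~1% sample.
SELECT
  COUNT(DISTINCT client_id) * 100 AS approx_client_count
FROM
  telemetry.clients_daily
WHERE
  submission_date = '2021-01-01'
  AND sample_id = 42
</code></pre>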
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/sample_id.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="filtering-data"><a class="header" href="#filtering-data">Filtering Data</a></h1>
<h2 id="table-of-contents-8"><a class="header" href="#table-of-contents-8">Table of Contents</a></h2>
<ul>
<li><a href="concepts/pipeline/filtering.html#overview">Overview</a></li>
<li><a href="concepts/pipeline/filtering.html#stages">Stages</a>
<ul>
<li><a href="concepts/pipeline/filtering.html#edge-filtering">Edge filtering</a></li>
<li><a href="concepts/pipeline/filtering.html#beam-filtering">Beam Filtering</a></li>
<li><a href="concepts/pipeline/filtering.html#json-schema-filtering">JSON Schema Filtering</a></li>
<li><a href="concepts/pipeline/filtering.html#filtering-from-the-stable-tables">Filtering from the stable tables</a></li>
<li><a href="concepts/pipeline/filtering.html#filtering-from-the-exposed-views">Filtering from the exposed views</a></li>
<li><a href="concepts/pipeline/filtering.html#optional-filtering-in-looker-explores">Optional filtering in Looker Explores</a></li>
</ul>
</li>
<li><a href="concepts/pipeline/filtering.html#querying-the-error-stream">Querying the Error Stream</a></li>
</ul>
<h2 id="overview-4"><a class="header" href="#overview-4">Overview</a></h2>
<p>Data is filtered out of production streams at almost every stage of the pipeline.
The following outlines each stage and both the data currently filtered and the
data that could be filtered. This should help answer two classes of questions:</p>
<ol>
<li>Did my data get filtered out?</li>
<li>We've uncovered spurious data being ingested, how should we handle that?</li>
</ol>
<p><em>Note</em>: <a href="concepts/pipeline/filtering.html#json-schema-filtering">JSON Schema filtering</a> is our primary method of filtering out bad data. That should be used before any other methods of dropping data from the pipeline.</p>
<h2 id="stages"><a class="header" href="#stages">Stages</a></h2>
<p><strong>Where</strong> - Which stage of the pipeline this filtering occurs in</p>
<p><strong>What</strong> - What happens to the data when filtered here</p>
<p><strong>When</strong> - Which situations this filtering should be, and is, used in</p>
<p><strong>How</strong> - What kind of data can be filtered at this stage</p>
<h3 id="edge-filtering"><a class="header" href="#edge-filtering">Edge filtering</a></h3>
<p>Where: Filtered by nginx, <a href="https://github.com/mozilla-services/cloudops-infra/blob/master/projects/data-ingestion/k8s/charts/data-ingestion/templates/filter-configmap.yaml#L14-L41">currently we use it to filter out non-v4 pings</a>
(<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1678497">to be removed, but capability to remain</a>).</p>
<p>What: Drops data entirely from the pipeline; there will be no traces of it downstream from the edge server.</p>
<p>When: Only to be used in extreme situations (e.g. PII exposure). We also use it for dropping <a href="https://github.com/mozilla/gcp-ingestion/blob/master/docs/architecture/overview.md#limits">too-large messages</a> and <a href="https://github.com/mozilla/gcp-ingestion/blob/master/ingestion-edge/ingestion_edge/util.py#L95">headers</a>.</p>
<p>How: Can be used to filter by URI, namespaces, apps, etc. (from the URL or from the HTTP headers); but not anything in the payload.</p>
<h3 id="beam-filtering"><a class="header" href="#beam-filtering">Beam Filtering</a></h3>
<p>Where: Filtered in the <a href="https://github.com/mozilla/gcp-ingestion/blob/master/ingestion-beam/src/main/java/com/mozilla/telemetry/decoder/MessageScrubber.java">message scrubber</a>.</p>
<p>What: Causes data to be written to the <a href="concepts/pipeline/filtering.html#querying-the-error-stream">error stream</a> or to be dropped entirely.</p>
<p>When: Filter out data we absolutely know we will never need to see (e.g. data from forked applications).</p>
<p>How: Can filter out namespaces, doctypes, or URIs currently; in the extreme can filter on any <a href="https://github.com/mozilla/gcp-ingestion/blob/master/ingestion-core/src/main/java/com/mozilla/telemetry/ingestion/core/Constant.java#L8">message Attribute</a> or payload field.</p>
<h3 id="json-schema-filtering"><a class="header" href="#json-schema-filtering">JSON Schema Filtering</a></h3>
<p>Where: During ingestion, as defined in the <a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/">payload schema</a>.</p>
<p>What: Causes data to be written to the <a href="concepts/pipeline/filtering.html#querying-the-error-stream">error stream</a>.</p>
<p>When: When trying to remove bad analysis data that we know we will never need (e.g. huge values, improper strings, etc.). Usually these indicate something went wrong with the payload.</p>
<p>How: Can filter on values in the payload, using the <a href="https://json-schema.org/understanding-json-schema/">JSON schema</a>.</p>
<h3 id="filtering-from-the-stable-tables"><a class="header" href="#filtering-from-the-stable-tables">Filtering from the stable tables</a></h3>
<p>Where: After ingestion to live tables, but <a href="https://github.com/mozilla/bigquery-etl/blob/master/bigquery_etl/copy_deduplicate.py#L40">before copying to the stable tables</a>.</p>
<p>What: Allows data to exist in the live tables, but removes it from the stable tables.</p>
<p>When: Use for data that may be needed for some analyses on recent data, but not for data that will need long-term historical analyses or for use in any downstream reporting. For example, we <a href="https://mozilla.github.io/glean/book/user/debugging/index.html?highlight=sourcetags#enabling-debugging-features-through-environment-variables">filter out pings from automation</a> (e.g. CI) here, so that analysis is unaffected by them, but we can still analyze what the recent CI data looks like. We also drop duplicate pings (per the document-id).</p>
<p>How: Can filter on any field in the schema, or any metadata.</p>
<h3 id="filtering-from-the-exposed-views"><a class="header" href="#filtering-from-the-exposed-views">Filtering from the exposed views</a></h3>
<p>Where: After ingestion to stable tables (<a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/telemetry/lockwise_mobile_events_v1/view.sql#L17">example</a>).</p>
<p>What: Allows data to exist in stable tables, but not be exposed to users when accessing views.</p>
<p>When: Use for data that is a footgun for end-users (e.g. data that was collected before a product was launched), but will probably be needed by data science or eng.</p>
<p>How: Can filter on any field in the schema, or any metadata.</p>
<h3 id="optional-filtering-in-looker-explores"><a class="header" href="#optional-filtering-in-looker-explores">Optional filtering in Looker Explores</a></h3>
<p>Where: In the explore, Looker creates a <a href="https://docs.looker.com/reference/field-params/default_value">default filter for a field</a>.</p>
<p>What: Allows data to exist in views, and optionally allows users to query that data (but not by default).</p>
<p>When: Use this for data that, most of the time, should not be queried in Looker. The downside is that too many of these filters will clutter the Looker explore.</p>
<p>How: Can filter on any field in the schema, or any metadata.</p>
<h2 id="querying-the-error-stream"><a class="header" href="#querying-the-error-stream">Querying the Error Stream</a></h2>
<p>The data engineering team has exposed some tables to make querying the error stream easier.</p>
<p><a href="https://sql.telemetry.mozilla.org/dashboard/schema-errors">The schema errors dashboard</a> will let you choose your namespace and doctype to see
errors over the past week.</p>
<p>If that data is not granular enough, the error stream can be queried directly:</p>
<pre><code class="language-sql">SELECT
udf.parse_desktop_telemetry_uri(uri) AS parsed_uri,
* EXCEPT(payload),
udf_js.gunzip(payload) AS payload
FROM
`moz-fx-data-shared-prod.payload_bytes_error.telemetry`
WHERE
DATE(submission_timestamp) = &quot;2021-01-07&quot;
LIMIT
1000
</code></pre>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/pipeline/filtering.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="bigquery-artifact-deployment"><a class="header" href="#bigquery-artifact-deployment">BigQuery Artifact Deployment</a></h1>
<p>Artifacts that get deployed automatically, usually during nightly Airflow runs, to BigQuery include:</p>
<ul>
<li>user-defined functions</li>
<li>datasets</li>
<li>tables</li>
<li>views</li>
</ul>
<h2 id="dataset-deployment"><a class="header" href="#dataset-deployment">Dataset Deployment</a></h2>
<p>Terraform will deploy datasets defined in <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a> and <a href="https://github.com/mozilla/private-bigquery-etl">private-bigquery-etl</a> and datasets that are configured via <a href="https://github.com/mozilla-services/cloudops-infra/blob/master/projects/data-shared/tf/prod/envs/prod/bigquery-new/namespaces.auto.tfvars.json">cloudops-infra</a>. The dataset deployment is triggered via Jenkins whenever bigquery-etl publishes a new container or after schema deployment.</p>
<h2 id="user-defined-function-udf-deployment"><a class="header" href="#user-defined-function-udf-deployment">User-defined Function (UDF) Deployment</a></h2>
<p>There are two categories of user-defined functions:</p>
<ul>
<li>UDFs for internal use only: These UDFs are published to the <code>udf</code> and <code>udf_js</code> datasets in the <code>moz-fx-data-shared-prod</code> project and managed via <a href="https://github.com/mozilla/bigquery-etl/tree/main/sql/moz-fx-data-shared-prod/udf">bigquery-etl</a></li>
<li>public UDFs that can be used even outside of Mozilla projects: These UDFs are published to the <code>mozfun</code> project and managed via <a href="https://github.com/mozilla/bigquery-etl/tree/main/sql/mozfun">bigquery-etl</a></li>
</ul>
<p>The UDF deploy is triggered through the <code>publish_public_udfs</code> and <code>publish_persistent_udfs</code> Airflow tasks in the <a href="https://workflow.telemetry.mozilla.org/tree?dag_id=bqetl_artifact_deployment"><code>bqetl_artifact_deployment</code></a> Airflow DAG, which run nightly but can be triggered manually by clearing the tasks.</p>
<h2 id="table-deployment"><a class="header" href="#table-deployment">Table Deployment</a></h2>
<p>Tables get deployed through the <a href="https://workflow.telemetry.mozilla.org/tree?dag_id=bqetl_artifact_deployment"><code>publish_new_tables</code> Airflow task</a>, which runs nightly. This task runs all SQL generators, generates schemas for each query, deploys schemas for new tables, and applies any schema-compatible changes to existing tables. The task will fail if changes that are incompatible with existing schemas (such as removing fields or changing field types) would be applied.</p>
<h2 id="view-deployment"><a class="header" href="#view-deployment">View Deployment</a></h2>
<p>View deployment runs after new tables have been published, through the <a href="https://workflow.telemetry.mozilla.org/tree?dag_id=bqetl_artifact_deployment"><code>publish_views</code> Airflow task</a>. This task runs all SQL generators, which ensures that the SQL for generated views is available. Only newly created views and views with changes to their SQL definition, schema, or metadata will be deployed.</p>
<p>Views that have been defined in bigquery-etl will be tagged with a <code>managed</code> label. This label is used to automatically remove views from BigQuery that have been deleted in bigquery-etl. Having this label ensures that manually created views or views created through other tooling won't get deleted as part of this process.</p>
<p>Views get published to <code>moz-fx-data-shared-prod</code> and publicly-facing views get published to <code>mozdata</code>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/pipeline/artifact_deployment.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="analysis-gotchas"><a class="header" href="#analysis-gotchas">Analysis Gotchas</a></h1>
<p>When you perform an analysis on any data, there are some mistakes that are easy to make:</p>
<ul>
<li>Do you know what question you hope to answer?</li>
<li>Is your sample representative of your population?</li>
<li>Is your result &quot;real&quot;? How precisely can you state your conclusion?</li>
</ul>
<p>This section is not about those traps. Instead, it is about quirks and pitfalls that are specific to Mozilla's data systems.</p>
<h2 id="table-of-contents-9"><a class="header" href="#table-of-contents-9">Table of Contents</a></h2>
<ul>
<li><a href="concepts/analysis_gotchas.html#intermittent-issues">Intermittent issues</a></li>
<li><a href="concepts/analysis_gotchas.html#notable-historic-events">Notable historic events</a></li>
<li><a href="concepts/analysis_gotchas.html#pseudo-replication">Pseudo-replication</a></li>
<li><a href="concepts/analysis_gotchas.html#profiles-vs-users">Profiles vs Users</a></li>
<li><a href="concepts/analysis_gotchas.html#opt-in-versus-opt-out">Opt-in versus Opt-out</a></li>
<li><a href="concepts/analysis_gotchas.html#trusting-dates">Trusting Dates</a></li>
<li><a href="concepts/analysis_gotchas.html#date-formats">Date Formats</a></li>
<li><a href="concepts/analysis_gotchas.html#delays">Delays</a>
<ul>
<li><a href="concepts/analysis_gotchas.html#pingsender">Pingsender</a></li>
<li><a href="concepts/analysis_gotchas.html#submission-date">Submission Date</a></li>
</ul>
</li>
<li><a href="concepts/analysis_gotchas.html#pings-from-robots">Pings from Robots</a></li>
<li><a href="concepts/analysis_gotchas.html#build-ids">Build Ids</a></li>
</ul>
<h2 id="intermittent-issues"><a class="header" href="#intermittent-issues">Intermittent issues</a></h2>
<p>Despite best efforts, problems may occur from time to time with the ingestion of data or the faithful creation of datasets.</p>
<p>Issues undergoing investigation are marked with the <code>[data-quality]</code> whiteboard tag in Bugzilla. See <a href="https://bugzilla.mozilla.org/buglist.cgi?bug_status=UNCONFIRMED&amp;bug_status=NEW&amp;bug_status=ASSIGNED&amp;bug_status=REOPENED&amp;classification=Client%20Software&amp;classification=Developer%20Infrastructure&amp;classification=Components&amp;classification=Server%20Software&amp;classification=Other&amp;priority=P1&amp;priority=P2&amp;priority=P3&amp;priority=--&amp;product=Data%20Platform%20and%20Tools&amp;resolution=---&amp;status_whiteboard=%5Bdata-quality%5D&amp;status_whiteboard_type=allwordssubstr&amp;list_id=15179084">currently open issues</a>.</p>
<p>Especially severe problems with production data are announced on the <code>fx-data-dev</code> mailing list (see <a href="concepts/getting_help.html">getting help</a>). Subscribing to this mailing list is recommended if you are a current or aspiring data practitioner.</p>
<h2 id="notable-historic-events"><a class="header" href="#notable-historic-events">Notable historic events</a></h2>
<p>See also <a href="https://docs.google.com/spreadsheets/d/16Cyx_KBieRdQkSBKolivqpBaK2H-VceN9LEZcL0snHg/edit#gid=0">the spreadsheet of notable historic events</a>. This spreadsheet is imported into BigQuery, and can be found at <code>moz-fx-data-shared-prod.static.data_incidents_v1</code>.</p>
<p><strong>If you add an entry here, please add it to that spreadsheet as well!</strong></p>
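<p>The imported table can be queried directly, for example:</p>
<pre><code class="language-sql">SELECT
  *
FROM
  `moz-fx-data-shared-prod.static.data_incidents_v1`
LIMIT
  100
</code></pre>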
<p>When you start to evaluate trends, be aware of events from the past that may invite comparisons with history. Here are a few to keep in mind:</p>
<ul>
<li><strong>Aug 22, 2024</strong> - Adjust is disabled in mobile clients (Firefox Android &amp; Firefox iOS).</li>
<li><strong>May 20, 2024 - June 14, 2024</strong> - Excessive Glean database writes degraded Fenix performance on startup, pageload, scrolling, video playback, and possibly other areas. Some metrics were disabled. See <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1892230">bug 1892230</a>.</li>
<li><strong>Apr 18, 2024</strong> - Google began rolling out a new SERP experience on Firefox Android, which impacted standard search metrics. See <a href="https://docs.google.com/spreadsheets/d/16Cyx_KBieRdQkSBKolivqpBaK2H-VceN9LEZcL0snHg/edit#gid=0">the spreadsheet</a> for more details.</li>
<li><strong>March 15, 2024</strong> - Spike in clients with null default search engine (~32%) for Firefox iOS. This issue was fixed for Firefox 126 and above. More details can be found in this <a href="https://docs.google.com/document/d/1iMoGOnTZZ920oth6aWbzD_npP79vSQq6OQx_bz4vFXw/edit">incident report</a></li>
<li><strong>Feb 29, 2024</strong> - Spike in the <a href="https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/metrics/search_engine_default_changed">search engine changed probe</a> for users who had an engine update on versions &gt;= 124. This is due to the default-search-engine-changed probe being triggered during engine updates even when users don't actually change their default search engine post-update (<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1876178">bug 1876178</a>). More details can be found <a href="https://mozilla-hub.atlassian.net/browse/RS-1051">here</a>.</li>
<li><strong>Jan 15 - May 1, 2024</strong> - Legacy Telemetry pings containing os information from Arch Linux clients without the <code>lsb-release</code> package were <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1875874">dropped</a>.</li>
<li><strong>Dec 8, 2023</strong> - (ongoing) Firefox iOS Clients coming through the French ISP Netskope report an increased number of 4xx HTTP errors on submission (<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1870670">bug 1870670</a>).</li>
<li><strong>Dec 7, 2023</strong> - Contextual Services data for Firefox Desktop versions 116 and up <a href="http://mozilla-hub.atlassian.net/browse/DSRE-1489">now supplied by Glean</a>.</li>
<li><strong>Nov 20, 2023</strong> - Changeover day for Onboarding data sent via Messaging System from PingCentre to Glean. Views and datasets downstream of <code>messaging_system.onboarding</code> <a href="https://github.com/mozilla/bigquery-etl/pull/4457">began being fueled by Glean-sent data instead of PingCentre-sent data</a>.</li>
<li><strong>Jul 16, 2023 - Jul 24, 2023</strong> - During the migration from release to ESR of Firefox users on obsolete versions of MacOS and Windows, Firefox sent deletion request pings for clients in the migration, which also reset the <code>client_id</code>. <a href="https://docs.google.com/document/d/1vdn9OFSoKPD5wt14dmTwyh0kGs-96fWx26ESui95jo0/edit">See the summary of the incident here</a>. Approximately 2 million clients were affected by this bug; as a result, around 1.3 million clients were double counted because both the old <code>client_id</code> and the reset <code>client_id</code> were active on the same day.</li>
<li><strong>Mar 17, 2023 - May 9, 2023</strong> - Firefox for Android was collecting but not sending <code>perf.page_load</code> events during this period. The recorded events started being sent after May 9, 2023 resulting in a spike of events that eventually returned to normal levels. <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1833178">See Bug 1833178 for more info.</a></li>
<li><strong>Mar 14, 2023</strong> - Firefox for Android began reporting significantly fewer new installs, due to <a href="https://docs.google.com/document/d/1Tf8F2FndPsOAWc7peLxgUZd4t-LUJu8RMotiDhgKF7I/edit#heading=h.xyargldz6xg0">a fix for Client ID regeneration</a>. This also affected retention for both new and existing users.</li>
<li><strong>Nov 15, 2022</strong> - A major bug in the <code>search with ads</code> probes was fixed on Firefox Desktop. The <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1800506">bug fix</a> impacts Firefox 109+ and resulted in significant increases in the number of searches with ads recorded.</li>
<li><strong>Aug 31 2022</strong> - <a href="https://mozilla-hub.atlassian.net/browse/DSRE-999">A small number of records were missing from stable tables until October 5, 2022 and not reprocessed into downstream ETL tables</a>.</li>
<li><strong>July 19 - August 3, 2022</strong> - <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1781085">Fenix v103 seeing an increase in <code>null</code> values in <code>client_info</code> fields</a>.
Glean failed to properly collect data for the <code>client_info</code> fields <code>android_sdk_version</code>, <code>device_model</code>, <code>device_manufacturer</code> and <code>locale</code>.
This has been fixed in subsequent releases and is fixed in Fenix 103.2 and all later releases.
No backfill.</li>
<li><strong>May 24 - Jun 10, 2022</strong> - <code>search_with_ads</code> drops on Firefox Desktop globally. Upon <a href="https://docs.google.com/document/d/1wdU1O6Anmqs87PdyYXympTXHznoskU6pVdSgTS6ilpA/edit">investigation</a>, the issue is believed to be related to Google's core algorithm update in May 2022.</li>
<li><strong>May 15, 2022</strong> - <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1673868">Fixed potential under report <code>search_with_ads</code></a>.
Ad impressions were not tracked for SERP that took longer than 1 second to load. This was initially uncovered by QA for ad impressions on DuckDuckGo SERP. The fix addresses for all search partners and is not limited to DuckDuckGo.</li>
<li><strong>Dec 1, 2021 - Jan 23, 2022</strong> - <a href="https://mozilla-hub.atlassian.net/browse/DO-673">Search values in Android Focus from core telemetry fell</a>.</li>
<li><strong>Nov 16, 2021</strong> - <a href="https://mozilla-hub.atlassian.net/browse/DS-1843">Submissions were rejected from 17:44 to 18:10 UTC</a>.</li>
<li><strong>Nov 4, 2021</strong> - <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1676676">CORS headers added to support receiving submissions from Glean.js websites</a>.</li>
<li><strong>Sep 30 2021 - Oct 06 2021</strong> - <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1733953">Submissions from some countries were rejected</a>.</li>
<li><strong>Sep 30 2021 - Oct 04 2021</strong> - <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1733953">Submissions from clients on some older platforms were dropped</a>.</li>
<li><strong>Aug 23 2021 - Aug 29 2021</strong> - <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1729069">Approximately 1/251 of pings were improperly labeled as coming from Kansas City, US</a>.</li>
<li><strong>Aug 05 2021 - Aug 31 2021</strong> - Drop in search metrics (<code>tagged_sap</code>, <code>tagged_follow_on</code>, <code>search_with_ads</code>, <code>ad_click</code>) in Fenix due to probe expiry. <a href="https://docs.google.com/document/d/1C29HmYponPcqtX4yR4QA7uBkhhkAM76WqMW3PQBnL_g/edit">Incident report</a> and <a href="https://sql.telemetry.mozilla.org/queries/82098/source#203423"><code>STMO#203423</code></a>.</li>
<li><strong>Feb 16 2021 - Feb 25 2021</strong> - <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1694764">A small number of stub installer pings may have been discarded due to URI deduplication</a>.</li>
<li><strong>Jan 28, 2021</strong> - <a href="https://docs.google.com/document/d/1MEsAUqjaIZCUtWLFAhXHxq-m1hDEw1QAOKYEva4DDZk/edit">Fenix DAU jumped rapidly, due to increased sending of the baseline ping</a></li>
<li><strong>August 6, 2020</strong> - <a href="https://github.com/mozilla/bigquery-etl/pull/1215">Pings with &quot;automation&quot; tag in X-Source-Tags will no longer appear in stable tables</a>
This is particularly relevant for removing pings related to automated testing of Fenix.</li>
<li><strong>August 1, 2020 - August 31, 2020</strong> - <a href="https://sql.telemetry.mozilla.org/queries/89203#220890">Fennec was migrated to Fenix</a>, causing changes in both how data was reported (Glean rather than the core ping) and some reported metrics (e.g. DAU, as people dropped off).</li>
<li><strong>July 20, 2020</strong> - <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1653244">Glean dropping application lifetime metrics from <code>metrics</code> pings</a>.
Glean Android bindings from version <code>v25.0.0</code> up to and including <code>v31.4.0</code> had a bug that would cause metrics with “lifetime: application” to be cleared before they could be collected for metrics pings sent during startup. This can result in application lifetime metrics like experiment information being randomly missing from the data.</li>
<li><strong>April 14, 2020</strong> - <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1630096">Telemetry edge server rejects pings for an hour</a>.
Clients generally retry periodically until success is achieved. Therefore, most of these messages were eventually ingested; submission timestamps appear later than normal. A small number of pings are attributed to a later day or were never sent due to the client never being reopened.</li>
<li><strong>February 11, 2020</strong> - Firefox 73 was released and began the start of <a href="https://docs.google.com/document/d/1oJhnvAOx2c8Mp-Xpk-3j-2d45yu_fghYS2yAbn1aeNY/edit#heading=h.iba82gckexg7">4-week release cycles</a>. There was a gradual transition from 7/8 week ones to 4 week ones.</li>
<li><strong>December 4 2019</strong> - <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1598815">AWS Ingestion Pipeline decommissioned</a>.
Specifically, the last ping relayed through the AWS machinery had a
timestamp of <code>2019-12-04 22:04:45.912204 UTC</code>.</li>
<li><strong>October 29 2019</strong> - Glean SDK Timing Distribution(s) report buckets
1 nanosecond apart. This occurred because of a potential rounding bug in Glean SDK
versions less than <code>19.0.0</code>. See <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1591938">Bug 1591938</a>.</li>
<li><strong>October 23 2019</strong> - <a href="https://docs.google.com/document/d/1gQF-iU3E21SG985Cl2Ius4LoRXduUrNa5In9hafLIqs/edit">Hot-fix shipped through add-ons</a> that
reset the Telemetry endpoint preference back to the default for a large number of users.</li>
<li><strong>September 1 - October 18 2019</strong> - BigQuery Ping tables are
<a href="https://github.com/mozilla-services/cloudops-infra/pull/1491">missing the <code>X-PingSender-Version</code> header information</a>.
This data is available before and after this time period.</li>
<li><strong>May 4 - May 11 2019</strong> - <a href="https://blog.mozilla.org/blog/2019/05/09/what-we-do-when-things-go-wrong/">Telemetry source data deleted</a>.
No source data is available for this period and derived tables may have
missing days or imputed values.
Derived tables that depend on multiple days may have affected dates
beyond the deletion region.</li>
<li><strong>January 31 2019</strong> - <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1474285">Profile-per-install</a> landed in <code>mozilla-central</code>
and affected how new profiles were created.
See <a href="https://github.com/mozilla/bigquery-etl/issues/212">discussion in <code>bigquery-etl#212</code></a>.</li>
<li><strong>October 25 2018</strong> - many <code>client_id</code>s on Firefox Android were reset to the
same <code>client_id</code>.
For more information, see the <a href="https://docs.google.com/document/d/1r1PDQnqhsrPkft0pB46v9uhXGxR_FzK4laKJLGttXdA">post-mortem</a>
or <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1501329">Bug 1501329</a>.</li>
<li><strong>November 2017</strong> - Quantum Launch. There was a surge in new profiles and usage.</li>
<li><strong>June 1 and 5, 2016</strong> - <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1482509">Main Summary <code>v4</code> data is missing</a>
for these two days.</li>
<li><strong>March 2016</strong> - Unified Telemetry launched.</li>
</ul>
<h2 id="pseudo-replication"><a class="header" href="#pseudo-replication">Pseudo-replication</a></h2>
<p>Telemetry data is a collection of pings.
A single main-ping represents a single subsession.
Some clients have more subsessions than others.</p>
<p>When you say <a href="https://mzl.la/2q75dbF">&quot;63% of beta 53 has Firefox set as its default browser&quot;</a>, you need to specify that it is 63% of <em>pings</em> because it represents only around 46% of clients.
(Apparently users with Firefox Beta 53 set as their default browser submitted
more main-pings than users who did not).</p>
<h2 id="profiles-vs-users"><a class="header" href="#profiles-vs-users">Profiles vs Users</a></h2>
<p>You may have noticed that the term &quot;clients&quot; and not &quot;users&quot; was used in the section above. That is because, of all the things that can be counted, users is not one of them:</p>
<p>Users can have multiple Firefox profiles that run on the same system at
the same time (like developers).</p>
<p>Users can have the same Firefox profile that runs on several systems on
different days of the week (also developers).</p>
<p>The only things we can count are pings and clients.
Clients can be counted because a <code>client_id</code> is sent with each ping that uniquely
identifies the profile from which it originated.
This is generally close enough to the idea of a &quot;user&quot; that we often count profiles and call them users. However, you may run into some instances
where the distinction matters.</p>
<p>When in doubt, be precise. You count <em>clients</em>.</p>
<p><a href="concepts/./profile/index.html">This article</a> describes the concept of &quot;profiles&quot; in detail.</p>
<h2 id="opt-in-versus-opt-out"><a class="header" href="#opt-in-versus-opt-out">Opt-in versus Opt-out</a></h2>
<p>Mozilla does not collect the same information from everyone.</p>
<p>Every profile that does not have Telemetry disabled sends &quot;opt-out&quot; Telemetry, which
includes:</p>
<ul>
<li>Nearly all the data in the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/environment.html">Environment</a></li>
<li>Some specific <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/histograms.html">Histograms</a>, <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/scalars.html">Scalars</a>, and <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/events.html">Events</a> that are marked
<code>&quot;releaseChannelCollection&quot;: &quot;opt-out&quot;</code></li>
</ul>
<p>Most probes are &quot;opt-in&quot;: no information is received from them unless a user
opts into sending this information by installing a pre-release version of Firefox:
Beta, Nightly, or Developer Edition (the last is similar to Beta).</p>
<p>If you want to encourage users to collect good information for Mozilla, ask them to install a Beta release.</p>
<h2 id="trusting-dates"><a class="header" href="#trusting-dates">Trusting Dates</a></h2>
<p>Do not assume that the time reported by an instance of Firefox desktop is correct. The situation is somewhat better on mobile devices, but you should still be cautious.</p>
<p>Any timestamp recorded by the user is subject to &quot;clock skew.&quot;
The user's clock can be set (purposefully or accidentally) to any time at all.
SSL certificates tend to keep client timestamps within a relatively accurate window,
because a clock that has been set too far in the past or too far in the future
will confuse certificate expiration checks.</p>
<p>Examples of client times from Firefox desktop pings:</p>
<ul>
<li><code>crashDate</code></li>
<li><code>crashTime</code></li>
<li><code>meta/Date</code></li>
<li><code>sessionStartDate</code></li>
<li><code>subsessionStartDate</code></li>
<li><code>profile/creationDate</code></li>
</ul>
<p>Examples of client times from Glean pings:</p>
<ul>
<li><a href="https://mozilla.github.io/glean/book/user/pings/index.html#the-ping_info-section"><code>ping_info.end_time</code></a></li>
</ul>
<p>Examples of server times that you can trust:</p>
<ul>
<li><code>submission_timestamp</code></li>
<li><code>submission_date</code></li>
</ul>
<p><em>Note</em> <code>submission_date</code> does not appear in the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/common-ping.html">ping documentation</a>
because it is added in post-processing.</p>
<h2 id="date-formats"><a class="header" href="#date-formats">Date Formats</a></h2>
<p>Not all dates and times are created equal.
Most of the dates and times in Telemetry pings are <a href="https://en.wikipedia.org/wiki/ISO_8601">ISO 8601</a>.
Most are full timestamps, though their resolution may differ from per-second to per-day.</p>
<p>Then there's <code>profile/creationDate</code>, which is just a number of days since the Unix epoch (January 1, 1970):
for example, <code>17177</code> for the date 2017-01-11.</p>
<p><strong>Tip:</strong> If you want to convert <code>profile/creationDate</code> to a usable date in SQL:
<code>DATE_FROM_UNIX_DATE(SAFE_CAST(environment.profile.creation_date AS INT64))</code></p>
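<p>In context, that expression can be used like any other column (a sketch against <code>telemetry.main</code>, which exposes the field as <code>environment.profile.creation_date</code>):</p>
<pre><code class="language-sql">SELECT
  DATE_FROM_UNIX_DATE(SAFE_CAST(environment.profile.creation_date AS INT64)) AS profile_creation_date
FROM
  telemetry.main
WHERE
  DATE(submission_timestamp) = '2019-09-01'
LIMIT
  10
</code></pre>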
<p>In derived datasets ISO dates are sometimes converted to strings using one of
the following formats: <code>%Y-%m-%d</code> or <code>%Y%m%d</code>.</p>
<p>The date formats for different rows in <code>main_summary</code> are described on the
<a href="concepts/../datasets/batch_view/main_summary/reference.html#time-formats"><code>main_summary</code> reference page</a>.</p>
<p>Although build IDs look like dates, they are not. If you take the first eight characters, you can use them as a proxy for the day when the build was released.</p>
<p><code>metadata.header.date</code> represents an HTTP Date header in a <a href="https://datatracker.ietf.org/doc/html/rfc7231#section-7.1.1.1">RFC 7231</a>-compatible format.</p>
<p><strong>Tip:</strong> If you want to parse <code>metadata/Date</code> to become a usable date in SQL:
<code>SAFE.PARSE_TIMESTAMP('%a, %d %b %Y %T %Z', REPLACE(metadata.header.date, 'GMT+00:00', 'GMT'))</code>
Alternatively, you can use the already-parsed version that is available in user-facing views (<code>metadata.header.parsed_date</code>).</p>
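<p>In context, the parsing expression can be compared against the trusted server timestamp (a sketch against <code>telemetry.main</code>):</p>
<pre><code class="language-sql">SELECT
  SAFE.PARSE_TIMESTAMP('%a, %d %b %Y %T %Z',
    REPLACE(metadata.header.date, 'GMT+00:00', 'GMT')) AS client_http_date,
  submission_timestamp
FROM
  telemetry.main
WHERE
  DATE(submission_timestamp) = '2019-09-01'
LIMIT
  10
</code></pre>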
<h2 id="delays"><a class="header" href="#delays">Delays</a></h2>
<p>There is an inherent delay between Telemetry data being created on the client and it being received by Mozilla.
Most Telemetry data produced by desktop Firefox is represented in the main ping. It is sent at the beginning of a client's <em>next</em> Firefox session.
If the user shuts down Firefox for the weekend, Mozilla does not receive any data generated on Friday until Monday morning.</p>
<p>Generally speaking, data from two days ago is usually fairly representative.</p>
<p>If you'd like to read more about this subject, there is a series of blog posts <a href="https://chuttenblog.wordpress.com/2017/02/09/data-science-is-hard-client-delays-for-crash-pings/">here</a>, <a href="https://chuttenblog.wordpress.com/2017/07/12/latency-improvements-or-yet-another-satisfying-graph/">here</a> and <a href="https://chuttenblog.wordpress.com/2017/09/12/two-days-or-how-long-until-the-data-is-in/">here</a>.</p>
<h3 id="pingsender"><a class="header" href="#pingsender">Pingsender</a></h3>
<p>Pingsender greatly reduces any delay before sending pings to Mozilla.
However, only some types of pings are sent by Pingsender.
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1310703">Bug 1310703</a> introduced Pingsender for crash pings and was merged in Firefox 54,
which was included in release on June 13, 2017.
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1336360">Bug 1336360</a> moved shutdown pings to Pingsender and was merged in Firefox 55,
which was included in release on August 8, 2017.
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1374270">Bug 1374270</a> added sending health pings on shutdown using Pingsender and was
merged in Firefox 56, which was included in release on Sept 28, 2017.
Other types of pings are not sent with Pingsender.
This is usually okay because Firefox is expected to continue to run long
enough to send these pings.</p>
<p>Mobile clients do not have Pingsender. Therefore, a delay occurs as described in <a href="https://sql.telemetry.mozilla.org/queries/49867#134105"><code>STMO#49867</code></a>.</p>
<h3 id="submission-date"><a class="header" href="#submission-date">Submission Date</a></h3>
<p><code>submission_date</code> or <code>submission_timestamp</code> represents the server time at which a ping is received from a client. It is used as a partitioning column (useful for both query optimization and restricting the range of data under consideration) and should be considered reliable.</p>
<p>In <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1422892">bug 1422892</a> it was decided
to standardize on using <code>submission_date</code> as opposed to client-generated dates.</p>
<p>Summary of reasons for this decision:</p>
<ul>
<li>not subject to client clock skew</li>
<li>doesn't require normalization</li>
<li>good for backfill</li>
<li>good for daily processing</li>
<li>and usually good enough</li>
</ul>
<h2 id="pings-from-robots"><a class="header" href="#pings-from-robots">Pings from Robots</a></h2>
<p>In general, data coming from an application instance not run by a human is not wanted in analysis. As of this writing, <a href="https://github.com/mozilla/geckodriver">GeckoDriver</a> (one of the official mechanisms to launch and control an automated version of Firefox for e.g. web compatibility testing) is <a href="https://searchfox.org/mozilla-central/rev/baf1cd492406a9ac31d9ccb7a51c924c7fbb151f/testing/geckodriver/src/prefs.rs#154">configured <em>not</em> to send Telemetry by default</a> but we can't control for other things people might do in the field.</p>
<p>On desktop, one field to watch out for is headless mode (<code>environment.system.gfx.headless</code> in the main ping): if that field is set, you can be certain you are not looking at a copy of Firefox being operated by a real human. You can see an example of some client pings with this field set skewing the nightly numbers in <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1643341">bug 1643341</a>. An easy solution is to filter out these types of clients in your analysis. You can see an example of this pattern in <a href="https://sql.telemetry.mozilla.org/queries/71781/source"><code>STMO#71781</code></a>.</p>
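<p>A minimal sketch of that filtering pattern (assuming the <code>telemetry.main</code> view, where the field appears as <code>environment.system.gfx.headless</code>):</p>
<pre><code class="language-sql">-- Exclude pings from headless (automated) Firefox instances.
SELECT
  COUNT(*) AS ping_count
FROM
  telemetry.main
WHERE
  DATE(submission_timestamp) = '2019-09-01'
  AND NOT COALESCE(environment.system.gfx.headless, FALSE)
</code></pre>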
<h2 id="build-ids"><a class="header" href="#build-ids">Build Ids</a></h2>
<p>Generally speaking, application versions are monotonically increasing multipart alphanumeric strings like &quot;89.0a1&quot; or &quot;68.0.3&quot;.
Build Ids are not this.
A Build Id is a sequence of characters that is unique to a specific build of a product.
Since the application version may not vary across shipped versions (for example, a Firefox nightly version stays the same across its entire cycle), a build id helps identify which code changes were included in a build as well as what features may have been enabled for it.
For example, in Firefox Desktop, the build id is the date and time the build was built in yyyymmddhhmmss format.
A build id might be formatted in any way and contain the time or version control system revision of the code included in the build.</p>
<p>Do not assume build ids are consistent across the products we ship. A build id format may vary between products, between channels of the same product, or over time within the same channel of the same product.
The build id format for Firefox Desktop has been very stable over time thus far, but even it can be different for different platforms in some respin circumstances (if e.g. only one platform's builder failed).</p>
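<p>If you do need an approximate build date for Firefox Desktop, a common trick is to parse the first eight characters of the build id (a sketch, assuming the field is exposed as <code>environment.build.build_id</code> in <code>telemetry.main</code>):</p>
<pre><code class="language-sql">-- Extract an approximate build date from a yyyymmddhhmmss build id.
SELECT
  SAFE.PARSE_DATE('%Y%m%d', SUBSTR(environment.build.build_id, 1, 8)) AS build_date
FROM
  telemetry.main
WHERE
  DATE(submission_timestamp) = '2019-09-01'
LIMIT
  10
</code></pre>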
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/analysis_gotchas.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="sql-style-guide"><a class="header" href="#sql-style-guide">SQL Style Guide</a></h1>
<h2 id="table-of-contents-10"><a class="header" href="#table-of-contents-10">Table of Contents</a></h2>
<ul>
<li><a href="concepts/sql_style.html#consistency">Consistency</a></li>
<li><a href="concepts/sql_style.html#reserved-words">Reserved Words</a></li>
<li><a href="concepts/sql_style.html#variable-names">Variable Names</a></li>
<li><a href="concepts/sql_style.html#be-explicit">Be Explicit</a>
<ul>
<li><a href="concepts/sql_style.html#aliasing">Aliasing</a></li>
<li><a href="concepts/sql_style.html#joins">Joins</a></li>
<li><a href="concepts/sql_style.html#grouping-columns">Grouping Columns</a></li>
</ul>
</li>
<li><a href="concepts/sql_style.html#left-align-root-keywords">Left Align Root Keywords</a></li>
<li><a href="concepts/sql_style.html#code-blocks">Code Blocks</a></li>
<li><a href="concepts/sql_style.html#join-conditions">Join Conditions</a></li>
<li><a href="concepts/sql_style.html#parentheses">Parentheses</a></li>
<li><a href="concepts/sql_style.html#boolean-at-the-beginning-of-line">Boolean at the Beginning of Line</a></li>
<li><a href="concepts/sql_style.html#nested-queries">Nested Queries</a></li>
<li><a href="concepts/sql_style.html#about-this-document">About this Document</a></li>
</ul>
<h2 id="consistency"><a class="header" href="#consistency">Consistency</a></h2>
<p>From <a href="https://www.python.org/dev/peps/pep-0008/#a-foolish-consistency-is-the-hobgoblin-of-little-minds">Pep8</a>:</p>
<blockquote>
<p>A style guide is about consistency.
Consistency with this style guide is important.
Consistency within a project is more important.
Consistency within one module or function is the most important.</p>
<p>However, know when to be inconsistent --
sometimes style guide recommendations just aren't applicable.
When in doubt, use your best judgment.
Look at other examples and decide what looks best.
And don't hesitate to ask!</p>
</blockquote>
<h2 id="reserved-words"><a class="header" href="#reserved-words">Reserved Words</a></h2>
<p>Always use uppercase for reserved keywords like <code>SELECT</code>, <code>WHERE</code>, or <code>AS</code>.</p>
<h2 id="variable-names"><a class="header" href="#variable-names">Variable Names</a></h2>
<ol>
<li>Use consistent and descriptive identifiers and names.</li>
<li>Use lower case names with underscores, such as <code>first_name</code>.
Do not use camelCase.</li>
<li>Functions, such as <code>cardinality</code>, <code>approx_distinct</code>, or <code>substr</code>,
<a href="https://www.postgresql.org/docs/10/static/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS">are identifiers</a>
and should be treated like variable names.</li>
<li>Names must begin with a letter and may not end in an underscore.</li>
<li>Only use letters, numbers, and underscores in variable names.</li>
</ol>
<h2 id="be-explicit"><a class="header" href="#be-explicit">Be Explicit</a></h2>
<p>When choosing between explicit or implicit syntax, prefer explicit.</p>
<h3 id="aliasing"><a class="header" href="#aliasing">Aliasing</a></h3>
<p>Always include the <code>AS</code> keyword when aliasing a variable or table name,
it's easier to read when explicit.</p>
<p><strong>Good</strong></p>
<pre><code class="language-sql">SELECT
date(submission_timestamp) AS day
FROM
telemetry.main
LIMIT
10
</code></pre>
<p><strong>Bad</strong></p>
<pre><code class="language-sql">SELECT
date(submission_timestamp) day
FROM
telemetry.main
LIMIT
10
</code></pre>
<h3 id="joins"><a class="header" href="#joins">Joins</a></h3>
<p>Always include the <code>JOIN</code> type rather than relying on the default join.</p>
<p><strong>Good</strong></p>
<pre><code class="language-sql">-- BigQuery Standard SQL Syntax
SELECT
submission_date,
experiment.key AS experiment_id,
experiment.value AS experiment_branch,
count(*) AS count
FROM
telemetry.clients_daily
CROSS JOIN
UNNEST(experiments.key_value) AS experiment
WHERE
submission_date &gt; '2019-07-01'
AND sample_id = '10'
GROUP BY
submission_date,
experiment_id,
experiment_branch
</code></pre>
<p><strong>Bad</strong></p>
<pre><code class="language-sql">-- BigQuery Standard SQL Syntax
SELECT
submission_date,
experiment.key AS experiment_id,
experiment.value AS experiment_branch,
count(*) AS count
FROM
telemetry.clients_daily,
UNNEST(experiments.key_value) AS experiment -- Implicit JOIN
WHERE
submission_date &gt; '2019-07-01'
AND sample_id = '10'
GROUP BY
1, 2, 3 -- Implicit grouping column names
</code></pre>
<h3 id="grouping-columns"><a class="header" href="#grouping-columns">Grouping Columns</a></h3>
<p>In the previous example, implicit grouping columns were discouraged, but there are cases where it makes sense.</p>
<p>In some SQL flavors (such as <a href="https://prestodb.github.io/docs/current/sql/select.html">Presto</a>) grouping elements must refer to the expression before any aliasing is done. If you are grouping by a complex expression it may be desirable to use implicit grouping columns rather than repeating the expression.</p>
<p><strong>Good</strong></p>
<pre><code class="language-sql">-- BigQuery SQL Syntax
SELECT
submission_date,
normalized_channel IN ('nightly', 'aurora', 'beta') AS is_prerelease,
count(*) AS count
FROM
telemetry.clients_daily
WHERE
submission_date &gt; '2019-07-01'
GROUP BY
submission_date,
is_prerelease -- Grouping by aliases is supported in BigQuery
</code></pre>
<p><strong>Good</strong></p>
<pre><code class="language-sql">-- Presto SQL Syntax
SELECT
submission_date,
normalized_channel IN ('nightly', 'aurora', 'beta') AS is_prerelease,
count(*) AS count
FROM
telemetry.clients_daily
WHERE
submission_date &gt; '20190701'
GROUP BY
1, 2 -- Implicit grouping avoids repeating expressions
</code></pre>
<p><strong>Bad</strong></p>
<pre><code class="language-sql">-- Presto SQL Syntax
SELECT
submission_date,
normalized_channel IN ('nightly', 'aurora', 'beta') AS is_prerelease,
count(*) AS count
FROM
telemetry.clients_daily
WHERE
submission_date &gt; '20190701'
GROUP BY
submission_date,
normalized_channel IN ('nightly', 'aurora', 'beta')
</code></pre>
<h2 id="left-align-root-keywords"><a class="header" href="#left-align-root-keywords">Left Align Root Keywords</a></h2>
<p>Root keywords should all start on the same character boundary.
This is counter to the common &quot;rivers&quot; pattern
<a href="https://www.sqlstyle.guide/#spaces">described here</a>.</p>
<p><strong>Good</strong>:</p>
<pre><code class="language-sql">SELECT
client_id,
submission_date
FROM
main_summary
WHERE
sample_id = '42'
AND submission_date &gt; '20180101'
LIMIT
10
</code></pre>
<p><strong>Bad</strong>:</p>
<pre><code class="language-sql">SELECT client_id,
submission_date
FROM main_summary
WHERE sample_id = '42'
AND submission_date &gt; '20180101'
</code></pre>
<h2 id="code-blocks"><a class="header" href="#code-blocks">Code Blocks</a></h2>
<p>Root keywords should be on their own line.
For example:</p>
<p><strong>Good</strong>:</p>
<pre><code class="language-sql">SELECT
client_id,
submission_date
FROM
main_summary
WHERE
submission_date &gt; '20180101'
AND sample_id = '42'
LIMIT
10
</code></pre>
<p>It's acceptable to include an argument on the same line as the root keyword,
if there is exactly one argument.</p>
<p><strong>Acceptable</strong>:</p>
<pre><code class="language-sql">SELECT
client_id,
submission_date
FROM main_summary
WHERE
submission_date &gt; '20180101'
AND sample_id = '42'
LIMIT 10
</code></pre>
<p>Do not include multiple arguments on one line.</p>
<p><strong>Bad</strong>:</p>
<pre><code class="language-sql">SELECT client_id, submission_date
FROM main_summary
WHERE
submission_date &gt; '20180101'
AND sample_id = '42'
LIMIT 10
</code></pre>
<p><strong>Bad</strong></p>
<pre><code class="language-sql">SELECT
client_id,
submission_date
FROM main_summary
WHERE submission_date &gt; '20180101'
AND sample_id = '42'
LIMIT 10
</code></pre>
<h2 id="join-conditions"><a class="header" href="#join-conditions">Join Conditions</a></h2>
<p>The <code>ON</code> and <code>USING</code> keywords should start on a new line indented one level more than the join keyword
and be followed by the join conditions starting on the same line. For example:</p>
<p><strong>Good</strong>:</p>
<pre><code class="language-sql">...
FROM
telemetry_stable.main_v4
LEFT JOIN
static.normalized_os_name
ON main_v4.environment.system.os.name = normalized_os_name.os_name
</code></pre>
<p><strong>Bad</strong>:</p>
<pre><code class="language-sql">...
FROM
telemetry_stable.main_v4
LEFT JOIN
static.normalized_os_name ON main_v4.environment.system.os.name = normalized_os_name.os_name
</code></pre>
<p><strong>Bad</strong>:</p>
<pre><code class="language-sql">...
FROM
telemetry_stable.main_v4
LEFT JOIN
static.normalized_os_name
ON
main_v4.environment.system.os.name = normalized_os_name.os_name
</code></pre>
<h2 id="parentheses"><a class="header" href="#parentheses">Parentheses</a></h2>
<p>If parentheses span multiple lines:</p>
<ol>
<li>The opening parenthesis should terminate the line.</li>
<li>The closing parenthesis should be lined up under
the first character of the line that starts the multi-line construct.</li>
<li>The contents of the parentheses should be indented one level.</li>
</ol>
<p>For example:</p>
<p><strong>Good</strong></p>
<pre><code class="language-sql">WITH sample AS (
SELECT
client_id,
FROM
main_summary
WHERE
sample_id = '42'
)
</code></pre>
<p><strong>Bad</strong> (Terminating parenthesis on shared line)</p>
<pre><code class="language-sql">WITH sample AS (
SELECT
client_id,
FROM
main_summary
WHERE
sample_id = '42')
</code></pre>
<p><strong>Bad</strong> (No indent)</p>
<pre><code class="language-sql">WITH sample AS (
SELECT
client_id,
FROM
main_summary
WHERE
sample_id = '42'
)
</code></pre>
<h2 id="boolean-at-the-beginning-of-line"><a class="header" href="#boolean-at-the-beginning-of-line">Boolean at the Beginning of Line</a></h2>
<p><code>AND</code> and <code>OR</code> should always be at the beginning of the line.
For example:</p>
<p><strong>Good</strong></p>
<pre><code class="language-sql">...
WHERE
submission_date &gt; 20180101
AND sample_id = '42'
</code></pre>
<p><strong>Bad</strong></p>
<pre><code class="language-sql">...
WHERE
submission_date &gt; 20180101 AND
sample_id = '42'
</code></pre>
<h2 id="nested-queries"><a class="header" href="#nested-queries">Nested Queries</a></h2>
<p>Do not use nested queries.
Instead, use common table expressions to improve readability.</p>
<p><strong>Good</strong>:</p>
<pre><code class="language-sql">WITH sample AS (
SELECT
client_id,
submission_date
FROM
main_summary
WHERE
sample_id = '42'
)
SELECT *
FROM sample
LIMIT 10
</code></pre>
<p><strong>Bad</strong>:</p>
<pre><code class="language-sql">SELECT *
FROM (
SELECT
client_id,
submission_date
FROM
main_summary
WHERE
sample_id = '42'
)
LIMIT 10
</code></pre>
<h2 id="about-this-document"><a class="header" href="#about-this-document">About this Document</a></h2>
<p>This document was heavily influenced by <a href="https://www.sqlstyle.guide/">https://www.sqlstyle.guide/</a>.</p>
<p>Changes to the style guide should be reviewed by at least one member of
both the Data Engineering team and the Data Science team.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/sql_style.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="airflow-gotchas"><a class="header" href="#airflow-gotchas">Airflow Gotcha's</a></h1>
<p>Airflow is an integral part of our data platform. ETL processes, forecasts and various analyses are scheduled and monitored through Airflow. Our Airflow instance is hosted at <a href="https://workflow.telemetry.mozilla.org/home"><code>workflow.telemetry.mozilla.org</code></a> (WTMO).</p>
<h2 id="dags-are-automatically-generated-for-the-most-part"><a class="header" href="#dags-are-automatically-generated-for-the-most-part">DAGs are automatically generated for the most part</a></h2>
<p>Airflow DAGs for our ETL processes are automatically generated as part of <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a>. Queries are scheduled by specifying a DAG as part of the query's metadata; how to do this is described in detail in this <a href="https://mozilla.github.io/bigquery-etl/cookbooks/creating_a_derived_dataset/">guide to creating a derived dataset</a>. Generated DAGs are prefixed with <code>bqetl_</code>.</p>
<p>Some DAGs, for example for custom machine learning tasks or to schedule running custom tools, need to be defined manually. These DAGs need to be created in the <a href="https://github.com/mozilla/telemetry-airflow/tree/main/dags">telemetry-airflow</a> repository.</p>
<p>A separate script syncs generated bigquery-etl DAGs every 10 minutes to our Airflow instance. DAGs that live in telemetry-airflow get deployed via CircleCI whenever a change is pushed to <code>main</code>.</p>
<h2 id="new-dags-need-to-be-unpaused-manually"><a class="header" href="#new-dags-need-to-be-unpaused-manually">New DAGs need to be unpaused manually</a></h2>
<p>After adding a new DAG, either through bigquery-etl or telemetry-airflow, it takes about 10 minutes until the new DAG gets deployed. After deployment, the DAG is paused by default; it is necessary to manually <em>unpause</em> the DAG on <a href="https://workflow.telemetry.mozilla.org/home">WTMO</a>.</p>
<h2 id="external-task-dependencies-are-managed-via-externaltasksensors"><a class="header" href="#external-task-dependencies-are-managed-via-externaltasksensors">External task dependencies are managed via <code>ExternalTaskSensor</code>s</a></h2>
<p>Tasks are distributed across different Airflow DAGs. Usually, each DAG contains tasks that are closely related to a specific use case or for generating a set of related datasets. In many cases, tasks depend on other tasks that are running as part of a different DAG. For example, a lot of tasks depend on the <a href="https://github.com/mozilla/telemetry-airflow/blob/0ba2b5631f079fa90fe07467021fab0f9cfc7366/dags/copy_deduplicate.py#L116"><code>copy_deduplicate_main_ping</code></a> task.</p>
<p>External upstream dependencies are expressed using <a href="https://airflow.apache.org/docs/apache-airflow/1.10.3/_api/airflow/sensors/external_task_sensor/index.html"><code>ExternalTaskSensor</code>s</a>. These sensors ensure that the external upstream task is finished before the job that depends on that upstream task is executed. These sensors are usually defined like:</p>
<pre><code class="language-python">wait_for_bq_events = ExternalTaskSensor(
task_id=&quot;wait_for_bq_events&quot;, # name of this wait task as it will appear in the UI
external_dag_id=&quot;copy_deduplicate&quot;, # name of the external DAG
external_task_id=&quot;bq_main_events&quot;, # name of the external task
execution_delta=timedelta(hours=3), # delta based on differences in schedule between upstream DAG and current DAG
mode=&quot;reschedule&quot;, # use mode &quot;reschedule&quot; to unblock slots while waiting on upstream task to finish
allowed_states=ALLOWED_STATES, # pre-defined success states
failed_states=FAILED_STATES, # pre-defined failure states
pool=&quot;DATA_ENG_EXTERNALTASKSENSOR&quot;, # this slot pool is used for task sensors
email_on_retry=False,
dag=dag,
)
some_local_task.set_upstream(wait_for_bq_events)
</code></pre>
<p>It is important to note that the <code>execution_delta</code> needs to be set correctly based on the time difference between the schedule of the upstream DAG and the schedule of the downstream DAG. For example, if the upstream DAG is scheduled three hours earlier than the downstream DAG, the <code>execution_delta</code> should be <code>timedelta(hours=3)</code>. If the <code>execution_delta</code> is not set correctly, downstream tasks will wait indefinitely without ever getting executed.</p>
<p>While upstream dependencies between generated DAGs in bigquery-etl are determined automatically, dependencies between DAGs in telemetry-airflow and bigquery-etl need to be added manually, either to the DAG definition or to the scheduling metadata of the scheduled query.</p>
<h2 id="downstream-dependencies-are-managed-via-externaltaskmarkers"><a class="header" href="#downstream-dependencies-are-managed-via-externaltaskmarkers">Downstream dependencies are managed via <code>ExternalTaskMarker</code>s</a></h2>
<p><a href="https://airflow.apache.org/docs/apache-airflow/stable/_api/airflow/sensors/external_task/index.html#airflow.sensors.external_task.ExternalTaskMarker"><code>ExternalTaskMarker</code>s</a> are used to indicate all downstream dependencies to a task. Whenever the task is cleared with <em>Downstream Recursive</em> selected, then all downstream tasks will get cleared automatically. This is extremely useful when running backfill of Airflow tasks. When clearing the tasks, a pop-up will show all the downstream tasks that will get cleared. In case a task should be cleared without its downstream dependencies running as well, deselect the <em>Downstream Recursive</em> option.</p>
<p><code>ExternalTaskMarker</code>s are generally wrapped into a <code>TaskGroup</code> and defined like:</p>
<pre><code class="language-python">with TaskGroup('copy_deduplicate_all_external') as copy_deduplicate_all_external:
ExternalTaskMarker(
task_id=&quot;bhr_collection__wait_for_bhr_ping&quot;, # name of task marker task
external_dag_id=&quot;bhr_collection&quot;, # external downstream DAG
external_task_id=&quot;wait_for_bhr_ping&quot;, # external downstream task ID
execution_date=&quot;{{ execution_date.replace(hour=5, minute=0).isoformat() }}&quot;, # execution date calculated based on time differences in task schedules
)
</code></pre>
<p>Upstream dependencies between generated DAGs in bigquery-etl are determined automatically. If there are dependencies between DAGs in telemetry-airflow and bigquery-etl, these need to be added manually, either to the DAG definition or to the scheduling metadata of the scheduled query.</p>
<h2 id="the-dag-schedules-are-selected-based-on-schedules-of-upstream-dependencies"><a class="header" href="#the-dag-schedules-are-selected-based-on-schedules-of-upstream-dependencies">The DAG schedules are selected based on schedules of upstream dependencies</a></h2>
<p>The <code>schedule_interval</code> of a DAG should be set to a time that ensures all upstream dependencies have likely finished before tasks in the DAG get executed. Airflow will send an email notification every time a task needs to be rescheduled due to upstream dependencies not having finished. To reduce the number of notifications and avoid delays due to rescheduled tasks, the <code>schedule_interval</code> should be set based on when upstream tasks typically finish.</p>
<h2 id="dag-will-be-scheduled-for-one-schedule_interval-after-the-start_date"><a class="header" href="#dag-will-be-scheduled-for-one-schedule_interval-after-the-start_date">DAG will be scheduled for one <code>schedule_interval</code> after the <code>start_date</code></a></h2>
<p>The DAG will not run at the first available time after the <code>start_date</code> per its configured schedule, but rather will wait for one <code>schedule_interval</code> to elapse after the <code>start_date</code>. For example, if the <code>schedule_interval</code> specifies a daily run, then the run starting on <code>2023-04-18</code> will trigger after <code>2023-04-18 23:59</code>. See the <a href="https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dag-run.html#data-interval">Airflow docs</a> for more info.</p>
<h2 id="airflow-triage"><a class="header" href="#airflow-triage">Airflow triage</a></h2>
<p>To detect broken or stuck tasks, we set up an <a href="https://mana.mozilla.org/wiki/display/DATA/Airflow+Triage+Process">Airflow triage process</a> that notifies tasks owners of problems with their Airflow tasks. Generally, DAGs are checked for failures or stuck tasks on a daily basis and problems are reported on <a href="https://bugzilla.mozilla.org/buglist.cgi?query_format=advanced&amp;bug_status=UNCONFIRMED&amp;bug_status=NEW&amp;bug_status=ASSIGNED&amp;bug_status=REOPENED&amp;bug_status=RESOLVED&amp;bug_status=VERIFIED&amp;bug_status=CLOSED&amp;status_whiteboard=%5Bairflow-triage%5D%20&amp;classification=Client%20Software&amp;classification=Developer%20Infrastructure&amp;classification=Components&amp;classification=Server%20Software&amp;classification=Other&amp;resolution=---&amp;resolution=FIXED&amp;resolution=INVALID&amp;resolution=WONTFIX&amp;resolution=INACTIVE&amp;resolution=DUPLICATE&amp;resolution=WORKSFORME&amp;resolution=INCOMPLETE&amp;resolution=SUPPORT&amp;resolution=EXPIRED&amp;resolution=MOVED&amp;status_whiteboard_type=allwordssubstr&amp;list_id=16121716">Bugzilla</a>.</p>
<p>In case of a failure, and after the fix has been merged, clear the state of the failing task to allow the DAG to run again.</p>
<h2 id="testing-airflow-dags"><a class="header" href="#testing-airflow-dags">Testing Airflow DAGs</a></h2>
<p>A guide on how to set up Airflow locally and test Airflow DAGs is available <a href="https://github.com/mozilla/telemetry-airflow#testing">here</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/airflow_gotchas.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="telemetry-reference"><a class="header" href="#telemetry-reference">Telemetry Reference</a></h1>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/index.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="a-brief-history-of-mozilla-data-collection"><a class="header" href="#a-brief-history-of-mozilla-data-collection">A brief history of Mozilla data collection</a></h1>
<blockquote>
<p>This section was originally included in the <a href="https://mozilla-private-report.protosaur.dev/smoot-existing-metrics/book/05_overview.html">Project Smoot existing metrics report</a>
(Mozilla internal link); the DTMO version has been updated to reflect changes to the data platform.</p>
</blockquote>
<h2 id="blocklistxml-and-active-daily-installs-adi"><a class="header" href="#blocklistxml-and-active-daily-installs-adi"><code>blocklist.xml</code> and Active Daily Installs (ADI)</a></h2>
<p>The <a href="https://wiki.mozilla.org/Blocklisting">blocklist</a> was a mechanism
for informing Firefox clients about malicious add-ons, DLLs, and other
extension content that should be blocked. The blocklist also noted when
hardware acceleration features should be avoided on certain graphics
cards. To be effective, the blocklist needed to be updated on a faster
cadence than Firefox releases.</p>
<p>The blocklist was <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=271166">first
implemented</a> in
2006 for Firefox 2, and reported the app ID and version to the blocklist
server.</p>
<p>Several additional variables, including OS version, locale, and
distribution, were <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=430120">added to the
URL</a> for Firefox 3
in 2008. Being able to count users was already expressed as a priority
in the bug comments.</p>
<p>A count of blocklist fetches was used to produce a metric called Active
Daily Users, which was <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=812282">renamed to Active Daily
Installs</a> (ADI) by 2012.</p>
<p>As of August 2020, this mechanism has been superseded by a <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1257565#c120">Remote
Settings-based</a>
replacement and the ADI measure is no longer in use. See the <a href="concepts/./censuses.html#adi--active-daily-installs-blocklist-fetches">historical
reference on ADI</a>
for more information.</p>
<h2 id="telemetry-1"><a class="header" href="#telemetry-1">Telemetry</a></h2>
<p>The <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=585196">earliest telemetry
infrastructure</a>
landed in Firefox 6, and was driven <a href="https://wiki.mozilla.org/Platform/Features/Telemetry">by engineering
needs</a>.</p>
<p>Telemetry was originally opt-out on the nightly and aurora channels, and
opt-in otherwise. It originally lacked persistent client identifiers.</p>
<h2 id="firefox-health-report"><a class="header" href="#firefox-health-report">Firefox Health Report</a></h2>
<p>The Firefox Health Report (FHR) was specified to enable longitudinal and
retention analyses. FHR aimed to enable analyses that were not possible
based on the blocklist ping, update ping, telemetry, Test Pilot and
crash stats datasets that were already available.</p>
<p>FHR was <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=718066">first
implemented</a> in
Firefox 20. It was introduced in blog posts by <a href="https://blog.lizardwrangler.com/2012/09/21/firefox-health-report/">Mitchell
Baker</a>
and <a href="https://blog.mozilla.org/metrics/2012/09/21/firefox-health-report/">Gilbert
Fitzgerald</a>.</p>
<p>To avoid introducing a persistent client identifier, FHR originally
relied on a “document ID” system. The client would generate a new UUID
(a random, unique ID) for each FHR document, and remember a list of its
most recent previous document IDs. While uploading a new FHR document,
the client would ask the server to remove its previous documents. The
intent was that the server would end up holding at most one document
from each user, and longitudinal metrics could be accumulated by the
client. This approach proved fragile and was abandoned. A <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=968419">persistent
client identifier</a>
was implemented for Firefox 30.</p>
<h2 id="firefox-desktop-telemetry-today"><a class="header" href="#firefox-desktop-telemetry-today">Firefox Desktop Telemetry today</a></h2>
<p>FHR was retired and merged with telemetry to produce the current
generation of telemetry data, distinguished as “v4 telemetry” or
“unified telemetry.”</p>
<p>Instead of mapping FHR probes directly to telemetry, the Unified
Telemetry project built upon the telemetry system to answer the
questions Mozilla had attempted to answer with FHR.</p>
<p>The <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1122515">implementation of unified
telemetry</a> and
opt-out delivery to the release channel was completed for Firefox 42, in 2015.</p>
<p>Telemetry payloads are uploaded in documents called pings. Several kinds
of pings are defined, representing different kinds of measurement. These
include:</p>
<ul>
<li><code>main</code>: activity, performance, technical, and other measurements;
the workhorse of Firefox desktop telemetry</li>
<li><code>crash</code>: information about crashes, including stack traces</li>
<li><code>opt-out</code>: a farewell ping sent when a user disables telemetry</li>
<li><code>module</code>: on Windows, records DLLs injected into the Firefox process</li>
</ul>
<p>and others.</p>
<p>Browser sessions and subsessions are important concepts in telemetry. A
<strong>session</strong> begins when the browser launches and ends—perhaps seconds or
days later—when the parent browser process terminates.</p>
<p>A <strong>subsession</strong> ends</p>
<ul>
<li>when its parent session ends, or</li>
<li>at local midnight, or</li>
<li>when the telemetry environment changes,</li>
</ul>
<p>whichever comes first.</p>
<p>The <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/environment.html">telemetry
environment</a>
describes the hardware and operating system of the client computer. It
can change during a Firefox session when e.g. hardware is plugged into a
laptop.</p>
<p>The subsession is the reporting unit for activity telemetry; each <code>main</code>
ping describes a single subsession. Activity counters are reset once a
subsession ends. Data can be accumulated for analysis by summing over a
client's pings.</p>
<p>Telemetry pings can contain several different types of measurements:</p>
<ul>
<li>scalars are integers describing either an event count or a
measurement that occurs only once during a subsession;
<code>simpleMeasurement</code>s are an older, less flexible scalar
implementation in the process of being deprecated</li>
<li>histograms represent measurements that can occur repeatedly during a
subsession; histograms report the count of measurements that fell
into each of a set of predefined buckets (e.g. between zero and one,
between one and two, etc).</li>
<li>events represent discrete events; the time and ordering of the
events are preserved, which clarifies sequences of user actions</li>
</ul>
<p>Data types are discussed in more depth in the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/index.html">telemetry data
collection</a>
documentation.</p>
<h2 id="firefox-desktop-telemetry-the-next-generation"><a class="header" href="#firefox-desktop-telemetry-the-next-generation">Firefox Desktop Telemetry: The Next Generation</a></h2>
<p>The next step for Firefox Desktop Telemetry is to prototype an implementation
using <a href="concepts/glean/glean.html">Glean</a>.</p>
<p>This effort is known as &quot;Firefox on Glean&quot; (FOG) and is expected to begin in late 2019 / early 2020.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/history.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="profile-behavior"><a class="header" href="#profile-behavior">Profile behavior</a></h1>
<h2 id="profile-creation"><a class="header" href="#profile-creation"><a href="concepts/profile/profile_creation.html">Profile Creation</a></a></h2>
<h2 id="real-world-usage"><a class="header" href="#real-world-usage"><a href="concepts/profile/realworldusage.html">Real World Usage</a></a></h2>
<h2 id="profile-history"><a class="header" href="#profile-history"><a href="concepts/profile/profilehistory.html">Profile History</a></a></h2>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/profile/index.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="profile-creation---the-technical-part"><a class="header" href="#profile-creation---the-technical-part">Profile Creation - The technical part</a></h1>
<ul>
<li><a href="concepts/profile/profile_creation.html#what-is-a-profile">What is a profile?</a></li>
<li><a href="concepts/profile/profile_creation.html#profile-behaviors">Profile Behaviors</a>
<ul>
<li><a href="concepts/profile/profile_creation.html#profile-creation">Profile Creation</a>
<ul>
<li><a href="concepts/profile/profile_creation.html#managed-first-use">Managed: First use</a></li>
<li><a href="concepts/profile/profile_creation.html#managed-profile-manager-creation">Managed: Profile Manager creation</a></li>
<li><a href="concepts/profile/profile_creation.html#unmanaged-command-line-start">Unmanaged: Command-line start</a></li>
</ul>
</li>
<li><a href="concepts/profile/profile_creation.html#profile-reset">Profile Reset</a></li>
<li><a href="concepts/profile/profile_creation.html#profile-deletion">Profile Deletion</a></li>
<li><a href="concepts/profile/profile_creation.html#telemetry-opt-out">Telemetry opt-out</a></li>
</ul>
</li>
<li><a href="concepts/profile/profile_creation.html#profile-creation-date">Profile Creation Date</a>
<ul>
<li><a href="concepts/profile/profile_creation.html#managed-during-profile-creation">Managed: During Profile Creation</a></li>
<li><a href="concepts/profile/profile_creation.html#unmanaged-empty-profile-directory">Unmanaged: Empty profile directory</a></li>
</ul>
</li>
</ul>
<h2 id="what-is-a-profile"><a class="header" href="#what-is-a-profile">What is a profile?</a></h2>
<p>All of the changes a user makes in Firefox, like the home page, toolbars, installed add-ons, saved passwords and bookmarks, are stored in a special folder called a profile.
Telemetry stores archived and pending pings in the profile directory as well as metadata like the client ID.</p>
<p>Every run of Firefox needs a profile. However, a single installation can use multiple profiles for different runs.
The profile folder is stored in a separate place from the Firefox program so that, if something ever goes wrong with Firefox, the profile information will still be there.</p>
<p>Firefox also comes with a Profile Manager, a different run mode to create, migrate and delete profiles.</p>
<h2 id="profile-behaviors"><a class="header" href="#profile-behaviors">Profile Behaviors</a></h2>
<p>In order to understand the behavior of users and base analysis on things like the profile creation date,
it is essential to understand how a profile is created and identified by the browser.
Also, it is important to understand how user actions with and within profiles affect our ability to reason about profiles from a data perspective.
This includes resetting or deleting profiles or opting into or out of sending Telemetry data.</p>
<p>The different cases are described in more detail in the following sections.</p>
<h3 id="profile-creation-1"><a class="header" href="#profile-creation-1">Profile Creation</a></h3>
<p>There are multiple ways a Firefox profile can be created.
Some of these behave slightly differently.</p>
<p>Profiles can be created and managed by the Firefox Profile Manager:</p>
<ul>
<li>New profile on first launch</li>
<li>New profile from Profile Manager</li>
<li><code>--createprofile</code> command line argument</li>
</ul>
<p>Profiles can be created externally and not be managed by the Firefox Profile Manager:</p>
<ul>
<li><code>--profile</code> command line argument</li>
</ul>
<h4 id="managed-first-use"><a class="header" href="#managed-first-use">Managed: First use</a></h4>
<p>When Firefox is opened for the first time after a fresh install, without any prior Firefox profile on disk visible to Firefox, it will create a new profile.
Firefox uses &quot;Default User&quot; as the profile name, creates the profile's directory with a random suffix and marks the new profile as default for subsequent starts of Firefox.
Read <a href="https://support.mozilla.org/en-US/kb/profiles-where-firefox-stores-user-data">where Firefox stores your profile data</a>.</p>
<h4 id="managed-profile-manager-creation"><a class="header" href="#managed-profile-manager-creation">Managed: Profile Manager creation</a></h4>
<p>The user can create a new profile through the Profile Manager.
This can either be done on <code>about:profiles</code> in a running Firefox or by starting Firefox with the <code>--ProfileManager</code> flag.
The Profile Manager asks for a name for the profile and picks a new directory for it.
The Profile Manager allows the user to create a new profile from an existing directory (in which case any files will be included) or from scratch (in which case the directory will be created).</p>
<p>The <code>--createprofile</code> flag can be used from the command line and works the same as creating a profile through the Profile Manager.</p>
<h4 id="unmanaged-command-line-start"><a class="header" href="#unmanaged-command-line-start">Unmanaged: Command-line start</a></h4>
<p>Firefox can be started on the command line with a path to a profile directory: <code>firefox --profile path/to/directory</code>.
If the directory does not exist it will be created.</p>
<p>A profile created like this will not be picked up by the Profile Manager.
Its data will persist after Firefox is closed, but the Profile Manager will not know about it.
The profile will not turn up in <code>about:profiles</code>.</p>
<h3 id="profile-reset"><a class="header" href="#profile-reset">Profile Reset</a></h3>
<p>A user can reset the profile (see <a href="https://support.mozilla.org/en-US/kb/refresh-firefox-reset-add-ons-and-settings">Refresh Firefox - reset addons and settings</a>).
This will copy over most user data to a new directory, keeping things like the history, bookmarks and cookies, but will remove extensions, modified preferences and added search engines.</p>
<p>A profile reset will not change the Telemetry <code>clientID</code>.
The date of the most recent profile reset is saved and will be contained in Telemetry pings in the <code>profile.resetDate</code> field.</p>
<h3 id="profile-deletion"><a class="header" href="#profile-deletion">Profile Deletion</a></h3>
<p>A profile can be deleted through the Profile Manager, which will delete all stored data from disk.
The profile can also be deleted by simply removing the profile's directory.
We will never know about a deletion. We simply won't see that profile in new Telemetry data anymore.</p>
<p>Uninstalling the Firefox installation will not remove any profile data.</p>
<p><strong>Note:</strong> Removing a profile's directory while it is in use is not recommended and will lead to a corrupt state.</p>
<h3 id="telemetry-opt-out"><a class="header" href="#telemetry-opt-out">Telemetry opt-out</a></h3>
<p>The user can opt out of sending Telemetry data.
When the user opts out, Telemetry sends a <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/data/deletion-request-ping.html">&quot;deletion-request&quot; ping</a>, containing an empty payload.
The local <code>clientID</code> is reset to a fixed value.</p>
<p>When a user opts into sending Telemetry data, a new <code>clientID</code> is generated and used in subsequent pings.
The profile itself and the profile creation date are unaffected by this.</p>
<h2 id="profile-creation-date"><a class="header" href="#profile-creation-date">Profile Creation Date</a></h2>
<p>The <em>profile creation date</em> is the assumed date of initial profile creation.
However, it has proven not to be reliable in all cases.
There are multiple ways this date is determined.</p>
<h3 id="managed-during-profile-creation"><a class="header" href="#managed-during-profile-creation">Managed: During Profile Creation</a></h3>
<p>When a profile is created explicitly the profile directory is created and a <code>times.json</code> containing a timestamp of the current time is stored inside that profile directory<sup class="footnote-reference"><a href="#1">1</a></sup>.
It is read at later times when the profile creation date is used.</p>
<pre class="mermaid">graph TD
A[Start Firefox] --&gt;B[Select profile dir, default or defined]
B --&gt; C{Selected dir exist?}
C --&gt; |No| D[Create directory]
C --&gt; |Yes| E[Write times.json]
D --&gt; E
E --&gt; F[Show Browser]
F --&gt; G[ProfileAge.jsm is called]
G --&gt; J[Read time from times.json]
J --&gt; S[Return creation date]
</pre>
<div class="footnote-definition" id="1"><sup class="footnote-definition-label">1</sup>
<p>Relevant parts in the code: <a href="https://searchfox.org/mozilla-central/rev/292d295d6b084b43b70de26a42e68513bb7b36a3/toolkit/xre/nsAppRunner.cpp#2394-2395,2397-2398,2527-2533"><code>nsAppRunner::SelectProfile</code></a> calling <a href="https://searchfox.org/mozilla-central/rev/196560b95f191b48ff7cba7c2ba9237bba6b5b6a/toolkit/profile/nsToolkitProfileService.cpp#789-793"><code>nsToolkitProfileService::CreateProfile</code></a>.</p>
</div>
<h3 id="unmanaged-empty-profile-directory"><a class="header" href="#unmanaged-empty-profile-directory">Unmanaged: Empty profile directory</a></h3>
<p>When <code>--profile path/to/directory</code> is passed on the command line, the directory is created if it does not exist, but no <code>times.json</code> is written<sup class="footnote-reference"><a href="#2">2</a></sup>.
On the first access of the profile creation date (through <code>ProfileAge.jsm</code>) the module will detect that the <code>times.json</code> is missing.
It will then iterate through all files in the current profile's directory, reading file creation or modification timestamps.
The oldest of these timestamps is then assumed to be the profile creation date and written to <code>times.json</code>.
Subsequent runs of Firefox will then use this date.</p>
<pre class="mermaid">graph TD
A[Start Firefox --profile path/to/dir] --&gt;H{path/to/dir exist?}
H --&gt; |No| K[Create directory]
K --&gt; F[Show Browser]
H --&gt; |Yes| F
F --&gt; O[ProfileAge.jsm is called]
O --&gt; R{times.json exists?}
R --&gt;|Yes| Q[Read time from times.json]
R --&gt;|No| L[Scan profile dir for oldest file, write to times.json]
L --&gt; S
Q --&gt; S[Return creation date]
</pre>
<div class="footnote-definition" id="2"><sup class="footnote-definition-label">2</sup>
<p>Relevant part in the code: <a href="https://searchfox.org/mozilla-central/rev/292d295d6b084b43b70de26a42e68513bb7b36a3/toolkit/xre/nsAppRunner.cpp#2357-2363"><code>nsAppRunner::SelectProfile</code></a> creating the directory.</p>
</div>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/profile/profile_creation.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="real-world-usage-1"><a class="header" href="#real-world-usage-1">Real World Usage</a></h1>
<p>This page backs away from our profile-focused data view and examines what Firefox Desktop usage looks like in the real world. There are many components and layers that exist between a user acquiring and running Firefox, and this documentation will illuminate what those are and how they can affect the meaning of a profile.</p>
<h2 id="real-life-components-of-firefox-desktop-usage"><a class="header" href="#real-life-components-of-firefox-desktop-usage">Real Life Components of Firefox Desktop Usage</a></h2>
<p><img src="concepts/profile/images/real-life-usage-components.png" alt="" /></p>
<p>The above image illustrates all the layers that sit between a user acquiring and running Firefox Desktop and the Telemetry pings we receive.</p>
<ul>
<li>1: The user
<ul>
<li>A human being presumably.</li>
</ul>
</li>
<li>2: The machine
<ul>
<li>The physical hardware running Firefox.</li>
</ul>
</li>
<li>3: The disk image / hard drive
<ul>
<li>A single machine could have separate partitions running different OSes.</li>
<li>Multiple machines could run copies of a single disk image.</li>
<li>Disk images are also used as backups to restore a machine.</li>
</ul>
</li>
<li>4: OS user profile
<ul>
<li>Most operating systems allow users to log into different user profiles with separate user directories (such as a &quot;Guest&quot; account).</li>
<li>Usually, Firefox is installed into a system directory that all user profiles will share, but Firefox profiles are saved within the user directories, effectively segregating them.</li>
</ul>
</li>
<li>5: Firefox binary / installer
<ul>
<li>The downloaded binary package or stub installer which installs Firefox into the disk image. Users can get these from our website or one of our managed properties, but they can also acquire them from 3rd party sources.</li>
<li>Our website is instrumented with Google Analytics to track download numbers, but other properties (FTP) and 3rd party sources are not. Google Analytics data is not directly connected to Telemetry data.</li>
<li>A user can produce multiple installations from a single Firefox binary / installer. For example, if a user copies it to a USB stick or keeps it in cloud storage, they could install Firefox on multiple machines from a single binary / installer.</li>
</ul>
</li>
<li>6: Firefox installation
<ul>
<li>The installed Firefox program on a given disk image.</li>
<li>Since Firefox is usually installed in a system directory, the single installation of Firefox will be shared by all the OS user profiles in the disk image.</li>
<li>Stub installers are instrumented with pings to report new install counts; full binaries, however, are not.</li>
</ul>
</li>
<li>7: Firefox profile
<ul>
<li>The profile Firefox uses during a user's session.</li>
<li>A user can create multiple Firefox profiles using the Firefox Profile Manager, or by specifying a custom directory to use at startup. More details <a href="concepts/profile/profile_creation.html">here</a>.</li>
<li>This is the entity that we see in Telemetry. Profiles send pings to Telemetry with a client ID as their identifier.</li>
</ul>
</li>
</ul>
<h2 id="desktop-browser-use-cases"><a class="header" href="#desktop-browser-use-cases">Desktop Browser Use Cases</a></h2>
<p>Below are the rough categories of Firefox use cases that we know happen in the real world.</p>
<p>Note, these categories are rough approximations, and are not necessarily mutually exclusive.</p>
<h4 id="regular-user"><a class="header" href="#regular-user">Regular User</a></h4>
<p>What we imagine a typical user to be. Someone who buys a computer, always uses a default OS user profile, downloads Firefox once, installs it, and continues using the default Firefox profile.</p>
<p><img src="concepts/profile/images/regular-user.png" alt="" /></p>
<p>In Telemetry, this user would just show up as a single client ID.</p>
<p>Assuming they went through our normal funnel, they should show up once in Google Analytics as a download and once in stub installer pings as a new installation (if they used a stub installer).</p>
<h4 id="multi-profile-user"><a class="header" href="#multi-profile-user">Multi-Profile User</a></h4>
<p>A more advanced user, who uses multiple Firefox profiles in their normal, everyday use, but otherwise is pretty 'normal' (uses the same OS user profile, etc.).</p>
<p><img src="concepts/profile/images/multi-profile-user.png" alt="" /></p>
<p>In Telemetry, this user would show up as 2 (or more) separate client IDs.
We would have no way to know they came from the same computer and user, short of noticing that the subsessions never overlap and that large portions of the environment (CPU, GPU, displays) are identical, and even that would be no guarantee.</p>
<p>Assuming they went through our normal funnel, they would show up once in Google Analytics as a download and once in stub installer pings as a new installation (if they used a stub installer).</p>
<p>However, any subsequent new Firefox profile creations would not have any corresponding downloads or installations.
Since Firefox 55, however, any newly created profile will send a &quot;new-profile&quot; ping.</p>
<h4 id="shared-computer"><a class="header" href="#shared-computer">Shared Computer</a></h4>
<p>A situation where a computer is shared across multiple users and each user uses a different OS user profile. Since Firefox profiles live at the user directory level, each user would have a separate Firefox profile. Note that users logging in under a &quot;Guest&quot; account on most machines fall into this category.</p>
<p><img src="concepts/profile/images/shared-computer.png" alt="" /></p>
<p>In this case, every user who logged into this one computer with a different OS user profile would show up as a different client ID. We have no way of knowing they came from the same computer.</p>
<p>Furthermore, if the computer wiped the user directory after use, like Guest accounts and university computer labs often do, then they would show up as a <strong>new</strong> client ID every time they logged in, even if they have used the same computer multiple times. This use case could inflate new profile counts.</p>
<p>Similar to Multi-Profile Users, in this use case, there would be only one download event and install event (assuming normal funnel and stub installer), but multiple client IDs.</p>
<h4 id="cloned-machines"><a class="header" href="#cloned-machines">Cloned Machines</a></h4>
<p>In this case, there are actually multiple users with computers that all share the same disk image at some point.</p>
<p>Think of the situation where the IT staff sets up the computer for a new hire at a company. Instead of going through the trouble of installing all the required programs and setting them up correctly for each computer, they'll do it once on one computer, save the disk image, and simply copy it over each time they need to issue a new machine.</p>
<p>Or think of the case where the IT staff of a library needs to set up 2 dozen machines at once.</p>
<p><img src="concepts/profile/images/cloned-machines.png" alt="" /></p>
<p>In this case, depending on the state of the disk image when it was copied, we could see multiple client IDs for each user+machine, or we could see all the user+machines sharing the same client ID.</p>
<p>If the disk image was copied after a Firefox profile was created, then the old user+machine and new user+machine will share the same client ID, and be submitting pings to us concurrently.</p>
<p>If the disk image was copied after the Firefox installation but before an initial Firefox profile was created, then each user+machine will get their own Firefox profile and client ID when they run Firefox for the first time.</p>
<p>As with the Multi-Profile User and Shared Computer case, even though there could be multiple Firefox profiles in this use case, there will only be one download and install event.</p>
<h4 id="migrations"><a class="header" href="#migrations">Migrations</a></h4>
<h5 id="type-1-migrate-disk-image"><a class="header" href="#type-1-migrate-disk-image">Type 1: Migrate Disk Image</a></h5>
<p>A user has a backup of their disk image and when they switch to a new computer or their current computer crashes, they simply reboot from the old disk image.</p>
<p><img src="concepts/profile/images/migration-1.png" alt="" /></p>
<p>In this case, the old machine and the new machine will just share the same client ID (assuming that the disk was copied after a Firefox profile was created). In fact, it will look exactly like the Cloned Machines case, except that instead of sending pings concurrently, they'll be sending us pings first from the old machine and then from the new machine.</p>
<p>Also, it should be noted that their Firefox profile will 'revert' back to the state that it was in when the disk image was copied, essentially starting over from the past, and any unsent pings on the image (if they exist) will be resent.
For instance, we will see another ping with the <code>profile_subsession_count</code> (the count of how many subsessions a profile has seen in its history) we previously saw some time before.</p>
<p>Again, there will only be one download and install associated with this use case (assuming normal funnel and stub installer).</p>
<h5 id="type-2-migrate-os-user-directory"><a class="header" href="#type-2-migrate-os-user-directory">Type 2: Migrate OS User Directory</a></h5>
<p>A user has a backup of their OS user directory and copies it to a new machine.</p>
<p><img src="concepts/profile/images/migration-2.png" alt="" /></p>
<p>This is similar to Type 1 migration, but instead of copying the entire disk, the user only copies the OS user directory. Since the Firefox profile lives in the OS user directory, the old machine and new machine will share the same client ID.</p>
<p>The only difference is since the Firefox Installation lives in system directories, the client might have to re-download and re-install the browser. However, if they also copy the Firefox binary / installer, there will not be a download event, only an install event.</p>
<h5 id="type-3-migrate-firefox-binary--installer"><a class="header" href="#type-3-migrate-firefox-binary--installer">Type 3: Migrate Firefox Binary / Installer</a></h5>
<p>A user has the Firefox binary or installer saved on their old machine and copies it over to a new machine to install Firefox.</p>
<p><img src="concepts/profile/images/migration-3.png" alt="" /></p>
<p>In this case, there will not be a second download event, but there will be an install event and the new and old machines will have separate client IDs.</p>
<h5 id="type-4-migrate-firefox-profile"><a class="header" href="#type-4-migrate-firefox-profile">Type 4: Migrate Firefox Profile</a></h5>
<p>A user copies their old Firefox profile from their old machine to a new computer, and runs Firefox using the copied Firefox profile.</p>
<p><img src="concepts/profile/images/migration-4.png" alt="" /></p>
<p>In this case, since the Firefox profile is being copied over, both the new and the old machine will have profiles with the same client ID. Again, the profile on the new computer will revert back to the point in its history where it was copied.
And since the profile contains any unsent Telemetry pings, we may receive duplicated submissions of pings from the same client ID.</p>
<p>If the Firefox binary / installer was downloaded, there will be a download and install event. If it was migrated via USB stick, it will only have an install event.</p>
<h2 id="how-might-cloned-client-ids-be-detected-by-telemetry"><a class="header" href="#how-might-cloned-client-ids-be-detected-by-telemetry">How might cloned client IDs be detected by telemetry?</a></h2>
<p>There's no 100% reliable identifier or signature of cloned client IDs. However, several signatures may help identify some of the potential clones while perhaps misidentifying some non-clones or failing to identify all clones.</p>
<p>Generally, cloned client IDs may exhibit the following behavior (a query sketch for one of these signatures follows the list):</p>
<ul>
<li>Client ID has activity in multiple countries at once (but this also captures people that travel between countries or use VPN).</li>
<li>Client ID has multiple distinct, overlapping profile histories over the same time span (e.g. chain together main pings using the previous subsession id and subsession id fields, with profile subsession counter as reference. Each client id should, in theory, have a single unbroken chain of main pings. If a client id has overlapping history branches over the same time period, that can be a marker of cloned profiles).</li>
<li>Client ID has multiple machine specs at once.</li>
<li>Client ID has multiple versions at once (but this also captures clients who upgrade their browser).</li>
<li>…</li>
</ul>
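<p>As a rough illustration of the second signature above, the following is a minimal sketch (not a vetted clone detector), assuming the <code>main_summary</code> fields described in the profile history documentation (<code>client_id</code>, <code>profile_subsession_counter</code>, <code>subsession_id</code>, <code>submission_date_s3</code>). It flags client IDs for which the same <code>profile_subsession_counter</code> value maps to more than one <code>subsession_id</code>, which can indicate a branching history:</p>
<pre><code class="language-sql">-- Sketch only: client IDs whose history appears to branch, i.e. the same
-- profile_subsession_counter value maps to more than one subsession_id.
-- Table and field names are assumptions based on the fields described here.
SELECT
  client_id,
  profile_subsession_counter,
  count(DISTINCT subsession_id) AS distinct_subsessions
FROM
  main_summary
WHERE
  submission_date_s3 &gt; '20180101'
  AND sample_id = '42'
GROUP BY
  client_id,
  profile_subsession_counter
HAVING
  count(DISTINCT subsession_id) &gt; 1
</code></pre>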
<p>Typical experiment enrollment criteria may restrict client behavior in ways that complicate the detection of cloned clients (e.g. they may prevent enrolled clients from exhibiting mismatched versions or countries). Additionally, the following may be indicative of clones within an experiment:</p>
<ul>
<li>Client ID sends multiple enrollment pings as measured by multiple enrollment IDs (these should be unique to a singular enrollment event). </li>
<li>Client ID is in both branches (subset of above case).</li>
<li>…</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/profile/realworldusage.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="profile-history-1"><a class="header" href="#profile-history-1">Profile History</a></h1>
<p>A profile's history is simply the progression of that profile's subsessions over its lifetime. We can see this in our main pings by checking:</p>
<ul>
<li><code>profile_subsession_counter</code>
<ul>
<li>A counter which starts at 1 on the very first run of a profile and increments for each subsession. This counter will be reset to 1 if a user resets / refreshes their profile.</li>
</ul>
</li>
<li><code>subsession_start_date</code>
<ul>
<li>The date and time at which the subsession starts, truncated to the hour. This field is not always reliable due to local clock skew.</li>
</ul>
</li>
<li><code>previous_subsession_id</code>
<ul>
<li>The ID of the previous subsession. Will be <code>null</code> for the very first subsession, or the first subsession after a user resets / refreshes their profile.</li>
</ul>
</li>
<li><code>subsession_id</code>
<ul>
<li>The ID of the current subsession.</li>
</ul>
</li>
<li><code>submission_date_s3</code>
<ul>
<li>The date we received the ping. This date is sourced from the server's time and is reliable.</li>
</ul>
</li>
<li><code>profile_reset_date</code>
<ul>
<li>The date the profile was reset. Will be <code>null</code> if the profile was not reset.</li>
</ul>
</li>
</ul>
<p><img src="concepts/profile/images/profile-history/basic-example.png" alt="" /></p>
<p>This is a nice clean example of profile history. It has a clear <strong>starting ping</strong> and it progresses linearly, with each subsession connecting to the next via <code>subsession_id</code>. However, because profiles can be shared across machines, restored manually, and so on, strange behaviors can arise (see <a href="concepts/profile/realworldusage.html">Real World Usage</a>).</p>
<h2 id="profile-history-start-conditions"><a class="header" href="#profile-history-start-conditions">Profile History Start Conditions</a></h2>
<p>Under normal assumptions, we expect to see the <strong>starting ping</strong> in a profile's history in our telemetry data. The starting ping in the profile's history is the ping from their very first subsession. We expect this ping to have <code>profile_subsession_counter = 1</code> and <code>previous_subsession_id is null</code> and <code>profile_reset_date is null</code>.</p>
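<p>Expressed as a query, this is simply a filter on those three fields. The following is a minimal sketch, assuming the <code>main_summary</code> fields listed above:</p>
<pre><code class="language-sql">-- Sketch only: select the starting ping of each profile's history.
-- Field names are those listed in this chapter; the table is assumed to be main_summary.
SELECT
  client_id,
  subsession_id,
  subsession_start_date
FROM
  main_summary
WHERE
  submission_date_s3 &gt; '20180101'
  AND sample_id = '42'
  AND profile_subsession_counter = 1
  AND previous_subsession_id IS NULL
  AND profile_reset_date IS NULL
</code></pre>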
<p>However, not all profiles appear in our data with a starting ping and instead appear to us mid-history.</p>
<p><img src="concepts/profile/images/profile-history/ping-diagram-start-condition.png" alt="" /></p>
<h4 id="history-has-beginning"><a class="header" href="#history-has-beginning">History Has Beginning</a></h4>
<p><img src="concepts/profile/images/profile-history/example-starting.png" alt="" /></p>
<p>As you can see, this profile starts with a ping where <code>profile_subsession_counter = 1</code> and <code>previous_subsession_id is null</code>.</p>
<h4 id="history-has-no-beginning"><a class="header" href="#history-has-no-beginning">History Has No Beginning</a></h4>
<p><img src="concepts/profile/images/profile-history/example-midhistory.png" alt="" /></p>
<p>In this example, the profile simply appears in our data mid-history, with presumably the 25th subsession in its history. Its previous history is a mystery.</p>
<h2 id="profile-history-progression-events"><a class="header" href="#profile-history-progression-events">Profile History Progression Events</a></h2>
<p>After a profile appears, in 'normal' conditions, there should be a linear, straightforward progression with each subsession linking to the next.</p>
<p><img src="concepts/profile/images/profile-history/ping-diagram-events.png" alt="" /></p>
<p>However, the following abnormal events can occur.</p>
<h4 id="history-gap"><a class="header" href="#history-gap">History Gap</a></h4>
<p>There is a gap in the profile history.</p>
<p>It's possible this behavior is due to dropped pings.</p>
<p><img src="concepts/profile/images/profile-history/example-gap.png" alt="" /></p>
<p>Here, we see gaps in the history between the 30th, 41st, and 44th pings.</p>
<h4 id="history-splits"><a class="header" href="#history-splits">History Splits</a></h4>
<p>The history of a profile splits, and after a single subsession, there are two (or more) subsessions that link back to it.</p>
<p>This is probably due to cloned machines or disk image restores. Note, after the profile splits, the two branches might continue concurrently or one branch might die while the other continues.
It is very hard to distinguish between the different branches of the same profile.</p>
<ul>
<li>Profile begins</li>
</ul>
<p><img src="concepts/profile/images/profile-history/example-splits-1.png" alt="" /></p>
<ul>
<li>Profile splits: branch 1</li>
</ul>
<p><img src="concepts/profile/images/profile-history/example-splits-2.png" alt="" /></p>
<ul>
<li>Profile splits: branch 2</li>
</ul>
<p><img src="concepts/profile/images/profile-history/example-splits-3.png" alt="" /></p>
<p>In this example, the profile history starts normally, but on the 5th ping, the history splits into two branches that seem to progress concurrently.</p>
<h4 id="history-restarts"><a class="header" href="#history-restarts">History Restarts</a></h4>
<p>The history of a profile suddenly starts over, with a brand new starting ping.</p>
<ul>
<li>Profile begins</li>
</ul>
<p><img src="concepts/profile/images/profile-history/example-restart-1.png" alt="" /></p>
<ul>
<li>Profile restarts</li>
</ul>
<p><img src="concepts/profile/images/profile-history/example-restart-2.png" alt="" /></p>
<p>Here, we see the profile start their history normally, but then they begin a new, totally unconnected branch with a starting ping that is <strong>not</strong> the same as the original starting ping (different <code>subsession_id</code>s).</p>
<h4 id="history-reruns"><a class="header" href="#history-reruns">History Reruns</a></h4>
<p><a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1631935">(Work in Progress)</a></p>
<h2 id="how-to-order-history"><a class="header" href="#how-to-order-history">How to Order History</a></h2>
<p><a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1631934">(Work in Progress)</a></p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/profile/profilehistory.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="engagement-metrics"><a class="header" href="#engagement-metrics">Engagement metrics</a></h1>
<blockquote>
<p>This section was originally included in the <a href="https://mozilla-private-report.protosaur.dev/smoot-existing-metrics/book/05_overview.html">Project Smoot existing metrics report</a>
(Mozilla internal link).</p>
</blockquote>
<p>A handful of metrics have been adopted as engagement metrics, either as
censuses of the population or to describe user activity within a
session. This chapter aims to describe what those metrics are and how
they're defined.</p>
<h2 id="engagement-metrics-1"><a class="header" href="#engagement-metrics-1">Engagement metrics</a></h2>
<h3 id="active_ticks"><a class="header" href="#active_ticks"><code>active_ticks</code></a></h3>
<p>The <code>active_ticks</code> probe <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1187069#c6">is
specified</a> to
increment once in every 5-second window that a user performs an action
that could interact with content or chrome, including mousing over the
window while it lacks focus. One additional tick is recorded after the
activity stops.</p>
<p>Main pings provide two measurements of <code>active_ticks</code>: a
<code>simpleMeasurement</code> and a scalar.</p>
<p>The <code>simpleMeasurement</code> was <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1106122">implemented in Firefox
37</a> before the
launch of unified telemetry, and had previously been
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=826893">implemented</a> for
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=827157">FHR</a>.</p>
<p>The <code>simpleMeasurement</code> was discovered to be resetting incorrectly,
which was <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1482466">fixed</a>
in Firefox 62.</p>
<p>The scalar (which was not affected by the same bug) was
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1376942">implemented</a> in
Firefox 56. The scalar is aggregated into <code>main_summary</code>, but should
always be identical to the <code>simpleMeasurement</code>.</p>
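<p>Because each tick represents a 5-second window, <code>active_ticks</code> can be converted into approximate active hours by multiplying by 5 and dividing by 3600. A minimal sketch, assuming the <code>active_ticks</code> and <code>submission_date_s3</code> columns in <code>main_summary</code>:</p>
<pre><code class="language-sql">-- Sketch only: convert active_ticks (5-second windows of activity)
-- into approximate active hours per client per day.
SELECT
  client_id,
  submission_date_s3,
  sum(active_ticks) * 5 / 3600 AS active_hours
FROM
  main_summary
WHERE
  submission_date_s3 &gt; '20180101'
  AND sample_id = '42'
GROUP BY
  client_id,
  submission_date_s3
</code></pre>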
<h3 id="subsession_length"><a class="header" href="#subsession_length"><code>subsession_length</code></a></h3>
<p><code>subsession_length</code> is the wall-clock duration of a subsession.
<code>subsession_length</code> includes time that the computer was asleep for
Windows, but not for OS X or Linux; there is a <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1205567">long-outstanding
bug</a> to include
sleep time on all platforms.</p>
<p>There is <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1205985">another
bug</a> to count only
time that the computer is not in sleep.</p>
<p><code>subsession_length</code> was first implemented with <a href="https://web.archive.org/web/20210402095333/https://mail.mozilla.org/pipermail/fhr-dev/2015-January/000384.html">the advent of
subsessions</a>,
which came with unified telemetry.</p>
<h3 id="total_uri_count"><a class="header" href="#total_uri_count"><code>total_uri_count</code></a></h3>
<p><code>total_uri_count</code> was
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1271313">implemented</a> for
Firefox 50.</p>
<p><code>total_uri_count</code> is intended to capture the number of distinct
navigation events a user performs. It includes changes to the URI
fragment (i.e. anchor navigation) on the page. It excludes
<code>XmlHttpRequest</code> fetches and <code>iframes</code>.</p>
<p>It works by attaching an instance of <code>URICountListener</code> as a
<code>TabsProgressListener</code> which responds to <code>onLocationChange</code> events.</p>
<p>Some filters are applied to <code>onLocationChange</code> events:</p>
<ul>
<li>Error pages are excluded.</li>
<li>Only top-level pageloads (where <code>webProgress.isTopLevel</code>,
<a href="https://searchfox.org/mozilla-central/rev/f1c7ba91fad60bfea184006f3728dd6ac48c8e56/uriloader/base/nsIWebProgress.idl#144">documented inline</a>, is true) are counted – i.e,
not navigations within a frame.</li>
<li>Tab restore events are excluded.</li>
<li>URIs visited in private browsing mode are excluded unless
<code>browser.engagement.total_uri_count.pbm</code> is true. (The pref has been
flipped on for small populations in a couple of short studies, but,
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1535169">for now</a> remains false by default.)</li>
</ul>
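<p>In derived datasets the probe typically surfaces as a per-client daily sum, for example <code>scalar_parent_browser_engagement_total_uri_count_sum</code> in <code>clients_daily</code> (see the user-state definitions later in this chapter). A hedged sketch that counts, per day, how many reporting clients loaded at least one qualifying URI:</p>
<pre><code class="language-sql">-- Hedged sketch: clients with at least one counted URI load per day.
-- Assumes the per-client daily sum is available in telemetry.clients_daily
-- as scalar_parent_browser_engagement_total_uri_count_sum.
SELECT
  submission_date,
  COUNTIF(scalar_parent_browser_engagement_total_uri_count_sum &gt;= 1) AS clients_with_uri,
  COUNT(*) AS clients_reporting
FROM telemetry.clients_daily
WHERE submission_date = '2020-02-01'
GROUP BY submission_date
</code></pre>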
<h3 id="unfiltered_uri_count"><a class="header" href="#unfiltered_uri_count"><code>unfiltered_uri_count</code></a></h3>
<p>The unfiltered count,
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1304647">implemented</a> for
Firefox 51, differs only in that it includes URIs using protocol specs
other than HTTP and HTTPS. It excludes some (but not all) <code>about:</code> pages
– the set of “initial pages” defined in <code>browser.js</code> is excluded, but
e.g. <code>about:config</code> and <code>about:telemetry</code> are included.</p>
<p>No applications of <code>unfiltered_uri_count</code> have been identified.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/engagement.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="user-statessegments"><a class="header" href="#user-statessegments">User states/segments</a></h1>
<p>A user state is a group of clients who fit a set of criteria at a point in time.
The set of criteria itself can also be referred to as a &quot;user state&quot;.</p>
<p>In data science these are normally called &quot;segments&quot;; for Firefox we call them &quot;user states&quot;.</p>
<p>Typically you'll use user states to gain more insight into what is going on, by asking
&quot;Regarding the thing I'm interested in,
do users in different user states behave differently,
and what insights does this give me into the users and the product?&quot;
For example, &quot;In this experiment, how do new users react to this feature,
and how does this differ from established users?&quot;.
Or &quot;DAU moved dramatically -
is this restricted to users in this particular country (i.e. a user state)
where there's an event happening, which would raise our suspicions,
or is it global and therefore not solely due to that event?&quot;</p>
<h2 id="versioning"><a class="header" href="#versioning">Versioning</a></h2>
<p>We are building out our library of user states,
and we want room to iterate to improve them in the future.
So please quote user states' versions with their names, e.g. &quot;regular users v3&quot;
so that your communication is forwards compatible.</p>
<h2 id="current-user-statessegments"><a class="header" href="#current-user-statessegments">Current user states/segments</a></h2>
<h3 id="regular-users-v3"><a class="header" href="#regular-users-v3">Regular users v3</a></h3>
<p><code>clients_last_seen.is_regular_user_v3</code></p>
<p>This user state contains clients who sent pings on <em>at least 14</em> of the previous 27 days. As of February 2020 this user state contained approximately 2/3 of DAU and its users had a 1-week retention of around 95%.</p>
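<p>Where the precomputed column is not available, or when prototyping a variant of this user state, an equivalent condition can be sketched from the raw bit pattern. The sketch below assumes the least-significant bit of <code>days_seen_bits</code> represents the current <code>submission_date</code>, so shifting right by one leaves the previous 27 days:</p>
<pre><code class="language-sql">-- Hedged sketch of the "regular users v3" condition from days_seen_bits.
-- Assumption: the least-significant bit is the current submission_date,
-- so days_seen_bits &gt;&gt; 1 covers the previous 27 days.
SELECT
  submission_date,
  client_id,
  BIT_COUNT(days_seen_bits &gt;&gt; 1) &gt;= 14 AS my_is_regular_user_v3
FROM telemetry.clients_last_seen
WHERE submission_date = '2020-02-01'
</code></pre>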
<h3 id="new-or-resurrected-v3"><a class="header" href="#new-or-resurrected-v3">New or Resurrected v3</a></h3>
<p><code>clients_last_seen.is_new_or_resurrected_v3</code></p>
<p>This user state contains clients who sent pings on <em>none</em> of the previous 27 days. As of February 2020 this user state contained approximately 4% of DAU and its users had a 1-week retention of approximately 30%.</p>
<h3 id="weekday-regulars-v1"><a class="header" href="#weekday-regulars-v1">Weekday regulars v1</a></h3>
<p><code>clients_last_seen.is_weekday_regular_v1</code></p>
<p>This user state contains clients in <em>Regular users v3</em> who typically use the browser only on weekdays. This user state is responsible for a slight majority of the weekly seasonality in DAU for <em>Regular users v3</em>. Of the previous 27 days, these users submitted a ping on at most one weekend day (UTC). Due to differing timezones, we allow flexibility: the &quot;weekend&quot; could be Friday/Saturday, Saturday/Sunday, or Sunday/Monday; we only ask that each client is self-consistent for the 27 day period.</p>
<h3 id="all-week-regulars-v1"><a class="header" href="#all-week-regulars-v1">All-week regulars v1</a></h3>
<p><code>clients_last_seen.is_allweek_regular_v1</code></p>
<p>This user state contains clients in <em>Regular users v3</em> who do not fit in <em>Weekday regulars v1</em>, i.e. clients that used the browser on at least two weekend days in the previous 27 days. DAU for this user state does have some weekly seasonality, so some of the clients in this user state use the browser on weekdays preferentially, but not exclusively.</p>
<h3 id="core-actives-v1"><a class="header" href="#core-actives-v1">Core Actives v1</a></h3>
<p><code>clients_last_seen.is_core_active_v1</code></p>
<p>This user state contains clients that browsed at least 1 URI in at least 21 of the previous 28 days (including the current date). URI counts are derived from the column <code>scalar_parent_browser_engagement_total_uri_count_sum</code> in <a href="concepts/../datasets/batch_view/clients_daily/reference.html"><code>clients_daily</code></a> and <a href="concepts/../datasets/bigquery/clients_last_seen/reference.html"><code>clients_last_seen</code></a>. Note that <code>is_core_active_v1</code> can be <code>true</code> on days where clients did not send a ping or browse at least 1 URI, so long as the aforesaid condition still holds.</p>
<h3 id="activity-segments-informal"><a class="header" href="#activity-segments-informal">Activity Segments (informal)</a></h3>
<p><code>clients_last_seen.activity_segments_v1</code></p>
<p>This column classifies each client-day into one of the following informal segments:</p>
<ul>
<li>
<p><code>infrequent_user</code>: client that browsed at least 1 URI in at least 1 and up to 6 days in the past 28 days.</p>
</li>
<li>
<p><code>casual_user</code>: client that browsed at least 1 URI in at least 7 and up to 13 days in the past 28 days.</p>
</li>
<li>
<p><code>regular_user</code>: client that browsed at least 1 URI in at least 14 and up to 20 days in the past 28 days.
(note that this differs from <code>regular_user_v3</code>)</p>
</li>
<li>
<p><code>core_user</code>: client that browsed at least 1 URI in at least 21 of the past 28 days.</p>
</li>
<li>
<p><code>other</code>: client does not meet any of the criteria above (i.e. they sent pings in at least 1 day out of the previous
28 but did not browse any URIs).</p>
</li>
</ul>
<p>Note that these are informal segments provided for convenience: one should not, for example, assume that
there are inherent differences between infrequent and casual users. Also note that the buckets do not divide up
the past 28 days evenly. One can use <code>clients_last_seen.days_visited_1_uri_bits</code> to define their own criteria if a
different breakdown is desired, as in the sketch below.</p>
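<p>A hedged sketch of that approach, assuming the 28 least-significant bits of <code>days_visited_1_uri_bits</code> cover the past 28 days (including the current <code>submission_date</code>):</p>
<pre><code class="language-sql">-- Hedged sketch: re-derive activity segments from the raw bit pattern.
SELECT
  submission_date,
  client_id,
  CASE
    WHEN BIT_COUNT(days_visited_1_uri_bits) &gt;= 21 THEN 'core_user'
    WHEN BIT_COUNT(days_visited_1_uri_bits) &gt;= 14 THEN 'regular_user'
    WHEN BIT_COUNT(days_visited_1_uri_bits) &gt;= 7 THEN 'casual_user'
    WHEN BIT_COUNT(days_visited_1_uri_bits) &gt;= 1 THEN 'infrequent_user'
    ELSE 'other'
  END AS my_activity_segment
FROM telemetry.clients_last_seen
WHERE submission_date = '2020-02-01'
  AND days_since_seen = 0  -- only clients that sent a ping on this day
</code></pre>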
<h2 id="writing-queries-with-user-statessegments"><a class="header" href="#writing-queries-with-user-statessegments">Writing queries with user states/segments</a></h2>
<p>When a user state is defined with respect to a user's <em>behavior</em> (e.g. usage levels) as opposed to more stable
traits (e.g. country),
we should evaluate each user's user state eligibility
using data collected <em>before</em> the time period in which we want to study their actions.
Otherwise we run the risk of making trivial discoveries
like &quot;heavy users use the product heavily&quot; instead of more meaningful ones
like &quot;heavy users go on to continue using the product heavily&quot;.</p>
<p>So, when writing queries to compute user states directly from their definition,
be sure to compute users' user states using only
data collected before the time period in which you're analyzing their behavior.</p>
<p>User states are found as columns in the <code>clients_last_seen</code> dataset: the user state listed for a client on a <code>submission_date</code> is valid for that <code>submission_date</code> because it is computed only using behavioral data collected <em>before</em> the <code>submission_date</code>.</p>
<p>TODO: mention that user states are only really defined on days the user is active</p>
<h3 id="wau-and-mau"><a class="header" href="#wau-and-mau">WAU and MAU</a></h3>
<p>Users can move in or out of specific user states part way through a week or a month.
This poses a conundrum if we want to plot the WAU or MAU for a user state.</p>
<p>Our convention is to <em>count the number of distinct users who were active in the user state in the period</em>: e.g. &quot;MAU(sent a ping as a regular user v3)&quot;.
So if a user was active as a regular user v3 on e.g. the second day of a 28-day window, then they will contribute to &quot;regular user v3 MAU&quot; regardless of whether they lost their &quot;regular user v3&quot; status at any point in the 28-day window.</p>
<p>Since many user states use the full extent of the <code>*_bits</code> column wizardry in <code>clients_last_seen</code>, you'll have to query WAU or MAU the old-fashioned way (the query below uses a 1% sample, <code>sample_id = 42</code>, which is why the count is scaled by 100):</p>
<pre><code class="language-sql">WITH dates AS (
SELECT *
FROM UNNEST(GENERATE_DATE_ARRAY('2020-05-01', '2020-07-01')) as d
) SELECT
dates.d AS submission_date,
COUNT(DISTINCT client_id) * 100 AS regular_user_v3_mau,
FROM dates
INNER JOIN telemetry.clients_last_seen cls
ON cls.submission_date BETWEEN DATE_SUB(dates.d, INTERVAL 27 DAY) AND dates.d
AND cls.submission_date BETWEEN '2020-04-01' AND '2020-07-01'
WHERE cls.sample_id = 42
AND cls.is_regular_user_v3
AND cls.days_since_seen = 0
GROUP BY dates.d
ORDER BY dates.d
</code></pre>
<p>Our convention has the potentially counterintuitive consequence that a user can count towards &quot;MAU(sent a ping as a regular user v3)&quot; and &quot;MAU(sent a ping as not a regular user v3)&quot; for the same 28-day window.
If you need to break MAU down into the sum of MAU for various user states, then in this instance you would need to break it down into &quot;only regular v3&quot;, &quot;only not regular v3&quot;, and &quot;both&quot;.</p>
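<p>For example, a hedged sketch that splits a single 28-day window (here arbitrarily ending on 2020-06-30) into those three mutually exclusive buckets; table and column names follow the examples above:</p>
<pre><code class="language-sql">WITH per_client AS (
  SELECT
    client_id,
    -- active as a regular user v3 on at least one day in the window
    LOGICAL_OR(is_regular_user_v3) AS ever_regular,
    -- active as a non-regular user v3 on at least one day in the window
    LOGICAL_OR(NOT is_regular_user_v3) AS ever_not_regular
  FROM telemetry.clients_last_seen
  WHERE submission_date BETWEEN '2020-06-03' AND '2020-06-30'  -- 28 days
    AND days_since_seen = 0  -- only days the client was active
  GROUP BY client_id
)
SELECT
  COUNTIF(ever_regular AND NOT ever_not_regular) AS only_regular_v3,
  COUNTIF(NOT ever_regular AND ever_not_regular) AS only_not_regular_v3,
  COUNTIF(ever_regular AND ever_not_regular) AS both_states
FROM per_client
</code></pre>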
<p>It might be tempting to assign users to whichever user state they happened to be in at the end of the window: this quantity is easy to query.
But many of the user states were defined to be meaningful <em>on days the users were active</em>.
&quot;Regular users v3&quot; is predictive of retention, <em>given that the user was active on the day of interest as a regular user</em>.
If someone has been using the browser every day for a year but then suddenly churns, then it's misleading to consider them to be active as a &quot;not regular user v3&quot; for MAU on the period ending the 15th day after their last activity.
A user can be &quot;new or resurrected v3&quot; for only one day in a 28-day period: unless they appear for the first time on the last day of the month, the user will not qualify as &quot;new or resurrected v3&quot; at the end of the MAU window!
So beware this trap and try to only use user states on days the users are active.</p>
<h3 id="example-queries"><a class="header" href="#example-queries">Example queries</a></h3>
<p>DAU for <em>regular users v3</em>:</p>
<pre><code class="language-sql">SELECT
submission_date,
COUNTIF(is_regular_user_v3) AS dau_regular_users_v3
FROM `moz-fx-data-shared-prod.telemetry.clients_last_seen`
WHERE
submission_date BETWEEN '2020-01-01' AND '2020-03-01'
AND days_since_seen = 0 -- Get DAU from clients_last_seen
GROUP BY submission_date
</code></pre>
<p>DAU for <em>regular users v3</em>, but joining from a different table:</p>
<pre><code class="language-sql">SELECT
cd.submission_date,
COUNTIF(is_regular_user_v3) AS dau_regular_users_v3
FROM clients_daily cd
INNER JOIN clients_last_seen cls
ON cls.client_id = cd.client_id
AND cls.submission_date = cd.submission_date
AND cls.submission_date BETWEEN '2020-01-01' AND '2020-03-01'
WHERE
cd.submission_date BETWEEN '2020-01-01' AND '2020-03-01'
GROUP BY cd.submission_date
</code></pre>
<p>Using <code>is_core_active_v1</code>:</p>
<p>Here are two basic ways to calculate a time series that counts the number of clients qualifying as Core Active:</p>
<ol>
<li>On a 28-day basis. Here we ask: for a given day, how many clients qualify as Core Active based on the 28-day window ending on that day?
This is equivalent to looking at a sliding 28-day window where we evaluate the 28-day history of each client on
the last day of the window and ask whether they meet the criteria. This means that they do not necessarily have to
be active (either sent a ping or browsed at least 1 URI) on the last day of the window to qualify. Below we show an
example of a query that returns this time series.</li>
</ol>
<pre><code class="language-sql">SELECT submission_date,
COUNTIF(is_core_active_v1) as number_core_actives
FROM telemetry.clients_last_seen
WHERE submission_date &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 7 DAY)
GROUP BY 1
ORDER BY 1
</code></pre>
<ol start="2">
<li>On a daily basis. Here we ask: of all the users who sent a ping on a given day, how many of them qualify as Core
Active? In this case, we restrict ourselves to looking only at clients who reported telemetry (sent a main ping)
on a given day, and then ask how many of them qualify as core active based on their history in the most recent
28 day window. This is equivalent to asking what subset of DAU qualifies as Core Active. The query here is similar
to the one above, with one addition to the WHERE clause:</li>
</ol>
<pre><code class="language-sql">SELECT submission_date,
COUNTIF(is_core_active_v1) as number_core_actives
FROM telemetry.clients_last_seen
WHERE submission_date &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 7 DAY) AND days_since_seen = 0
GROUP BY 1
ORDER BY 1
</code></pre>
<h2 id="obsolete-user-states"><a class="header" href="#obsolete-user-states">Obsolete user states</a></h2>
<h3 id="usage-regularity-v2"><a class="header" href="#usage-regularity-v2">Usage regularity v2</a></h3>
<p>This is a set of three segments.
On a given day, every client falls into exactly one of these segments.
Each client's segment can be computed from <code>telemetry.clients_last_seen.days_visited_5_uri_bits</code>.</p>
<p><em>Regular users v2</em> is defined as
clients who browsed &gt;=5 URIs on <em>at least eight</em> of the previous 27 days.
As of February 2020 this segment contained approximately 2/3 of DAU
and its users had a 1-week retention for a 5 URI day usage criterion of approximately 95%.</p>
<p><em>New/irregular users v2</em> is defined as
clients who browsed &gt;=5 URIs on <em>none</em> of the previous 27 days.
As of February 2020 this segment contained approximately 15% of DAU,
and had a retention for a 5 URI day usage criterion of about 10%
(though &quot;activation&quot; is likely a more relevant word than &quot;retention&quot; for many of these clients).</p>
<p><em>Semi-regular users v2</em> is defined as
clients who browsed &gt;=5 URIs on <em>between one and seven</em> of the previous 27 days,
i.e. it contains users who do not fit the other two segments at this time.
As of February 2020 this segment contained approximately 20% of DAU,
and had a retention for a 5 URI day usage criterion of about 60%.
We do not yet know what proportion of users in this segment stay in this segment for an extended period, and what proportion are in transition between other segments.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/segments.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="data-and-experiments"><a class="header" href="#data-and-experiments">Data and experiments</a></h1>
<h2 id="nimbus"><a class="header" href="#nimbus">Nimbus</a></h2>
<p>Nimbus is Firefox's cross-platform experimentation tool.</p>
<p>You can learn more about Nimbus at <a href="https://experimenter.info">https://experimenter.info</a>.</p>
<p><a href="concepts/../datasets/jetstream.html">Jetstream</a> analyzes Nimbus experiments. Results appear in <a href="https://experimenter.services.mozilla.com">Experimenter</a>.</p>
<p>Nimbus experiments are <a href="concepts/../datasets/experiment_telemetry.html">represented in telemetry</a> the same way Normandy experiments are.</p>
<h2 id="normandy"><a class="header" href="#normandy">Normandy</a></h2>
<p>Normandy is an experimentation platform for Firefox desktop.</p>
<p><a href="concepts/../datasets/jetstream.html">Jetstream</a> also analyzes Normandy experiments, although the results do not appear in the experiment console.</p>
<p>Normandy experiments are <a href="concepts/../datasets/experiment_telemetry.html">represented in telemetry</a> the same way Nimbus experiments are.</p>
<h2 id="heartbeat"><a class="header" href="#heartbeat">Heartbeat</a></h2>
<p><a href="concepts/../datasets/heartbeat.html">Heartbeat</a> is a survey mechanism controlled with Normandy.</p>
<h2 id="monitoring"><a class="header" href="#monitoring">Monitoring</a></h2>
<p>We publish <a href="concepts/../datasets/experiment_monitoring.html">aggregate datasets for experiment monitoring</a> to BigQuery.</p>
<h2 id="experiment-specific-telemetry"><a class="header" href="#experiment-specific-telemetry">Experiment-specific telemetry</a></h2>
<p>Sometimes experiments deploy custom telemetry that is not well-documented elsewhere.
We maintain <a href="concepts/../datasets/dynamic_telemetry.html">a list</a> of these datasets.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/experiments.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="metric-hub"><a class="header" href="#metric-hub">Metric Hub</a></h1>
<p>Metric Hub is a <a href="https://github.com/mozilla/metric-hub">repository</a> that contains metric, data source and segment definitions that have been
reviewed and can be seen as the source of truth.
Definitions that are part of Metric Hub can be referenced in configurations of other tooling as well, such as <a href="https://experimenter.info/deep-dives/jetstream/overview">Jetstream</a> and <a href="https://docs.telemetry.mozilla.org/cookbooks/operational_monitoring.html?highlight=opmon#operational-monitoring-opmon">OpMon</a>.</p>
<p>Metric Hub is a &quot;thin&quot; metric layer that lies between the Data Warehouse and consumers, such as analysis tools.
The Data Warehouse contains the datasets and tables referenced in the metric definitions. Analysis tools can reference metrics that are defined in metric-hub.</p>
<pre class="mermaid">graph TB
subgraph MH[&quot; &quot;]
m1 --&gt;|2. Run SQL against Source| d0(fa:fa-database Data Warehouse)
m0(fa:fa-file Metric Definitions) --&gt; m1(fa:fa-stream Metric Hub)
end
c0(fa:fa-magnifying-glass-chart Analysis Tools) --&gt;|1. Reference Metrics| m1
d0 --&gt;|3. Return Results| c0
classDef bq fill:#eff,stroke:#099;
classDef metrics fill:#efe,stroke:#090;
classDef consumer fill:#ececff,stroke:#9370db;
classDef nostyle fill:#fff,stroke:#fff;
class c0 consumer
class d0 bq
class m0,m1 metrics
class MH nostyle
</pre>
<p>Available metrics can be found in the <a href="https://mozilla.acryl.io/glossaryNode/urn:li:glossaryNode:Metric%20Hub/Contents?is_lineage_mode=false">DataHub metrics glossary</a>.</p>
<h2 id="metrics-and-statistics"><a class="header" href="#metrics-and-statistics">Metrics and Statistics</a></h2>
<p><em>Metric</em> is a very overloaded term and has different meanings in different parts of our data platform.
In the context of metric-hub there are two key concepts:</p>
<ul>
<li><em>metric</em>: A metric describes an aggregation of activities or measurements for a specific entity (e.g. clients, users, ...).
<ul>
<li>Example 1: A metric &quot;Ad Clicks&quot;, defined as <code>SUM('ad_click')</code>, counts clicks on ads for individual clients</li>
<li>Example 2: A metric &quot;Income&quot; can be calculated as <code>SUM('money_made')</code> for individual people</li>
</ul>
</li>
<li><em>statistic</em>: Statistics summarize the distribution of metrics within a specific time frame and population segment. Statistics are used to derive insights and patterns from the raw metric data
<ul>
<li>Example 1: To get the average number of daily &quot;Ad Clicks&quot; for all Windows clients over the last month, the statistic &quot;Mean&quot; can be applied. To see the distribution of ad clicks across clients in the US, &quot;Frequency Binning&quot; can be applied to the &quot;Ad Clicks&quot; metric data.</li>
<li>Example 2: To see the median monthly &quot;Income&quot; for people in the US, the &quot;Percentile&quot; statistic can be applied on the calculated &quot;Income&quot; metric aggregated over a month, with the 50th percentile representing the median</li>
<li>Different statistics are available for different tools that use metrics.</li>
</ul>
</li>
</ul>
<h2 id="adding-definitions"><a class="header" href="#adding-definitions">Adding definitions</a></h2>
<p>To add or update a project configuration, open a pull request against <a href="https://github.com/mozilla/metric-hub">metric-hub</a>.
CI checks will validate that the structure of the definitions as well as the SQL syntax is correct. A review by data science is required before changes can get merged.</p>
<p>Definitions are part of config files that are written in <a href="https://toml.io/en/">TOML</a>.
These definitions files are platform-specific and located in the <a href="https://github.com/mozilla/metric-hub/tree/main/definitions"><code>definitions/</code> directory of the metric-hub repository</a>. Definitions files are named after the platform they target, for example definitions related to Firefox Desktop are in the <code>firefox_desktop.toml</code> file.</p>
<p>Generally, configuration files have four main sections: <code>[data_sources]</code>, <code>[metrics]</code>, <code>[segments]</code>, and <code>[dimensions]</code>. All of these sections are optional.</p>
<p>Lines starting with a <code>#</code> are comments and have no effect.</p>
<h3 id="data_sources-section-1"><a class="header" href="#data_sources-section-1"><code>[data_sources]</code> Section</a></h3>
<p>Data sources specify the tables data should be queried from.</p>
<pre><code class="language-toml">[data_sources]
[data_sources.main_v1]
# FROM expression - often just a fully-qualified table name. Sometimes a subquery.
from_expression = &quot;mozdata.telemetry.main&quot;
# SQL snippet specifying the submission_date column
submission_date_column = &quot;submission_date&quot;
[data_sources.events_memory_v1]
# FROM expression - subquery
from_expression = &quot;&quot;&quot;
(
SELECT
*
FROM `moz-fx-data-shared-prod.telemetry.events`
WHERE
event_category = 'memory_watcher'
)
&quot;&quot;&quot;
submission_date_column = &quot;submission_date&quot;
</code></pre>
<p>Data sources can be joined with other data sources:</p>
<pre><code class="language-toml"># Join the `baseline` data source with the `metrics` data source.
# Definitions for both data sources must exist.
[data_sources.baseline.joins.metrics]
relationship = &quot;many_to_many&quot; # this determines the type of JOIN used; options: many_to_many, one_to_one, one_to_many, many_to_one; default: many_to_many
# SQL expression specifying the JOIN condition; the default join is on the client_id and submission_date columns
on_expression = &quot;&quot;&quot;
baseline.client_id = metrics.client_id AND
baseline.submission_date = metrics.submission_date
&quot;&quot;&quot;
</code></pre>
<p>Wildcard character can be used to apply joins to multiple data sources:</p>
<pre><code class="language-toml"># Apply join to all data sources prefixed with user_
[data_sources.user_'*'.joins.metrics]
# [default] relationship = many_to_many
# [default] on_expression = &quot;&quot;&quot; # SQL expression specifying the JOIN condition; default join is on client_id_column and submission_date_columns
# baseline.{client_id_column} = metrics.{client_id_column} AND
# baseline.{submission_date_column} = metrics.{submission_date_column}
# &quot;&quot;&quot;
</code></pre>
<blockquote>
<p>If there are multiple wildcard expressions targeting a data source, the definition that is provided
last in the config file has precedence. This means <code>joins</code> expressions can be overwritten by
re-defining a data source definition later on in the config file.</p>
</blockquote>
<h3 id="metrics-section-1"><a class="header" href="#metrics-section-1"><code>[metrics]</code> Section</a></h3>
<p>The metrics section allows you to specify metrics. A metric aggregates data and is associated with some data source.</p>
<p>Each metric is identified by a unique slug and a version (versions are optional but strongly encouraged), and can be defined by adding a new section with a name like:</p>
<p><code>[metrics.&lt;new_metric_slug&gt;_v&lt;version&gt;]</code></p>
<pre><code class="language-toml">[metrics]
[metrics.memory_pressure_count_v1]
# The data source to use. Use the slug of a data source defined in a platform-specific config,
# or else define a new data source (see above).
data_source = &quot;events_memory&quot;
# A clause of a SELECT expression with an aggregation
select_expression = &quot;SUM(SAFE_CAST(SPLIT(event_string_value, ',')[OFFSET(1)] AS NUMERIC))&quot;
# Type of the metric to be evaluated.
# This is used to determine the method of aggregation to be applied.
# Either &quot;scalar&quot; or &quot;histogram&quot;.
# scalar = a single value is returned
# histogram = an array of scalars is returned
type = &quot;scalar&quot;
# A friendly metric name displayed in dashboards.
friendly_name = &quot;Memory Pressure Count&quot;
# A description that will be displayed by dashboards.
description = &quot;Number of memory pressure events&quot;
# An optional category that can be any string value. It's currently not being used but in the future, this could be used to visually group different metrics by category.
category = &quot;performance&quot;
# An optional owner or team owning this metric. Can be a string or list of strings.
owner = &quot;example@mozilla.org&quot;
# Whether the metric is deprecated and should no longer be used (optional).
deprecated = false
# An optional string ('gold', 'silver', or 'bronze') that is the metric's current level according to the Metric Levels Taxonomy (https://mozilla-hub.atlassian.net/wiki/spaces/DATA/pages/610894135/Metrics#Metric-Levels-Taxonomy).
level = &quot;gold&quot;
</code></pre>
<p>Since metrics aggregate data, the metric SQL definition must contain some aggregation method (like <code>SUM</code>, <code>COUNT</code>, ...) to be valid.</p>
<p>Existing metrics cannot be removed after they have been added to Metric Hub: other tools or configurations might still reference the
deleted metric, causing their computations to break. Instead, to indicate that a metric should no longer be used, <code>deprecated</code> should
be set to <code>true</code>.</p>
<h4 id="statistics"><a class="header" href="#statistics">Statistics</a></h4>
<p>Statistics reduce a set of metric values to a summary describing the population.
Any summarization of the client-level data can be implemented as a statistic.</p>
<p>Different statistics are available for different tools. To specify which statistic should be applied to a specific metric, use the config files that live in the folders specific to each tool that integrates metric-hub. For example, to specify that certain statistics should be applied to the <code>memory_pressure_count_v1</code> metric in Looker, go to the <code>looker/definitions/firefox_desktop.toml</code> file and specify the statistics:</p>
<pre><code class="language-toml"># Specify which statistic to use for a metric
[metrics.memory_pressure_count_v1.statistics]
client_count = {}
mean = {}
</code></pre>
<p>Wildcard expressions can be used to express that a specific statistic should be available for multiple metrics:</p>
<pre><code class="language-toml"># All metrics with the bookmark_ prefix should have the mean computed
[metrics.bookmark_'*'.statistics.mean]
# All metrics should have client counts computed (not recommended to apply statistic to every metric)
[metrics.'*'.statistics.client_count]
</code></pre>
<p>New statistics need to be implemented inside the tooling that uses metric definitions.</p>
<h3 id="dimensions-section-1"><a class="header" href="#dimensions-section-1"><code>[dimensions]</code> Section</a></h3>
<p>Dimensions define a field or dimension on which the client population should be segmented. Dimensions are used in OpMon. For segmenting client populations in Jetstream, see the <code>[segments]</code> section.</p>
<p>For example:</p>
<pre><code class="language-toml">[dimensions]
[dimensions.os_v1]
# The data source to use. Use the slug of a data source defined in a platform-specific config,
# or else define a new data source (see above).
data_source = &quot;main&quot;
# SQL snippet referencing a field whose values should be used to segment the client population.
select_expression = &quot;normalized_os&quot;
</code></pre>
<h3 id="segments-section"><a class="header" href="#segments-section"><code>[segments]</code> Section</a></h3>
<p>Segments specify a boolean condition that determines whether a client is part of the segment. Segments are used in Jetstream; for segmenting client populations in OpMon, see the <code>[dimensions]</code> section.</p>
<pre><code class="language-toml">[segments.my_segment_v1]
# Note the aggregation function; these expressions are grouped over client_id
select_expression = '{{agg_any(&quot;is_default_browser&quot;)}}'
data_source = &quot;my_data_source&quot;
# segments require their own data source to be defined
# the standard `data_source`s cannot be used for segments
[segments.data_sources.my_data_source_v1]
from_expression = '(SELECT submission_date, client_id, is_default_browser FROM my_cool_table)'
</code></pre>
<p>Segment SQL snippets need to be boolean expressions to be valid.</p>
<h2 id="accessing-and-using-metric-definitions"><a class="header" href="#accessing-and-using-metric-definitions">Accessing and Using Metric Definitions</a></h2>
<p>All the definitions are automatically available in some of our tooling:</p>
<ul>
<li><a href="https://experimenter.info/deep-dives/jetstream/overview">Jetstream</a> - used for analyzing experiments</li>
<li><a href="https://github.com/mozilla/mozanalysis">mozanalysis</a> - a Python library which standardizes how experiment data is analyzed at Mozilla</li>
<li><a href="https://docs.telemetry.mozilla.org/cookbooks/operational_monitoring.html">OpMon</a> - a tool for monitoring operational metrics</li>
<li><a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a> - used for writing ETL queries</li>
<li><a href="https://mozilla.cloud.looker.com">Looker</a> - used for creating dashboards and ad-hoc analyses</li>
</ul>
<h3 id="using-metrics-in-etl-queries"><a class="header" href="#using-metrics-in-etl-queries">Using Metrics in ETL queries</a></h3>
<p>Metrics and data sources can be referenced in query and view definitions in <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a>.
Query and view definitions are <a href="https://jinja.palletsprojects.com/en/3.1.x/">Jinja templates</a> which have access to the <code>metrics.calculate()</code> and <code>metrics.data_source()</code> functions.</p>
<p>Metrics can be referenced as follows:</p>
<pre><code class="language-sql"> SELECT
*
FROM
{{ metrics.calculate(
metrics=['days_of_use', 'active_hours'],
platform='firefox_desktop',
group_by={'sample_id': 'sample_id', 'channel': 'application.channel'},
where='submission_date = &quot;2023-01-01&quot;'
) }}
</code></pre>
<p>This query will get translated to:</p>
<pre><code class="language-sql">SELECT
*
FROM
(
WITH clients_daily AS (
SELECT
client_id AS client_id,
submission_date AS submission_date,
COALESCE(SUM(active_hours_sum), 0) AS active_hours,
COUNT(submission_date) AS days_of_use,
FROM
mozdata.telemetry.clients_daily
GROUP BY
client_id,
submission_date
)
SELECT
clients_daily.client_id,
clients_daily.submission_date,
active_hours,
days_of_use,
FROM
clients_daily
)
</code></pre>
<p>The following parameters are available for <code>metrics.calculate()</code> to customize the query (a usage sketch follows the list):</p>
<ul>
<li><code>metrics</code>: unique reference(s) to metric definitions; all <a href="https://mozilla.github.io/metric-hub/metrics/firefox_desktop/">metric definitions</a> are aggregations (e.g. SUM, AVG, ...)</li>
<li><code>platform</code>: platform to compute metrics for (e.g. <code>firefox_desktop</code>, <code>firefox_ios</code>, <code>fenix</code>, ...)</li>
<li><code>group_by</code>: fields used in the GROUP BY statement; this is a dictionary where the key represents the alias, the value is the field path; <code>GROUP BY</code> always includes the configured <code>client_id</code> and <code>submission_date</code> fields</li>
<li><code>where</code>: SQL filter clause</li>
<li><code>group_by_client_id</code>: Whether the field configured as <code>client_id</code> (defined as part of the data source specification in metric-hub) should be part of the <code>GROUP BY</code>. <code>True</code> by default</li>
<li><code>group_by_submission_date</code>: Whether the field configured as <code>submission_date</code> (defined as part of the data source specification in metric-hub) should be part of the <code>GROUP BY</code>. <code>True</code> by default</li>
</ul>
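<p>For instance, to aggregate a metric per day across the whole population rather than per client, the grouping flags can be turned off. This is a hedged sketch using the parameters listed above; the metric slug and date are illustrative:</p>
<pre><code class="language-sql">SELECT
  *
FROM
  {{ metrics.calculate(
    metrics=['active_hours'],
    platform='firefox_desktop',
    where='submission_date = &quot;2023-01-01&quot;',
    group_by_client_id=False,
    group_by_submission_date=True
  ) }}
</code></pre>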
<p>Data sources can be referenced as follows:</p>
<pre><code class="language-sql">SELECT
*
FROM {{ metrics.data_source(
data_source=&quot;main&quot;,
platform=&quot;firefox_desktop&quot;,
where='submission_date = &quot;2023-01-01&quot;'
)
}}
</code></pre>
<p>To render queries into raw SQL queries use the <code>./bqetl query render path/to/query.sql</code> command. Rendered views and queries are also available on the <a href="https://github.com/mozilla/bigquery-etl/tree/generated-sql"><code>generated-sql</code> branch in bigquery-etl</a>. The <code>bqetl</code> tooling does support running and publishing artifacts that use Jinja (translating Jinja templates into raw SQL isn't strictly necessary to perform these actions).</p>
<h3 id="using-metrics-in-python-scripts"><a class="header" href="#using-metrics-in-python-scripts">Using Metrics in Python Scripts</a></h3>
<p>Metric definitions can also be imported into Python scripts by using the <a href="https://github.com/mozilla/metric-config-parser"><code>mozilla-metric-config-parser</code></a>. This library automatically parses the definitions in Metric Hub and returns their Python type representations.</p>
<pre><code class="language-python">from metric_config_parser.config import ConfigCollection
config_collection = ConfigCollection.from_github_repo(&quot;https://github.com/mozilla/metric-hub&quot;)
metric = config_collection.get_metric_definition(slug=&quot;active_hours&quot;, app_name=&quot;firefox_desktop&quot;)
print(metric.select_expression)
</code></pre>
<p>To use the metrics with Mozanalysis, you'll need <code>Metric</code>s not <code>MetricDefinition</code>s. For example:</p>
<pre><code class="language-python">from mozanalysis.config import ConfigLoader
metric = ConfigLoader.get_metric(metric_slug=&quot;active_hours&quot;, app_name=&quot;firefox_desktop&quot;)
</code></pre>
<h3 id="using-metrics-in-looker"><a class="header" href="#using-metrics-in-looker">Using Metrics in Looker</a></h3>
<p>Metric definitions are available in Looker. For each data source a corresponding explore exists in Looker. These explores are prefixed with &quot;Metric Definitions&quot; followed by the data source name. For example, for the Firefox Desktop <code>clients_daily</code> data source an explore &quot;Metric Definitions Clients Daily&quot; is available under the Firefox Desktop section.</p>
<p>These explores look like the following:</p>
<p><img src="concepts/../assets/looker_metric_hub.png" alt="" /></p>
<p>The side pane is split into different sections:</p>
<ul>
<li><strong>Base Fields</strong>: This section contains dimensions that are useful for filtering or segmenting the population, like channel or operating system. These base fields can be configured in metric-hub (see below).</li>
<li><strong>Metrics</strong>: This section contains all metrics that are based on the data source represented by the explore. These metrics describe an aggregation of activities or measurements on a per-client basis.</li>
<li><strong>Statistics</strong>: This section contains the <a href="https://github.com/mozilla/metric-hub/tree/main/looker">statistics that have been defined in metric-hub on top of the metric definitions</a> as measures. These statistics summarize the distribution of metrics within a specific time frame, population and/or segment and are used to derive insights and patterns from the raw metric data. Statistics have to be defined manually under the <a href="https://github.com/mozilla/metric-hub/tree/main/looker"><code>looker/</code> directory in metric-hub</a>.</li>
<li><strong>Sample of source data</strong>: Defines the sample size that should be selected from the data source. Decreasing the sample size will speed up getting results in Looker; however, it might decrease accuracy. Results are adjusted based on the sample size: for example, if a 1% sample is being used, then certain statistic results (like sum and count) will be multiplied by 100.</li>
<li><strong>Aggregate Client Metrics Per ...</strong>: This parameter controls the time window over which metrics are aggregated per client. For example, this allows you to get a weekly average of a metric or the maximum of a metric over the entire time period. By default, aggregations are on a daily basis.</li>
</ul>
<h4 id="getting-metrics-into-looker"><a class="header" href="#getting-metrics-into-looker">Getting Metrics into Looker</a></h4>
<p>Metric definitions will be available in the &quot;Metric Definition&quot; explores for metrics that have been added to the <a href="https://github.com/mozilla/metric-hub/tree/main/definitions"><code>definitions/</code> folder in metric-hub</a>.</p>
<p>Statistics on top of these metrics need to be defined in the <a href="https://github.com/mozilla/metric-hub/tree/main/looker"><code>looker/</code> folder in metric-hub</a>. Statistics currently supported by Looker are:</p>
<ul>
<li><code>sum</code></li>
<li><code>count</code></li>
<li><code>average</code></li>
<li><code>min</code></li>
<li><code>max</code></li>
<li><code>client_count</code>: distinct count of clients where the metric value is &gt;0</li>
<li><code>ratio</code>: ratio between two metrics. When configuring the statistic metric slugs need to be provided for the <code>numerator</code> and <code>denominator</code> parameters</li>
<li><code>dau_proportion</code>: Ratio between the metric and active user counts</li>
</ul>
<p>To get more statistics added, please reach out on the <a href="https://mozilla.slack.com/archives/C4D5ZA91B">#data-help</a> Slack channel.</p>
<p>To filter and segment metrics in Looker, data sources that expose fields as dimensions can be configured in metric-hub. These base field data sources need to be joined with the metric data sources. Wildcard characters can be used to apply these joins to multiple data sources:</p>
<pre><code class="language-toml">[data_sources.looker_base_fields]
select_expression = &quot;&quot;&quot;
SELECT
submission_date,
client_id,
os,
country,
channel
FROM
mozdata.telemetry.clients_daily
&quot;&quot;&quot;
columns_as_dimensions = true # expose the selected fields as dimensions in Looker
# Join `looker_base_fields` on to all the data sources that are in scope for the current file (i.e., data sources for the current application)
# The selected fields in `looker_base_fields` will show up as dimensions for all the metrics
[data_sources.'*'.joins.looker_base_fields]
# Overwrite the join, to allow for a different data source to be used as base field data source
[data_sources.baseline.joins.some_other_datasource]
relationship = &quot;many_to_many&quot;
on_expression = &quot;baseline.client_id = some_other_datasource.client_id&quot;
</code></pre>
<h4 id="example-use-cases"><a class="header" href="#example-use-cases">Example Use Cases</a></h4>
<p>Some stakeholders would like to analyze crash metrics for Firefox Desktop in Looker. First, relevant metrics, such as number of socket crashes, need to be <a href="https://github.com/mozilla/metric-hub/blob/4ef7e2ef8a53c90f77a692af4c82ef31be8bf369/definitions/firefox_desktop.toml#L1577C10-L1593C11">added to <code>definitions/firefox_desktop.toml</code></a>:</p>
<pre><code class="language-toml">[metrics.socket_crash_count_v1]
select_expression = &quot;SUM(socket_crash_count)&quot;
data_source = &quot;clients_daily&quot;
friendly_name = &quot;Client Crash Count&quot;
description = &quot;Number of Socket crashes by a single client. Filter on this field to remove clients with large numbers of crashes.&quot;
[metrics.socket_crash_active_hours_v1]
select_expression = &quot;SUM(IF(socket_crash_count &gt; 0, active_hours_sum, 0))&quot;
data_source = &quot;clients_daily&quot;
friendly_name = &quot;Client Crash Active Hours&quot;
description = &quot;Total active hours of a client with socket crashes&quot;
</code></pre>
<p>To summarize these metrics for specific channels, operating systems, etc, statistics need to be defined in <a href="https://github.com/mozilla/metric-hub/blob/4ef7e2ef8a53c90f77a692af4c82ef31be8bf369/looker/definitions/firefox_desktop.toml#L3C10-L9"><code>looker/definitions/firefox_desktop.toml</code> in metric-hub</a>:</p>
<pre><code class="language-toml">[metrics.socket_crash_count_v1.statistics.sum]
[metrics.socket_crash_active_hours_v1.statistics.sum]
[metrics.socket_crash_active_hours_v1.statistics.client_count]
[metrics.socket_crash_count_v1.statistics.ratio]
numerator = &quot;socket_crash_count_v1.sum&quot;
denominator = &quot;socket_crash_active_hours_v1.sum&quot;
</code></pre>
<p>These statistics make it possible to determine the total number of crashes, the total number of active hours with crashes, how many clients were affected, and so on.</p>
<p>The <a href="https://mozilla.cloud.looker.com/explore/firefox_desktop/metric_definitions_clients_daily?qid=KxzAcgpqBQEzaCcVxrUA3w&amp;toggle=fil,vis">Metric Definitions Clients Daily explore in Looker</a> now exposes the defined metrics in statistics which are ready to be used in dashboards or ad-hoc analyses.</p>
<h2 id="faq"><a class="header" href="#faq">FAQ</a></h2>
<h3 id="should-metrics-be-defined-in-the-metric-definition-data-source-definition-or-source-table"><a class="header" href="#should-metrics-be-defined-in-the-metric-definition-data-source-definition-or-source-table">Should metrics be defined in the metric definition, data source definition or source table?</a></h3>
<p>Definitions for metrics can be encoded at different levels. It is preferable to specify the SQL that defines how a metric should be computed as far upstream as possible. This allows the most flexible usage of metric definitions.</p>
<p>Ideally, metrics should be defined in the <code>[metrics]</code> section. However in some cases metrics might rely on more complex logic. For example, if some more complicated unnesting of fields or <code>JOIN</code>s across multiple tables are required it might make sense to move the metric computation into the <code>[data_sources]</code> definition and then simply reference the field in the <code>[metrics]</code> section. The main drawback of this is that if users want to find the definition they will have to go one layer deeper and check how the data source is defined versus just having to look at the metric definition itself.</p>
<p>For computationally expensive metrics it can make sense to set up an ETL job that computes the metrics on a daily basis and writes results to a separate table. This table can serve as basis of a data source which can then be used to define a metric.</p>
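<p>As a hedged illustration of that pattern, a daily job could materialize the expensive aggregation into its own table, which is then registered as a data source; the table and column names below are hypothetical:</p>
<pre><code class="language-sql">-- Hedged sketch of the daily query behind such an ETL job (names hypothetical);
-- the scheduler would write the result to a date-partitioned destination table.
SELECT
  @submission_date AS submission_date,
  client_id,
  COUNT(*) AS expensive_event_count  -- stand-in for a costly aggregation
FROM my_project.analysis.raw_events
WHERE DATE(event_timestamp) = @submission_date
GROUP BY client_id
</code></pre>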
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/metric_hub.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="external-data-integration-using-fivetran"><a class="header" href="#external-data-integration-using-fivetran">External data integration using Fivetran</a></h1>
<p><a href="https://www.fivetran.com/">Fivetran</a> is used for importing data from external services into our data warehouse. A <a href="https://www.fivetran.com/connectors">range of prebuilt connectors</a> are available that can be configured in the Fivetran web interface. Custom connectors can be implemented as <a href="https://fivetran.com/docs/functions/google-cloud-functions">Google Cloud Functions</a>.</p>
<h2 id="getting-access"><a class="header" href="#getting-access">Getting access</a></h2>
<p>Access to Fivetran is limited to Mozilla employees. By default, users do not have permissions to create or access connectors. To gain access, create a task in <a href="https://mozilla-hub.atlassian.net/jira/software/c/projects/DSRE/issues/">Jira under the DSRE project</a>.</p>
<p>Fivetran can be accessed using your Mozilla LDAP credentials through <code>sso.mozilla.com</code>.</p>
<h2 id="architecture"><a class="header" href="#architecture">Architecture</a></h2>
<p>Fivetran is used to extract data from external services and load it into our data warehouse. For transforming data or writing data to a different destination <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a> is used.</p>
<p>The integration of Fivetran and external data import into our data platform is illustrated in the following:</p>
<p><img src="concepts/../assets/fivetran_architecture.png" alt="" /></p>
<p>A separate connector is set up in Fivetran for each external service that data should be imported from. Imports are triggered via Airflow, which has separate DAGs, each with its own schedule. Fivetran dumps the data into our data warehouse. Once the first Airflow-triggered import is done, Airflow can trigger the ETL process, which writes the extracted data to a different location in our data warehouse.</p>
<h3 id="environments"><a class="header" href="#environments">Environments</a></h3>
<p>We have production and development environments in Fivetran.</p>
<div class="table-wrapper"><table><thead><tr><th>Environment</th><th>Description</th><th>Fivetran Warehouse</th><th>GCP Destination Project</th><th>Access</th></tr></thead><tbody>
<tr><td>Development</td><td>This environment allows developers to test their connectors during development, quickly deploy changes and make changes to imported data.</td><td><strong>Fivetran_Dev</strong></td><td><code>dev-fivetran</code></td><td>Anyone in data engineering</td></tr>
<tr><td>Production</td><td>This environment has connectors deployed that are used in production. Access to connectors and imported data is restricted.</td><td><strong>Fivetran_Prod</strong></td><td><code>moz-fx-data-bq-fivetran</code></td><td>Only automation has access. If access to the data is required a ticket needs to be filed with SRE.</td></tr>
</tbody></table>
</div>
<p>To prevent Fivetran from having access to our data warehouse, separate GCP projects have been provisioned for Fivetran to dump imported data into. The data can be transformed and moved to different projects and datasets using bigquery-etl.</p>
<p>When working in the <strong>Fivetran_Dev</strong> environment, data is written to the <code>dev-fivetran</code> GCP project in BigQuery. All datasets are by default accessible to data engineers. So when working with sensitive data, it might be necessary to manually update the access permissions to the dataset. Ensure that the Fivetran service account has the necessary permissions to write data to the dataset when making updates.</p>
<p>Once the development has finished, either manually delete previously created datasets or set an expiration date so that the datasets will be removed automatically after some time.</p>
<p>There are also some additional Fivetran environments that are not managed through Data Engineering and are not integrated into our data platform.</p>
<h2 id="setting-up-connectors"><a class="header" href="#setting-up-connectors">Setting up connectors</a></h2>
<p>This is a step-by-step guide for setting up a new connector in Fivetran, scheduling it via Airflow and transforming imported data via bigquery-etl:</p>
<ol>
<li>
<p>Log in to <a href="https://fivetran.com">Fivetran</a> and select the environment the new connector should be created in.</p>
</li>
<li>
<p>Click on the &quot;Add Connector&quot; button and search for the service data that should be imported from. If no connector is available for the service please check out <a href="concepts/external_data_integration_using_fivetran.html#developing-custom-connectors">&quot;Developing custom connectors&quot;</a> for writing custom connectors.</p>
</li>
<li>
<p>Specify the name of the <code>destination schema</code> (BigQuery dataset name where data will be loaded into)</p>
</li>
<li>
<p>Configure the connector by providing API credentials and other required settings.</p>
</li>
<li>
<p>Finish the setup and wait for the connection tests to pass.</p>
</li>
<li>
<p>Once the connector has been created a historical data import will be automatically triggered. This import can take a few minutes up to several days depending on how much historical data is available.</p>
<ul>
<li>By default, Fivetran schedules imports to run every 6 hours. This schedule will be ignored when scheduling the import through Airflow as explained in the following steps.</li>
</ul>
</li>
<li>
<p>Specify the data transformation in <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a> or <a href="https://github.com/mozilla/private-bigquery-etl">private-bigquery-etl</a> when working with sensitive data. Please follow the <a href="https://mozilla.github.io/bigquery-etl/cookbooks/creating_a_derived_dataset/">general guide of creating a derived dataset</a>. When writing the query, data will be queried from the dataset that was specified when the connector was set up. The connector documentation specifies what tables will be available and what the schemas will look like.</p>
<ul>
<li>Make sure to query from the correct GCP project, depending on the environment the connector was set up for.</li>
</ul>
</li>
<li>
<p><a href="https://github.com/mozilla/bigquery-etl/blob/main/docs/reference/scheduling.md">Scheduling</a> the ETL and the Fivetran import is also done in bigquery-etl. In the <code>metadata.yaml</code> file of the transformation queries, in addition to specifying the DAG which will be generated automatically, also specify the Fivetran tasks that the ETL depends on, for example:</p>
<pre><code class="language-yaml">scheduling:
dag_name: bqetl_external_service
depends_on_fivetran:
- task_id: fivetran_import_1
- task_id: another_fivetran_import
</code></pre>
</li>
<li>
<p>Once the changes have been merged into bigquery-etl and the updated Airflow DAGs are available in Airflow, the connector ID needs to be configured in the Airflow Admin interface. The connector ID identifies the specific Fivetran connector that should be triggered. In the Airflow DAGs, each Fivetran import task references an Airflow variable. To configure these variables, add a new entry in the <a href="https://workflow.telemetry.mozilla.org/variable/list/">Airflow Admin - Variables settings</a>. The <strong>Key</strong> needs to be set to the variable name as shown in the DAG source; the <strong>Value</strong> is the Fivetran connector ID, which can be copied from the Connection Details of the Fivetran connector in the &quot;Setup&quot; tab.</p>
</li>
<li>
<p>Enable the DAG in Airflow and trigger an import.</p>
<ul>
<li>By default the Airflow DAG is turned off and needs to be activated manually through the Airflow web UI. Once the first run has started, the Fivetran connector will be updated to only run when triggered manually. If a schedule has been configured through the Fivetran web UI then it will be overwritten.</li>
</ul>
</li>
</ol>
<h2 id="developing-custom-connectors"><a class="header" href="#developing-custom-connectors">Developing custom connectors</a></h2>
<p>Fivetran might not provide a prebuilt connector for a specific service. It is possible to implement custom connectors that will be deployed as <a href="https://fivetran.com/docs/functions/google-cloud-functions">Google Cloud Functions</a>. The code bases of these connectors are part of one of the following repositories:</p>
<ul>
<li><a href="https://github.com/mozilla/fivetran-connectors">Fivetran Connectors</a> - This repository contains custom connectors that can be shared publicly.</li>
<li><a href="https://github.com/mozilla/private-fivetran-connectors">Private Fivetran Connectors</a> - This repository contains custom connectors that should not be shared publicly since the code bases might contain sensitive information.</li>
</ul>
<p>When developing connectors use the <strong>Fivetran_Dev</strong> environment. This environment allows for faster changes, data is directly accessible and can be deleted by developers.
Deploy connectors to the <strong>Fivetran_Prod</strong> environment only after the connector has been thoroughly tested and the data schema is unlikely to change.
A detailed guide of how to create a new custom connector step by step, best practices and tips around debugging are available <a href="https://github.com/mozilla/fivetran-connectors#development">here</a>.</p>
<h2 id="getting-help-3"><a class="header" href="#getting-help-3">Getting help</a></h2>
<ul>
<li><a href="https://mozilla.slack.com/archives/C4D5ZA91B"><strong>#data-help</strong></a> - for questions about the data imported from external services and derived datasets</li>
<li><a href="https://mozilla.slack.com/archives/C02GZTFM08M"><strong>#fivetran-discussion</strong></a> - for questions regarding the development of custom connectors or setting up connectors</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/external_data_integration_using_fivetran.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="projects"><a class="header" href="#projects">Projects</a></h1>
<p>Below are a number of trailheads that lead into the projects and code that comprise the Firefox Data Platform.</p>
<h2 id="telemetry-apis"><a class="header" href="#telemetry-apis">Telemetry APIs</a></h2>
<div class="table-wrapper"><table><thead><tr><th>Name and repo</th><th>Description</th></tr></thead><tbody>
<tr><td><a href="https://github.com/mozilla/python_moztelemetry"><code>python_moztelemetry</code></a></td><td>Python APIs for Mozilla Telemetry</td></tr>
<tr><td><a href="https://github.com/mozilla/moztelemetry"><code>moztelemetry</code></a></td><td>Scala APIs for Mozilla Telemetry</td></tr>
<tr><td><a href="https://github.com/mozilla/spark-hyperloglog"><code>spark-hyperloglog</code></a></td><td>Algebird's HyperLogLog support for Apache Spark</td></tr>
<tr><td><a href="https://github.com/mozilla/mozanalysis"><code>mozanalysis</code></a></td><td>A library for Mozilla experiments analysis</td></tr>
<tr><td><a href="https://github.com/mozilla-mobile/android-components/tree/master/components/service/glean"><code>glean</code></a></td><td>A client-side mobile Telemetry SDK for collecting metrics and sending them to Mozilla's Telemetry service</td></tr>
</tbody></table>
</div>
<h2 id="etl-code-and-datasets"><a class="header" href="#etl-code-and-datasets">ETL code and Datasets</a></h2>
<div class="table-wrapper"><table><thead><tr><th>Name and repo</th><th>Description</th></tr></thead><tbody>
<tr><td><a href="https://github.com/mozilla/bigquery-etl"><code>bigquery-etl</code></a></td><td>SQL ETL code for building derived datasets in BigQuery</td></tr>
<tr><td><a href="https://github.com/mozilla/telemetry-batch-view"><code>telemetry-batch-view</code></a></td><td>Scala ETL code for derived datasets</td></tr>
<tr><td><a href="https://github.com/mozilla/python_mozetl"><code>python_mozetl</code></a></td><td>Python ETL code for derived datasets</td></tr>
<tr><td><a href="https://github.com/mozilla/telemetry-airflow"><code>telemetry-airflow</code></a></td><td>Airflow configuration and DAGs for scheduled jobs</td></tr>
<tr><td><a href="https://github.com/mozilla/python_mozaggregator"><code>python_mozaggregator</code></a></td><td>Aggregation job for <code>telemetry.mozilla.org</code> aggregates</td></tr>
<tr><td><a href="https://github.com/mozilla/telemetry-streaming"><code>telemetry-streaming</code></a></td><td>Spark Streaming ETL jobs for Mozilla Telemetry</td></tr>
</tbody></table>
</div>
<p>See also <a href="https://docs.telemetry.mozilla.org"><code>data-docs</code></a> for documentation on datasets.</p>
<h2 id="infrastructure"><a class="header" href="#infrastructure">Infrastructure</a></h2>
<div class="table-wrapper"><table><thead><tr><th>Name and repo</th><th>Description</th></tr></thead><tbody>
<tr><td><a href="https://github.com/mozilla-services/mozilla-pipeline-schemas"><code>mozilla-pipeline-schemas</code></a></td><td>JSON and Parquet Schemas for Mozilla Telemetry and other structured data</td></tr>
<tr><td><a href="https://github.com/mozilla/gcp-ingestion"><code>gcp-ingestion</code></a></td><td>Documentation and implementation of the Mozilla telemetry ingestion system on Google Cloud Platform</td></tr>
<tr><td><a href="https://github.com/mozilla/jsonschema-transpiler"><code>jsonschema-transpiler</code></a></td><td>Convert JSON Schema into BigQuery table definitions</td></tr>
<tr><td><a href="https://github.com/mozilla/mozilla-schema-generator"><code>mozilla-schema-generator</code></a></td><td>Incorporate probe metadata to generate BigQuery table schemas</td></tr>
<tr><td><a href="https://github.com/mozilla-services/hindsight"><code>hindsight</code></a></td><td>Real-time data processing</td></tr>
<tr><td><a href="https://github.com/mozilla-services/lua_sandbox"><code>lua_sandbox</code></a></td><td>Generic sandbox for safe data analysis</td></tr>
<tr><td><a href="https://github.com/mozilla-services/lua_sandbox_extensions"><code>lua_sandbox_extensions</code></a></td><td>Modules and packages that extend the Lua sandbox</td></tr>
<tr><td><a href="https://github.com/mozilla-services/nginx_moz_ingest"><code>nginx_moz_ingest</code></a></td><td>Nginx module for Telemetry data ingestion</td></tr>
<tr><td><a href="https://github.com/mozilla-services/puppet-config/tree/master/pipeline"><code>puppet-config</code></a></td><td>Cloud services puppet config for deploying infrastructure</td></tr>
<tr><td><a href="https://github.com/mozilla/parquet2hive"><code>parquet2hive</code></a></td><td>Hive import statement generator for Parquet datasets</td></tr>
<tr><td><a href="https://github.com/mozilla-services/edge-validator"><code>edge-validator</code></a></td><td>A service endpoint for validating incoming data</td></tr>
</tbody></table>
</div>
<h2 id="data-applications"><a class="header" href="#data-applications">Data applications</a></h2>
<div class="table-wrapper"><table><thead><tr><th>Name and repo</th><th>Description</th></tr></thead><tbody>
<tr><td><a href="https://github.com/mozilla/telemetry-dashboard"><code>telemetry.mozilla.org</code></a></td><td>Main entry point for viewing <a href="https://telemetry.mozilla.org">aggregate Telemetry data</a></td></tr>
<tr><td><a href="https://github.com/mozilla/glam">Glean Aggregate Metrics</a></td><td>Aggregate info about probes and measures</td></tr>
<tr><td><a href="https://debug-ping-preview.firebaseapp.com">Glean Debug View</a></td><td>Tag and view Glean submissions with low latency</td></tr>
<tr><td><a href="https://github.com/mozilla/redash">Redash</a></td><td>Mozilla's fork of the <a href="https://sql.telemetry.mozilla.org">data query / visualization system</a></td></tr>
<tr><td><a href="https://github.com/mozilla/redash-stmo"><code>redash-stmo</code></a></td><td>Mozilla's extensions to Redash</td></tr>
<tr><td><a href="https://github.com/mozilla/taar">TAAR</a></td><td>Telemetry-aware addon recommender</td></tr>
<tr><td><a href="https://github.com/mozilla/ensemble">Ensemble</a></td><td>A minimalist platform for publishing data</td></tr>
<tr><td><a href="https://github.com/mozilla/firefox-hardware-report">Hardware Report</a></td><td>Firefox Hardware Report, <a href="https://data.firefox.com/dashboard/hardware">available here</a></td></tr>
<tr><td><a href="https://github.com/mozilla/stmocli">St. Mocli</a></td><td>A command-line interface to <a href="https://sql.telemetry.mozilla.org">STMO</a></td></tr>
<tr><td><a href="https://github.com/mozilla/probe-scraper">probe-scraper</a></td><td>Scrape and publish Telemetry probe data from Firefox</td></tr>
<tr><td><a href="https://github.com/mozilla/firefox-test-tube">test-tube</a></td><td>Compare data across branches in experiments</td></tr>
<tr><td><a href="https://github.com/mozilla/experimenter">experimenter</a></td><td>A web application for managing experiments</td></tr>
<tr><td><a href="https://github.com/mozilla/stmoab">St. Moab</a></td><td>Automatically generate Redash dashboard for A/B experiments</td></tr>
</tbody></table>
</div>
<h2 id="legacy-projects"><a class="header" href="#legacy-projects">Legacy projects</a></h2>
<p>Projects in this section are less active, but may not be officially
deprecated. Please check with the <code>fx-data-dev</code> mailing list before
starting a new project using anything in this section.</p>
<div class="table-wrapper"><table><thead><tr><th>Name and repo</th><th>Description</th></tr></thead><tbody>
<tr><td><a href="https://github.com/mozilla/telemetry-next-node"><code>telemetry-next-node</code></a></td><td>A <code>node.js</code> package for accessing Telemetry Aggregates data</td></tr>
<tr><td><a href="https://github.com/mozilla/emr-bootstrap-spark"><code>emr-bootstrap-spark</code></a></td><td>AWS bootstrap scripts for Spark.</td></tr>
<tr><td><a href="https://github.com/mozilla/emr-bootstrap-presto"><code>emr-bootstrap-presto</code></a></td><td>AWS bootstrap scripts for Presto.</td></tr>
</tbody></table>
</div>
<h2 id="reference-materials"><a class="header" href="#reference-materials">Reference materials</a></h2>
<h3 id="public"><a class="header" href="#public">Public</a></h3>
<div class="table-wrapper"><table><thead><tr><th>Name and repo</th><th>Description</th></tr></thead><tbody>
<tr><td><a href="https://github.com/mozilla/data-docs"><code>data-docs</code></a></td><td>All the info you need to <a href="https://docs.telemetry.mozilla.org">answer questions about Firefox users with data</a></td></tr>
<tr><td>Firefox source docs</td><td><a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/">Mozilla Source Tree Docs - Telemetry section</a></td></tr>
<tr><td><a href="https://github.com/mozilla/mozilla-reports"><code>mozilla.report</code></a></td><td>Knowledge repository for public reports (archived)</td></tr>
</tbody></table>
</div>
<h3 id="non-public"><a class="header" href="#non-public">Non-public</a></h3>
<div class="table-wrapper"><table><thead><tr><th>Name and repo</th><th>Description</th></tr></thead><tbody>
</tbody></table>
</div><footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/tools/projects.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="dataset-reference-1"><a class="header" href="#dataset-reference-1">Dataset Reference</a></h1>
<p>This section contains reference material on some of the major datasets we store in BigQuery.
Reading this section front to back is not recommended.
Instead, identify a dataset you'd like to understand better and read through
its tutorial.
After reading the tutorial, you should know all you need about that dataset.</p>
<p>Detailed per-table docs are available <a href="https://mozilla.acryl.io/">in the Data Catalog</a>.</p>
<p>Each tutorial should include:</p>
<ul>
<li>Introduction
<ul>
<li>A short overview of why we built the dataset and what need it's meant to solve</li>
<li>What data source the data is collected from,
and a high level overview of how the data is organized</li>
<li>How it is stored and how to access the data</li>
</ul>
</li>
<li>Reference
<ul>
<li>An example query to give the reader an idea of what the data looks like
and how it is meant to be used</li>
<li>How the data is processed and sampled</li>
<li>How frequently it's updated, and how it's scheduled</li>
<li>An up-to-date schema for the dataset</li>
<li>How to augment or modify the dataset</li>
</ul>
</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="raw-ping-data"><a class="header" href="#raw-ping-data">Raw Ping Data</a></h1>
<blockquote>
<p><strong>⚠</strong> This article discusses pings sent by Firefox's legacy v4 telemetry system.
See the <a href="https://mozilla.github.io/glean/book/user/pings/index.html">Glean documentation on pings</a> for newer applications written using the Glean SDK.</p>
</blockquote>
<ul>
<li><a href="datasets/pings.html#introduction">Introduction</a></li>
<li><a href="datasets/pings.html#ping-types">Ping Types</a>
<ul>
<li><a href="datasets/pings.html#main-ping">&quot;main&quot; ping</a></li>
<li><a href="datasets/pings.html#first-shutdown-ping">&quot;first-shutdown&quot; ping</a></li>
<li><a href="datasets/pings.html#event-ping">&quot;event&quot; ping</a></li>
<li><a href="datasets/pings.html#update-ping">&quot;update&quot; ping</a></li>
<li><a href="datasets/pings.html#new-profile-ping">&quot;new-profile&quot; ping</a></li>
<li><a href="datasets/pings.html#crash-ping">&quot;crash&quot; ping</a></li>
<li><a href="datasets/pings.html#deletion-request-ping">&quot;deletion-request&quot; ping</a></li>
<li><a href="datasets/pings.html#coverage-ping">&quot;coverage&quot; ping</a></li>
</ul>
</li>
<li><a href="datasets/pings.html#pingsender">Pingsender</a></li>
<li><a href="datasets/pings.html#ping-metadata">Ping Metadata</a></li>
<li><a href="datasets/pings.html#analysis">Analysis</a></li>
<li><a href="datasets/pings.html#further-reading">Further Reading</a></li>
</ul>
<h2 id="introduction-3"><a class="header" href="#introduction-3">Introduction</a></h2>
<p>We receive data from our users via <strong>pings</strong>.
There are several types of pings,
each containing different measurements and sent for different purposes.
To review a complete list of ping types and their schemata, see
<a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/index.html">this section of the Mozilla Source Tree Docs</a>.</p>
<p>Pings are also described by a JSONSchema specification which can be found in <a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/tree/master/schemas/telemetry">the <code>mozilla-pipeline-schemas</code> repository</a>.</p>
<p>There are a few pings that are central to delivering our core data collection
primitives (Histograms, Events, Scalars) and for keeping an eye on Firefox
behaviour (Environment, New Profiles, Updates, Crashes).</p>
<p>For instance, a user's first session in Firefox might have four pings like this:</p>
<p><img src="datasets/images/first_session_pings.png" alt="Flowchart of pings in the user's first session" /></p>
<h2 id="ping-types"><a class="header" href="#ping-types">Ping Types</a></h2>
<h3 id="main-ping"><a class="header" href="#main-ping">&quot;main&quot; ping</a></h3>
<p>The <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/main-ping.html">&quot;main&quot; ping</a> is the workhorse of the Firefox Telemetry system.
It delivers the Telemetry Environment as well as Histograms and Scalars for all
process types that collect data in Firefox. It has several variants, each with
specific delivery characteristics:</p>
<div class="table-wrapper"><table><thead><tr><th>Reason</th><th>Sent when</th><th>Notes</th></tr></thead><tbody>
<tr><td>shutdown</td><td>Firefox session ends cleanly</td><td>Accounts for about 80% of all &quot;main&quot; pings (<a href="https://sql.telemetry.mozilla.org/queries/3434"><code>STMO#3434</code></a>). Sent by Pingsender immediately after Firefox shuts down, subject to conditions: Firefox 55+, if the OS isn't also shutting down, and if this isn't the client's first session. If Pingsender fails or isn't used, the ping is sent by Firefox at the beginning of the next Firefox session.</td></tr>
<tr><td>daily</td><td>It has been more than 24 hours since the last &quot;main&quot; ping, and it is around local midnight</td><td>In long-lived Firefox sessions we might go days without receiving a &quot;shutdown&quot; ping. Thus the &quot;daily&quot; ping is sent to ensure we occasionally hear from long-lived sessions.</td></tr>
<tr><td>environment-change</td><td>Telemetry Environment changes</td><td>Sent immediately when triggered by Firefox (installing or removing an addon, or changing a monitored user preference, are common ways for the Telemetry Environment to change)</td></tr>
<tr><td>aborted-session</td><td>Firefox session doesn't end cleanly</td><td>Sent by Firefox at the beginning of the next Firefox session.</td></tr>
</tbody></table>
</div>
<p>It was introduced in Firefox 38.</p>
<h3 id="first-shutdown-ping"><a class="header" href="#first-shutdown-ping">&quot;first-shutdown&quot; ping</a></h3>
<p>The <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/first-shutdown-ping.html">&quot;first-shutdown&quot; ping</a> is identical to the &quot;main&quot;
ping with reason &quot;shutdown&quot; created at the end of the user's first session,
but sent with a different ping type. It was introduced when we started
using Pingsender to send shutdown pings, since we would suddenly begin
receiving a large number of first-session &quot;shutdown&quot; pings.</p>
<p>It is sent using Pingsender.</p>
<p>It was introduced in Firefox 57.</p>
<h3 id="event-ping"><a class="header" href="#event-ping">&quot;event&quot; ping</a></h3>
<p>The <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/event-ping.html">&quot;event&quot; ping</a> provides low-latency eventing support to Firefox
Telemetry. It delivers the Telemetry Environment, Telemetry Events from all
Firefox processes, and some diagnostic information about Event Telemetry. It is
sent every hour if there have been events recorded, and up to once every 10
minutes (governed by a <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/internals/preferences.html">preference</a>) if the maximum event limit
for the ping (defaults to 1000 per process, governed by a
<a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/internals/preferences.html">preference</a>) is reached before the hour is up.</p>
<p>It was introduced in Firefox 62.</p>
<h3 id="update-ping"><a class="header" href="#update-ping">&quot;update&quot; ping</a></h3>
<p>Firefox Update is the most important means we have of reaching our users with
the latest fixes and features. The <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/update-ping.html">&quot;update&quot; ping</a> notifies us
when an update is downloaded and ready to be applied (reason: &quot;ready&quot;) and when
the update has been successfully applied (reason: &quot;success&quot;). It contains the
Telemetry Environment and information about the update.</p>
<p>It was introduced in Firefox 56.</p>
<h3 id="new-profile-ping"><a class="header" href="#new-profile-ping">&quot;new-profile&quot; ping</a></h3>
<p>When a user starts up Firefox for the first time, a profile is created.
Telemetry marks the occasion with the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/new-profile-ping.html">&quot;new-profile&quot; ping</a>
which sends the Telemetry Environment. It is sent either 30 minutes after Firefox
starts running for the first time in this profile (reason: &quot;startup&quot;) or at the
end of the profile's first session (reason: &quot;shutdown&quot;), whichever comes first.
&quot;new-profile&quot; pings are sent immediately when triggered. Those with reason
&quot;startup&quot; are sent by Firefox. Those with reason &quot;shutdown&quot; are sent by
Pingsender.</p>
<p>It was introduced in Firefox 55.</p>
<h3 id="crash-ping"><a class="header" href="#crash-ping">&quot;crash&quot; ping</a></h3>
<p>The <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/crash-ping.html">&quot;crash&quot; ping</a> provides diagnostic information whenever a
Firefox process exits abnormally. Unlike the &quot;main&quot; ping with reason
&quot;aborted-session&quot;, this ping does not contain Histograms or Scalars. It
contains a Telemetry Environment, <a href="https://searchfox.org/mozilla-central/source/toolkit/crashreporter/CrashAnnotations.yaml">Crash Annotations</a>, and
<a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/crash-ping.html#stack-traces">Stack Traces</a>.</p>
<p>It was introduced in Firefox 40.</p>
<h3 id="deletion-request-ping"><a class="header" href="#deletion-request-ping">&quot;deletion-request&quot; ping</a></h3>
<p>In the event a user opts out of Telemetry, we send one final
<a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/data/deletion-request-ping.html">&quot;deletion-request&quot; ping</a> to let us know. It contains
only the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/common-ping.html">common ping data</a> and an empty payload.</p>
<p>It was introduced in Firefox 72, replacing the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/obsolete/optout-ping.html">&quot;optout&quot; ping</a>
(which was in turn introduced in Firefox 63).</p>
<h3 id="coverage-ping"><a class="header" href="#coverage-ping">&quot;coverage&quot; ping</a></h3>
<p>The <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/coverage-ping.html">coverage ping</a> (<a href="https://blog.mozilla.org/data/2018/08/20/effectively-measuring-search-in-firefox/">announcement</a>)
is a periodic census intended to estimate telemetry opt-out rates.</p>
<p>We estimate that <a href="https://docs.google.com/document/d/1EnQoq9o1sLXTgsbbG8mPnUiWIOmKSINGJOAKyYUm_YA/edit#">93% of release channel
profiles</a>
have telemetry enabled (and are therefore included in DAU).</p>
<h2 id="pingsender-1"><a class="header" href="#pingsender-1">Pingsender</a></h2>
<p><a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/internals/pingsender.html">Pingsender</a> is a small application shipped with Firefox which
attempts to send pings even if Firefox is not running. If Firefox has crashed or has already shut
down, we would otherwise have to wait for the next Firefox session to begin before
these pings could be sent.</p>
<p>Pingsender was introduced in Firefox 54 to send &quot;crash&quot; pings. It was expanded
to send &quot;main&quot; pings of reason &quot;shutdown&quot; in Firefox 55 (excepting the first
session). It sends the &quot;first-shutdown&quot; ping since its introduction in Firefox 57.</p>
<h2 id="ping-metadata"><a class="header" href="#ping-metadata">Ping Metadata</a></h2>
<p>The data pipeline appends metadata to arriving pings containing
information about the ingestion environment including timestamps,
Geo-IP data about the client,
and fields extracted from the ping or client headers that are useful for downstream processing.</p>
<p>These fields are available in BigQuery ping tables inside the <code>metadata</code> struct, described in detail
in <a href="datasets/../cookbooks/new_ping.html">the &quot;Ingestion Metadata&quot; section of this article</a>.</p>
<p>Since the metadata are not present in the ping as it is sent by the client,
these fields are documented here, instead of in the source tree docs.</p>
<p>As of September 28, 2018, members of the <code>meta</code> key on main pings include:</p>
<!-- table generated via `scripts/new_ping_metadata_table.py > src/cookbooks/new_ping_metadata_table.md` -->
<div class="table-wrapper"><table><thead><tr><th>field</th><th>description</th></tr></thead><tbody>
<tr><td><code>additional_properties</code></td><td>A JSON string containing any payload properties not present in the schema</td></tr>
<tr><td><code>document_id</code></td><td>The document ID specified in the URI when the client sent this message</td></tr>
<tr><td><code>normalized_app_name</code></td><td>Set to &quot;Other&quot; if this message contained an unrecognized app name</td></tr>
<tr><td><code>normalized_channel</code></td><td>Set to &quot;Other&quot; if this message contained an unrecognized channel name</td></tr>
<tr><td><code>normalized_country_code</code></td><td>An ISO 3166-1 alpha-2 country code</td></tr>
<tr><td><code>normalized_os</code></td><td>Set to &quot;Other&quot; if this message contained an unrecognized OS name</td></tr>
<tr><td><code>normalized_os_version</code></td><td>N/A</td></tr>
<tr><td><code>sample_id</code></td><td>Hashed version of client_id (if present) useful for partitioning; ranges from 0 to 99</td></tr>
<tr><td><code>submission_timestamp</code></td><td>Time when the ingestion edge server accepted this message</td></tr>
<tr><td><code>metadata.user_agent.browser</code></td><td>N/A</td></tr>
<tr><td><code>metadata.user_agent.os</code></td><td>N/A</td></tr>
<tr><td><code>metadata.user_agent.version</code></td><td>N/A</td></tr>
<tr><td><code>metadata.uri.app_build_id</code></td><td>N/A</td></tr>
<tr><td><code>metadata.uri.app_name</code></td><td>N/A</td></tr>
<tr><td><code>metadata.uri.app_update_channel</code></td><td>N/A</td></tr>
<tr><td><code>metadata.uri.app_version</code></td><td>N/A</td></tr>
<tr><td><code>metadata.header.date</code></td><td>Date HTTP header</td></tr>
<tr><td><code>metadata.header.dnt</code></td><td>DNT (Do Not Track) HTTP header</td></tr>
<tr><td><code>metadata.header.x_debug_id</code></td><td>X-Debug-Id HTTP header</td></tr>
<tr><td><code>metadata.header.x_pingsender_version</code></td><td>X-PingSender-Version HTTP header</td></tr>
<tr><td><code>metadata.geo.city</code></td><td>City name</td></tr>
<tr><td><code>metadata.geo.country</code></td><td>An ISO 3166-1 alpha-2 country code</td></tr>
<tr><td><code>metadata.geo.db_version</code></td><td>The specific <a href="https://dev.maxmind.com/geoip/geoip2/geoip2-city-country-csv-databases/">Geo database</a> version used for this lookup</td></tr>
<tr><td><code>metadata.geo.subdivision1</code></td><td>First major country subdivision, typically a state, province, or county</td></tr>
<tr><td><code>metadata.geo.subdivision2</code></td><td>Second major country subdivision; not applicable for most countries</td></tr>
<tr><td><code>metadata.isp.db_version</code></td><td>The specific <a href="https://dev.maxmind.com/geoip/geoip2/geoip2-isp-csv-database/">ISP database</a> version used for this lookup</td></tr>
<tr><td><code>metadata.isp.name</code></td><td>The name of the Internet Service Provider</td></tr>
<tr><td><code>metadata.isp.organization</code></td><td>The name of a specific business entity when available; otherwise the ISP name</td></tr>
</tbody></table>
</div>
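<p>For example, here is a minimal sketch of a query over these metadata fields, assuming the live &quot;main&quot; ping table is exposed as <code>moz-fx-data-shared-prod.telemetry.main</code>; the date and <code>sample_id</code> values are arbitrary and only serve to limit the amount of data scanned:</p>
<pre><code class="language-sql">-- Count yesterday's "main" pings by Geo country and whether they arrived
-- via Pingsender, using the metadata struct described above.
SELECT
  metadata.geo.country AS country,
  metadata.header.x_pingsender_version IS NOT NULL AS via_pingsender,
  COUNT(*) AS ping_count
FROM
  `moz-fx-data-shared-prod.telemetry.main`
WHERE
  DATE(submission_timestamp) = DATE_SUB(CURRENT_DATE, INTERVAL 1 DAY)
  AND sample_id = 42
GROUP BY
  1,
  2
ORDER BY
  ping_count DESC
</code></pre>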
<h2 id="analysis-1"><a class="header" href="#analysis-1">Analysis</a></h2>
<p>The <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/main-ping.html">main ping</a> includes histograms, scalars, and other performance and diagnostic data.
Since Firefox 62, it <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1460595">no longer contains event data</a>; events are now sent in a separate <code>event</code> ping.</p>
<p><a href="datasets/./derived.html">Derived datasets</a> are processed from ping tables. They are intended to be:</p>
<ul>
<li>Easier and faster to query</li>
<li>Organized to make the data easier to analyze</li>
</ul>
<p>Ping data lives in BigQuery and is accessible in <a href="https://sql.telemetry.mozilla.org/">STMO</a>;
see the <a href="datasets/../cookbooks/bigquery.html">BigQuery cookbook section</a> for more information.
Before analyzing raw ping data,
check if a derived dataset can answer your question.
If you do need to work with raw ping data, be aware that the volume of data can be high.
Try to limit the size of your data by controlling the date range, and start off using a sample.</p>
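<p>As a minimal sketch of that advice, the query below restricts both the date range and the sample (a single <code>sample_id</code> is roughly a 1% sample of clients); it assumes the raw &quot;main&quot; ping table is exposed as <code>moz-fx-data-shared-prod.telemetry.main</code>, and the dates are arbitrary:</p>
<pre><code class="language-sql">-- Daily client counts per channel from raw "main" pings, on a 1% sample
-- and a one-week window to keep the amount of data scanned small.
SELECT
  DATE(submission_timestamp) AS submission_date,
  normalized_channel,
  COUNT(DISTINCT client_id) AS n_clients
FROM
  `moz-fx-data-shared-prod.telemetry.main`
WHERE
  DATE(submission_timestamp) BETWEEN '2023-01-01' AND '2023-01-07'
  AND sample_id = 42
GROUP BY
  1,
  2
ORDER BY
  1,
  2
</code></pre>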
<h2 id="further-reading"><a class="header" href="#further-reading">Further Reading</a></h2>
<p>You can find <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/index.html">the complete ping documentation</a> in the Firefox source documentation.
To augment our data collection, see <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/start/adding-a-new-probe.html">Collecting New Data</a> and the
<a href="https://wiki.mozilla.org/Firefox/Data_Collection">Data Collection Policy</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/pings.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="derived-datasets-1"><a class="header" href="#derived-datasets-1">Derived Datasets</a></h1>
<p>See <a href="datasets/../cookbooks/bigquery/accessing_desktop_data.html">Accessing Desktop Data</a>
for a discussion on the differences between pings and derived datasets.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/derived.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="intro"><a class="header" href="#intro">Intro</a></h1>
<p>The <code>active_profiles</code> dataset gives client-level estimates of whether a profile
is still an active user of the browser at a given point in time, as well as probabilistic forecasts
of the client's future activity. These quantities are estimated by a model that attempts to infer
and decouple a client's latent propensity to leave Firefox and become inactive, as well as their
latent propensity to use the browser while still active. These estimates are currently
generated for release desktop browser profiles only, across all operating systems and
geographies.</p>
<h1 id="model"><a class="header" href="#model">Model</a></h1>
<p>The model generates predictions for each client by looking at just the recency and frequency of a
client's daily usage within the previous 90-day window. Usage is defined by a daily binary
indicator of whether the client shows up in <code>clients_daily</code> on a given day.</p>
<p>The table contains columns related to these quantities:</p>
<ul>
<li><code>submission_date</code>: Day marking the end of the 90 day window. Earliest <code>submission_date</code> that
the table covers is <code>'2019-05-13'</code>.</li>
<li><code>min_day</code>: First day in the window that the client was seen. This could be anywhere between
the first day in the window and the last day in the window.</li>
<li><code>max_day</code>: Last day in the window the client was seen. The highest value this can be is
<code>submission_date</code>.</li>
<li><code>recency</code>: Age of client in days.</li>
<li><code>frequency</code>: Number of days in the window that a client has returned to use the browser
after <code>min_day</code>.</li>
<li><code>num_opportunities</code>: Given a first appearance at <code>min_day</code>, the highest number of
days the client could have returned; that is, the highest possible value for <code>frequency</code>.</li>
</ul>
<p>Since the model uses only these two coarse-grained statistics, these columns should make it
relatively straightforward to interpret why the model made the predictions it did for a given
profile.</p>
<h2 id="latent-quantities"><a class="header" href="#latent-quantities">Latent quantities</a></h2>
<p>The model estimates the expected value of two related latent probability variables for each user. The
values in <code>prob_daily_leave</code> give our expectation of the probability that a user will become inactive
on a given day, and <code>prob_daily_usage</code> represents the probability that a user will return on a given
day, <em>given that they are still active</em>.</p>
<p>These quantities could be useful for disentangling usage <em>rate</em> from the likelihood that a user is
still using the browser. We could, for example, identify intense users who are at risk of
churning, or users who at first glance appear to have churned, but are actually just infrequent
users.</p>
<p><code>prob_active</code> is the expected value of the probability that a user is still active on
<code>submission_date</code>, given their most recent 90 days' of activity. 'Inactive' in this sense
means that the profile will not use the browser again, whether because they have uninstalled
the browser or for some other reason.</p>
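<p>As a minimal sketch of the idea above, the following query counts profiles that look like heavy users while active but are probably no longer active; the 0.5 and 0.2 thresholds are arbitrary illustrations, not recommended cut-offs:</p>
<pre><code class="language-sql">SELECT
  COUNT(*) AS n_heavy_but_likely_inactive
FROM
  `telemetry.active_profiles`
WHERE
  submission_date = '2019-08-01'
  AND sample_id = 1
  AND prob_daily_usage &gt; 0.5  -- heavy usage while still active
  AND prob_active &lt; 0.2       -- but probably no longer active
</code></pre>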
<h2 id="predictions"><a class="header" href="#predictions">Predictions</a></h2>
<p>There are several columns of the form <code>e_total_days_in_next_7_days</code>, which give the expected
number of times that a user will show up in the next 7 days (or 14, 21, 28 days). These
predictions take into account both the likelihood that a user will become inactive in the
future, as well as their daily propensity to use the browser, given that they are still active.
The values in <code>e_total_days_in_next_7_days</code> will be between 0 and 7.</p>
<p>An estimate for the probability that a client will contribute to MAU is available in the
column <code>prob_mau</code>. This is simply the probability that the user will return at any point in
the following 28 days, thereby contributing to MAU. Since it is a probability, the values will
range between 0 and 1, just like <code>prob_daily_leave</code> and <code>prob_daily_usage</code>.</p>
<h2 id="attributes"><a class="header" href="#attributes">Attributes</a></h2>
<p>There are several columns that contain attributes of the client, like <code>os</code>, <code>locale</code>,
<code>normalized_channel</code>, <code>normalized_os_version</code>, and <code>country</code>. <code>sample_id</code> is also included,
which can be useful for quicker queries, as the table is clustered by this column in BigQuery.</p>
<h2 id="remarks-on-the-model"><a class="header" href="#remarks-on-the-model">Remarks on the model</a></h2>
<p>A way to think about the model that infers these quantities is to imagine a simple process
where each client is given 2 weighted coins when they become a user, which they flip each
day. Since they're weighted, the probability of heads won't be 50%, but rather some probability
between 0 and 100%, specific to each client's coin. One coin, called <code>L</code>, comes up heads with
probability <code>prob_daily_leave</code>, and if it ever comes up heads, the client will never use the
browser again. The daily usage coin, <code>U</code>, comes up heads with probability <code>prob_daily_usage</code>. <em>While
they are still active</em>, clients flip this coin each day to decide whether they will use the browser
on that day, and show up in <code>clients_daily</code>.</p>
<p>The combination of these two coin flipping processes results in a history of activity that we
can see in <code>clients_daily</code>. While the model is simple, it has very good predictive power that
can tell, <em>in aggregate</em>, how many users will still be active at some point in the future.
A downside of the model's simplicity, however, is that its predictions are not highly tailored
to an individual client. The very simplified features do not take into account things like
seasonality, or finer grained attributes of their usage (like active hours, addons, etc.).
Further, the predictions in this table only account for existing users that have been seen in
the 90 days of history, and so longer term forecasts of user activity would need to somehow model
new users separately.</p>
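<p>To make the coin analogy concrete, here is a minimal sketch of how an expectation like <code>e_total_days_in_next_7_days</code> follows from the two probabilities under the simplified coin process; this is an illustration only, not the production model, and the parameter values are made up:</p>
<pre><code class="language-sql">-- Under the simplified coin process, a client is still active after t days
-- with probability (1 - prob_daily_leave)^t, and uses the browser on such a
-- day with probability prob_daily_usage.
SELECT
  SUM(prob_daily_usage * POW(1 - prob_daily_leave, day)) AS e_total_days_in_next_7_days
FROM
  (SELECT 0.03 AS prob_daily_leave, 0.40 AS prob_daily_usage),
  UNNEST(GENERATE_ARRAY(1, 7)) AS day
</code></pre>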
<h1 id="caveats-and-future-work"><a class="header" href="#caveats-and-future-work">Caveats and future work</a></h1>
<p>Due to the lightweight feature space of the model, the predictions perform better at the
population level than at the individual client level, and there will be a lot of client-level
variation in behavior. That is, when grouping clients by different dimensions, say all of the
<code>en-IN</code> users on Darwin, the <em>average</em> MAU prediction should be quite close, but individual users'
behavior can deviate significantly from the predictions.</p>
<p>The model will also be better at medium- to longer-term forecasts. In particular, the model
will not be well suited to give predictions for new users who have appeared only once in the data
set very recently. These constitute a disproportionately large share of users, but do not
have enough history for this model to make good use of them.
These single day profiles are currently the subject of
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1507073">an investigation</a>
that will hopefully yield good heuristics for users that only show up for a single day.</p>
<h1 id="sample-query"><a class="header" href="#sample-query">Sample query</a></h1>
<p><a href="https://console.cloud.google.com/bigquery?sq=630180991450:648f8e0a2faa4d86847fe8d27daf1938">Here</a> is
a sample query that will give averages for predicted MAU, probability that users are still
active, and other quantities across different operating systems:</p>
<pre><code class="language-sql">SELECT
os,
cast(sum(prob_mau) AS int64) AS predicted_mau,
count(*) AS n,
round(avg(prob_active) * 100, 1) AS prob_active,
round(avg(prob_daily_leave) * 100, 1) AS prob_daily_leave,
round(avg(prob_daily_usage) * 100, 1) AS prob_daily_usage,
round(avg(e_total_days_in_next_28_days), 1) AS e_total_days_in_next_28_days
FROM
`telemetry.active_profiles`
WHERE
submission_date = '2019-08-01'
AND sample_id = 1
GROUP BY
1
HAVING
count(*) &gt; 100
ORDER BY
avg(prob_daily_usage) DESC
</code></pre>
<h2 id="scheduling-1"><a class="header" href="#scheduling-1">Scheduling</a></h2>
<p>The code behind the model can be found in the <a href="https://github.com/wcbeard/bgbb_lib/"><code>bgbb_lib</code> repo</a>,
or on <a href="https://pypi.org/project/bgbb/">PyPI</a>. The Airflow job is defined in the
<a href="https://github.com/wcbeard/bgbb_airflow"><code>bgbb_airflow</code> repo</a>.</p>
<p>The model to fit the parameters is run weekly, and the table is updated daily.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/active_profiles.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="active-users-aggregates-reference"><a class="header" href="#active-users-aggregates-reference">Active Users Aggregates reference</a></h1>
<ul>
<li><a href="datasets/bigquery/active_users_aggregates/reference.html#introduction">Introduction</a></li>
<li><a href="datasets/bigquery/active_users_aggregates/reference.html#background">Background</a></li>
<li><a href="datasets/bigquery/active_users_aggregates/reference.html#datasets">Datasets</a>
<ul>
<li><a href="datasets/bigquery/active_users_aggregates/reference.html#active-users-aggregates">Active users aggregates</a></li>
<li><a href="datasets/bigquery/active_users_aggregates/reference.html#active-users-aggregates-for-device">Active users aggregates for device</a></li>
<li><a href="datasets/bigquery/active_users_aggregates/reference.html#active-users-aggregates-for-attribution">Active users aggregates for attribution</a></li>
</ul>
</li>
<li><a href="datasets/bigquery/active_users_aggregates/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/bigquery/active_users_aggregates/reference.html#code-reference">Code Reference</a></li>
<li><a href="datasets/bigquery/active_users_aggregates/reference.html#change-control">Change control</a></li>
<li><a href="datasets/bigquery/active_users_aggregates/reference.html#location-and-how-to-query">Location and how to query</a></li>
<li><a href="datasets/bigquery/active_users_aggregates/reference.html#looker-visualizations-with-period-over-period-analysis">Looker visualizations with period over period analysis</a></li>
</ul>
<h2 id="introduction-4"><a class="header" href="#introduction-4">Introduction</a></h2>
<p><code>active_users_aggregates</code> is a set of tables designed for analysis of client
activity on a daily, weekly, and monthly basis, starting from the submission date
of the first baseline ping received by our servers.</p>
<h2 id="background"><a class="header" href="#background">Background</a></h2>
<h5 id="phase-1"><a class="header" href="#phase-1">Phase 1</a></h5>
<p>These aggregates were initially designed to support the migration of GUD, the
Growth and Usage Dashboard, as described in the <a href="https://docs.google.com/document/d/1qvWO49Lr_Z_WErh3I3058A3B1YuiuURx19K3aTdmejM/edit?usp=sharing">Initial proposal</a>.
This resulted in three views that collect data for all browsers:</p>
<ul>
<li><code>telemetry.active_users_aggregates</code> for an overall analysis of active users and search metrics.</li>
<li><code>telemetry.active_users_aggregates_device</code> for analysis of active users based on unique devices where the client is installed.</li>
<li><code>telemetry.active_users_aggregates_attribution</code> for analysis of active users based on the first attribution reported by each client.</li>
</ul>
<h5 id="phase-2"><a class="header" href="#phase-2">Phase 2</a></h5>
<p>As part of the performance and quality improvement efforts, and to support the <a href="https://docs.google.com/document/d/1fxnwHRd6EmtKqzuy8SZpl_6CyQIhsZMCUFUQq6gmvtA/edit#">KPI source of truth proposal</a>,
the view <code>telemetry.active_users_aggregates</code> is now based on two views, one for Desktop and one for Mobile data.
The underlying setup is one table per browser, located in the corresponding BigQuery dataset, e.g. <code>fenix_derived.active_users_aggregates_v1</code>.</p>
<p>This new setup has a set of benefits:</p>
<ul>
<li>Data can be queried per browser, which reduces the amount of data scanned, lowering query cost and improving query response time.</li>
<li>Browser-specific logic and KPIs only require modifying the corresponding table; changes are immediately reflected in the unioned views without further effort.</li>
<li>Browser-specific implementations only require backfilling the corresponding table, which reduces cost and time.</li>
</ul>
<h2 id="datasets-1"><a class="header" href="#datasets-1">Datasets</a></h2>
<h4 id="active-users-aggregates"><a class="header" href="#active-users-aggregates">Active users aggregates</a></h4>
<p>This aggregate contains the metrics daily, weekly, and monthly active users,
new profiles, and search counts, aggregated by product and various
business dimensions: attribution, channel, country, city, date,
device model, distribution id, segment, and OS details.</p>
<h4 id="active-users-aggregates-for-device"><a class="header" href="#active-users-aggregates-for-device">Active users aggregates for device</a></h4>
<p>The <code>active_users_aggregates_device</code> view contains the metrics
daily, weekly, and monthly active users, new profiles, and search counts,
with additional detail about the OS and the device on which the client is installed.</p>
<p>The reason to have this aggregate in addition to <code>active_users_aggregates</code>
is to improve query performance for end users, by separating out the
device analysis; the device model is one of the largest columns in the table,
as most devices have unique identifiers.</p>
<h4 id="active-users-aggregates-for-attribution"><a class="header" href="#active-users-aggregates-for-attribution">Active users aggregates for attribution</a></h4>
<p>The <code>active_users_aggregates_attribution</code> view contains the metrics
daily, weekly, and monthly active users, new profiles, and search counts,
along with the source and context of each client installation and the <code>cohorts</code>
behaviour.</p>
<p>It can be used to query the attribution parameters that were set, in the context
of the core business dimensions: country, submission_date, <code>app_name</code>,
and whether the browser is set as default.</p>
<p>The reason to have this aggregate in addition to <code>active_users_aggregates</code>
is to improve query performance for end users, by separating out the
analysis of the numerous attribution parameters, which is needed less
regularly than other dimensions and mostly for specific purposes,
e.g. during investigations or for marketing campaigns.</p>
<p>This aggregate retrieves Fenix attribution information from
<code>fenix.firefox_android_clients</code>.</p>
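<p>As a minimal sketch of such a query, here new profiles over the last week are broken down by attribution; the column names <code>attribution_source</code>, <code>attribution_medium</code>, and <code>new_profiles</code> are assumptions for illustration, while <code>country</code> and <code>submission_date</code> are among the dimensions described above:</p>
<pre><code class="language-sql">SELECT
  attribution_source,
  attribution_medium,
  SUM(new_profiles) AS new_profiles
FROM
  `telemetry.active_users_aggregates_attribution`
WHERE
  submission_date &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 7 DAY)
  AND country = 'US'
GROUP BY
  1,
  2
ORDER BY
  new_profiles DESC
</code></pre>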
<h2 id="scheduling-2"><a class="header" href="#scheduling-2">Scheduling</a></h2>
<p>These datasets are scheduled to update daily at <code>3:30</code> in the Airflow DAG
<a href="https://workflow.telemetry.mozilla.org/home?search=bqetl_analytics_aggregations"><code>bqetl_analytics_aggregations</code></a>.</p>
<h2 id="code-reference"><a class="header" href="#code-reference">Code Reference</a></h2>
<p>The query and metadata for the unioned views (all browsers) are defined in the <code>active_users_aggregates_</code>
sub-folders in bigquery-etl under
<a href="https://github.com/mozilla/bigquery-etl/tree/main/sql/moz-fx-data-shared-prod/telemetry_derived"><code>telemetry_derived</code></a>.</p>
<p>The query and metadata per browser are auto-generated using <a href="https://github.com/mozilla/bigquery-etl/tree/main/sql_generators/active_users">SQL generators</a>
and can be accessed by selecting the generated-sql branch in the bigquery-etl GitHub repository and navigating to the browser's sub-folder.
E.g. <a href="https://github.com/mozilla/bigquery-etl/blob/generated-sql/sql/moz-fx-data-shared-prod/firefox_desktop_derived/active_users_aggregates_v1/query.sql">Firefox desktop query</a>.</p>
<h2 id="change-control"><a class="header" href="#change-control">Change control</a></h2>
<p>The underlying tables for each browser aggregate are subject to change-control.</p>
<p>This follows the <a href="https://docs.google.com/document/d/1TTJi4ht7NuzX6BPG_KTr6omaZg70cEpxe9xlpfnHj9k/edit#heading=h.ttegrcfy18ck">Infrastructure Change Control process</a> and
implies that any modifications to the corresponding code in GitHub will require the approval of members of the Data Science and Data Engineering teams.</p>
<h2 id="location-and-how-to-query"><a class="header" href="#location-and-how-to-query">Location and how to query</a></h2>
<div class="table-wrapper"><table><thead><tr><th>BigQuery view</th><th>Notes / Location in Looker</th></tr></thead><tbody>
<tr><td><code>firefox_desktop.active_users_aggregates</code></td><td>Firefox Desktop. Windows, Linux, MacOS (OS = 'Darwin').</td></tr>
<tr><td><code>firefox_ios.active_users_aggregates</code></td><td>Firefox iOS</td></tr>
<tr><td><code>fenix.active_users_aggregates</code></td><td>Fenix</td></tr>
<tr><td><code>focus_android.active_users_aggregates</code></td><td>Focus Android</td></tr>
<tr><td><code>focus_ios.active_users_aggregates</code></td><td>Focus iOS</td></tr>
<tr><td><code>klar_ios.active_users_aggregates</code></td><td>Klar iOS</td></tr>
<tr><td></td><td></td></tr>
<tr><td></td><td></td></tr>
<tr><td><code>telemetry.active_users_aggregates_mobile</code></td><td>Unioned view of mobile browsers.</td></tr>
<tr><td><code>telemetry.active_users_aggregates</code></td><td>Unioned view of mobile &amp; desktop browsers.<br/><a href="https://mozilla.cloud.looker.com/explore/combined_browser_metrics/active_users_aggregates">Looker Explore</a></td></tr>
<tr><td><code>telemetry.active_users_aggregates_device</code></td><td>Unioned view of mobile &amp; desktop browsers.<br/><a href="https://mozilla.cloud.looker.com/explore/combined_browser_metrics/active_users_aggregates_device">Looker Explore</a></td></tr>
<tr><td><code>telemetry.active_users_aggregates_attribution</code></td><td>Unioned view of mobile &amp; desktop browsers.<br/><a href="https://mozilla.cloud.looker.com/explore/combined_browser_metrics/active_users_aggregates_attribution">Looker Explore</a></td></tr>
</tbody></table>
</div>
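<p>As a minimal sketch of querying the unioned view, the query below sums daily active users per product for the last 28 days; the column names <code>dau</code>, <code>app_name</code>, and <code>country</code> are assumptions that match the metrics and dimensions described above:</p>
<pre><code class="language-sql">SELECT
  submission_date,
  app_name,
  SUM(dau) AS dau
FROM
  `telemetry.active_users_aggregates`
WHERE
  submission_date &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 28 DAY)
  AND country = 'DE'
GROUP BY
  1,
  2
ORDER BY
  1,
  2
</code></pre>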
<h2 id="looker-visualizations-with-period-over-period-analysis"><a class="header" href="#looker-visualizations-with-period-over-period-analysis">Looker visualizations with period over period analysis</a></h2>
<p>The <code>Usage</code> folder for <a href="https://mozilla.cloud.looker.com/folders/748">Mobile and Desktop browsers</a>
includes a set of visualizations that you can access directly and that are enhanced
with period-over-period analysis.</p>
<p><img src="datasets/bigquery/active_users_aggregates/img.png" alt="img.png" /></p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/bigquery/active_users_aggregates/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="addons"><a class="header" href="#addons">Addons</a></h1>
<p>Addon usage by client, partitioned by day.</p>
<h2 id="data-reference"><a class="header" href="#data-reference">Data Reference</a></h2>
<p>This dataset contains one or more records for every entry in the main ping table
that contains a non-null value for <code>client_id</code>.
Each Addons record contains info for a single addon;
if the main ping did not contain any active addons,
there will be a row with nulls for all the addon fields
(to identify <code>client_id</code>s/records without any addons).</p>
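<p>As a minimal sketch, assuming the dataset is exposed as <code>telemetry.addons</code> and partitioned by a <code>submission_date</code> column, with the <code>client_id</code> and <code>addon_id</code> fields described above (the date is arbitrary):</p>
<pre><code class="language-sql">-- Clients per addon on a single day; rows with a NULL addon_id represent
-- clients without any active addons.
SELECT
  addon_id,
  COUNT(DISTINCT client_id) AS n_clients
FROM
  `telemetry.addons`
WHERE
  submission_date = '2023-01-01'
GROUP BY
  addon_id
ORDER BY
  n_clients DESC
LIMIT
  20
</code></pre>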
<h2 id="scheduling-3"><a class="header" href="#scheduling-3">Scheduling</a></h2>
<p>This dataset is updated daily via the <a href="https://github.com/mozilla/telemetry-airflow">telemetry-airflow</a> infrastructure.
The DAG runs every day after the Main Summary data has been generated.
It is defined in <a href="https://github.com/mozilla/bigquery-etl/blob/master/dags/bqetl_addons.py"><code>dags/bqetl_addons.py</code></a>.</p>
<h2 id="code-reference-1"><a class="header" href="#code-reference-1">Code Reference</a></h2>
<p>This dataset is generated by <a href="https://github.com/mozilla/bigquery-etl/">BigQuery ETL</a>. The query that generates the dataset is <a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/telemetry_derived/addons_v2/query.sql"><code>sql/moz-fx-data-shared-prod/telemetry_derived/addons_v2/query.sql</code></a>.</p>
<p>You may find the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/environment.html#addons">environment reference</a> in The Firefox Source Documentation helpful for understanding the source data.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/batch_view/addons/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="addons-daily"><a class="header" href="#addons-daily">Addons Daily</a></h1>
<ul>
<li><a href="datasets/other/addons_daily/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/other/addons_daily/reference.html#contents">Contents</a></li>
<li><a href="datasets/other/addons_daily/reference.html#accessing-the-data">Accessing the Data</a></li>
</ul>
</li>
<li><a href="datasets/other/addons_daily/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/other/addons_daily/reference.html#example-queries">Example Queries</a>
<ul>
<li><a href="datasets/other/addons_daily/reference.html#dau-wau-and-mau-for-ublock-origin">DAU, WAU and MAU for uBlock Origin</a></li>
<li><a href="datasets/other/addons_daily/reference.html#add-ons-with-highest-organicsap-search-ratio">Add-ons with Highest Organic:SAP search ratio</a></li>
<li><a href="datasets/other/addons_daily/reference.html#adclickssap-ratio-for-popular-add-ons-around-adblocker"><code>AdClicks:SAP</code> ratio for Popular Add-ons around <code>adblocker</code></a></li>
</ul>
</li>
<li><a href="datasets/other/addons_daily/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/other/addons_daily/reference.html#schema">Schema</a></li>
</ul>
</li>
<li><a href="datasets/other/addons_daily/reference.html#code-reference">Code Reference</a></li>
</ul>
<h1 id="introduction-5"><a class="header" href="#introduction-5">Introduction</a></h1>
<p>The <code>addons_daily</code> table is a small and fast dataset with data on specific add-ons, and the users who have them installed. It contains one row per <code>addon_id</code> and <code>submission_date</code>.</p>
<h4 id="contents"><a class="header" href="#contents">Contents</a></h4>
<p>Many questions about add-ons are of the form: &quot;How many users have add-on A installed?&quot; or &quot;Are users with add-on Z more active than users with add-on Y?&quot; This dataset is aimed at answering these types of questions without having to do cumbersome joins or filters.</p>
<p>This dataset also has detailed search aggregates by each add-on, broken out by our major search engines (<code>google</code>, <code>bing</code>, <code>ddg</code>, <code>amazon</code>, <code>yandex</code>, <code>other</code>), along with total aggregates (<code>total</code>). This allows us to identify strange search patterns for add-ons that change a user's search settings on the user's behalf, often siphoning away SAP searches and Mozilla revenue (see the second example query below).</p>
<h4 id="accessing-the-data"><a class="header" href="#accessing-the-data">Accessing the Data</a></h4>
<p>The <code>addons_daily</code> table is accessible through STMO using the
<code>Telemetry (BigQuery)</code> data source.</p>
<p>See <a href="https://sql.telemetry.mozilla.org/queries/71007/source"><code>STMO#71007</code></a> for an example.</p>
<h1 id="data-reference-1"><a class="header" href="#data-reference-1">Data Reference</a></h1>
<h2 id="example-queries-1"><a class="header" href="#example-queries-1">Example Queries</a></h2>
<h4 id="dau-wau-and-mau-for-ublock-origin"><a class="header" href="#dau-wau-and-mau-for-ublock-origin">DAU, WAU and MAU for uBlock Origin</a></h4>
<pre><code class="language-sql">SELECT
submission_date,
dau,
wau,
mau
FROM
`moz-fx-data-shared-prod.telemetry.addons_daily`
WHERE
submission_date &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 28 DAY)
AND addon_id = 'uBlock0@raymondhill.net'
</code></pre>
<h4 id="add-ons-with-highest-organicsap-search-ratio"><a class="header" href="#add-ons-with-highest-organicsap-search-ratio">Add-ons with Highest Organic:SAP search ratio</a></h4>
<pre><code class="language-sql">SELECT
addon_id,
ANY_VALUE(name) as name,
AVG(dau) as avg_dau,
SAFE_DIVIDE(SUM(organic_searches.total), SUM(sap_searches.total)) as organic_sap_ratio
FROM
telemetry.addons_daily
WHERE
submission_date &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 7 DAY)
AND is_system = false
GROUP BY
1
HAVING
avg(dau) &gt; 1000
ORDER BY
4 DESC
</code></pre>
<h4 id="adclickssap-ratio-for-popular-add-ons-around-adblocker"><a class="header" href="#adclickssap-ratio-for-popular-add-ons-around-adblocker"><code>AdClicks:SAP</code> ratio for Popular Add-ons around <code>adblocker</code></a></h4>
<pre><code class="language-sql">SELECT
addon_id,
ANY_VALUE(name) as name,
AVG(dau) as avg_dau,
SAFE_DIVIDE(SUM(ad_clicks.total), SUM(sap_searches.total)) as adclick_sap_ratio
FROM
telemetry.addons_daily
WHERE
submission_date &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 7 DAY)
AND is_system = false
AND addon_id in ('uBlock0@raymondhill.net', -- 'Ublock user'
'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}', -- 'ABP user'
'jid1-NIfFY2CA8fy1tg@jetpack', --'Adblock user',
'{73a6fe31-595d-460b-a920-fcc0f8843232}', --'NoScript user',
'firefox@ghostery.com', --'Ghostery user'
'adblockultimate@adblockultimate.net', --'AdblockUltimate user'
'jid1-MnnxcxisBPnSXQ@jetpack' -- '$PrivacyBadger user'
)
GROUP BY
1
HAVING
avg(dau) &gt; 1000
ORDER BY
4 DESC
</code></pre>
<h2 id="scheduling-4"><a class="header" href="#scheduling-4">Scheduling</a></h2>
<p>This dataset is updated daily via the
<a href="https://github.com/mozilla/telemetry-airflow">telemetry-airflow</a> infrastructure.
The job runs as part of the <a href="https://github.com/mozilla/bigquery-etl/blob/master/dags/bqetl_addons.py"><code>addons_daily</code> DAG</a>.</p>
<h2 id="schema-1"><a class="header" href="#schema-1">Schema</a></h2>
<p>The data is partitioned by <code>submission_date</code>.</p>
<p>As of 2020-04-17, the current version of the <code>addons_daily</code> dataset is <code>v1</code>.</p>
<h1 id="code-reference-2"><a class="header" href="#code-reference-2">Code Reference</a></h1>
<p>This dataset is generated by
<a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/telemetry_derived/addons_daily_v1/query.sql"><code>bigquery-etl</code></a>.
Refer to this repository for information on how to run or augment the dataset.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/other/addons_daily/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="autonomous-system-aggregates"><a class="header" href="#autonomous-system-aggregates">Autonomous system aggregates</a></h1>
<p>In the normal course of processing incoming telemetry,
the contents of pings are separated from the IP address of the client that sent the ping.
Analysts do not have access to the IP address data,
and the IP address data is discarded after several days.</p>
<p>To provide some insight about the different experiences users have on different ISP networks,
while preserving the IP privacy of individual users,
this dataset computes some aggregates from the telemetry data
before the IP address information is discarded.
The dataset is computed each day from the pings received the prior day.</p>
<p>The motivating question for this dataset was to understand
which network operators are using the
<a href="https://use-application-dns.net"><code>use-application-dns.net</code></a>
canary domain to disable DNS over HTTPS (DoH) by default for clients using their networks.
If a user has not specifically turned DoH on or off,
Firefox checks for indications that DoH should not be enabled.
One of these checks is to perform a lookup for the canary domain
using the client's default DNS resolver.
If the lookup returns an <code>NXDOMAIN</code> error code indicating the canary domain does not exist,
DoH will not be enabled by default.
Network operators control this behavior
by configuring the resolvers they provision for their clients.</p>
<p>An <a href="https://en.wikipedia.org/wiki/Autonomous_system_(Internet)">autonomous system</a>
represents a network with a common routing policy,
often because it is controlled by a single entity.
Autonomous systems advertise a set of routes, representing blocks of network addresses.
We use them as a way to identify the entity controlling an IP address.</p>
<p>The <code>asn_aggregates</code> dataset,
created in <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1615269">bug 1615269</a>,
contains the columns:</p>
<ul>
<li><code>autonomous_system_number</code> (int64): the number of the autonomous system
from which pings were submitted</li>
<li><code>submission_date</code> (date): the date that pings reached the ingestion endpoint</li>
<li><code>n_clients</code> (int64): number of Firefox clients sending event pings that day, from that AS</li>
<li><code>doh_enabled</code> (int64): number of clients who sent an <code>enable_doh</code> result
<strong>for the canary heuristic</strong> that day, from that AS</li>
<li><code>doh_disabled</code> (int64): number of clients who sent a <code>disable_doh</code> result
<strong>for the canary heuristic</strong> that day, from that AS</li>
</ul>
<p>The canary heuristic indicates whether a client was able to resolve
<a href="https://use-application-dns.net"><code>use-application-dns.net</code></a>;
this is a mechanism available to network operators
who may choose to disable DoH by default for their clients.</p>
<p>We record rows only for ASs where <code>n_clients</code> is at least 500.</p>
<p>The ingestion endpoint only accepts connections with IPv4.
If that changes, clients submitting telemetry from IPv6 addresses will be ignored by this dataset.</p>
<p>The client AS number is determined by looking up the client's IP address in a MaxMind database.</p>
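<p>A minimal sketch of a query over this dataset, assuming it is exposed as <code>telemetry.asn_aggregates</code> (the column names are those listed above and the date is arbitrary):</p>
<pre><code class="language-sql">-- Share of clients per AS whose canary-heuristic result was disable_doh.
SELECT
  autonomous_system_number,
  n_clients,
  SAFE_DIVIDE(doh_disabled, doh_enabled + doh_disabled) AS share_canary_disabled
FROM
  `telemetry.asn_aggregates`
WHERE
  submission_date = '2023-01-01'
ORDER BY
  share_canary_disabled DESC
LIMIT
  20
</code></pre>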
<p>Some notes on interpretation:</p>
<ul>
<li>ASs can change route announcements frequently, so the MaxMind database may be stale</li>
<li>Telemetry is not necessarily sent on network change events,
so users may record activity on one network and submit it from another.</li>
<li>The number of distinct clients evaluating DoH heuristics is not currently captured;
if clients report both enabled and disabled states for DoH, they will be double-counted.</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/other/asn_aggregates/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="clients-daily-1"><a class="header" href="#clients-daily-1">Clients Daily</a></h1>
<p>Many questions about Firefox take the form &quot;What did clients with
characteristics X, Y, and Z do during the period S to E?&quot; The
<code>clients_daily</code> table aims to answer these questions. Each row in
the table is a (<code>client_id</code>, <code>submission_date</code>) and contains a
number of aggregates about that day's activity.</p>
<h2 id="accessing-the-data-1"><a class="header" href="#accessing-the-data-1">Accessing the Data</a></h2>
<p>The <code>clients_daily</code> table is accessible through STMO using the
<code>Telemetry (BigQuery)</code> data source.</p>
<h2 id="data-reference-2"><a class="header" href="#data-reference-2">Data Reference</a></h2>
<h3 id="example-queries-2"><a class="header" href="#example-queries-2">Example Queries</a></h3>
<h4 id="compute-churn-for-a-one-day-cohort"><a class="header" href="#compute-churn-for-a-one-day-cohort">Compute Churn for a one-day cohort:</a></h4>
<pre><code class="language-sql">SELECT
submission_date,
approx_count_distinct(client_id) AS cohort_dau
FROM
telemetry.clients_daily
WHERE
submission_date &gt; '2017-08-31'
AND submission_date &lt; '2017-10-01'
AND profile_creation_date LIKE '2017-09-01%'
GROUP BY 1
ORDER BY 1
</code></pre>
<h4 id="distribution-of-pings-per-client-per-day"><a class="header" href="#distribution-of-pings-per-client-per-day">Distribution of pings per client per day:</a></h4>
<pre><code class="language-sql">SELECT
normalized_channel,
CASE
WHEN pings_aggregated_by_this_row &gt; 50 THEN 50
ELSE pings_aggregated_by_this_row
END AS pings_per_day,
approx_count_distinct(client_id) AS client_count
FROM telemetry.clients_daily
WHERE
submission_date = '2017-09-01'
AND normalized_channel &lt;&gt; 'Other'
GROUP BY
normalized_channel,
pings_per_day
ORDER BY
pings_per_day,
normalized_channel
</code></pre>
<h2 id="scheduling-5"><a class="header" href="#scheduling-5">Scheduling</a></h2>
<p>This dataset is updated daily via the
<a href="https://github.com/mozilla/telemetry-airflow">telemetry-airflow</a> infrastructure.
The job runs as part of the <a href="https://github.com/mozilla/bigquery-etl/blob/master/dags/bqetl_main_summary.py"><code>main_summary</code> DAG</a>.</p>
<h2 id="schema-2"><a class="header" href="#schema-2">Schema</a></h2>
<p>The data is partitioned by <code>submission_date</code>.</p>
<p>As of 2019-11-28, the current version of the <code>clients_daily</code> dataset is <code>v6</code>.</p>
<h2 id="code-reference-3"><a class="header" href="#code-reference-3">Code Reference</a></h2>
<p>This dataset is generated by <a href="https://github.com/mozilla/bigquery-etl/">BigQuery ETL</a>. The query that generates the dataset is <a href="https://github.com/mozilla/bigquery-etl/blob/ad84a15d580333b41d36cfe8331e51238f3bafa1/sql/moz-fx-data-shared-prod/telemetry_derived/clients_daily_v6/query.sql"><code>sql/moz-fx-data-shared-prod/telemetry_derived/clients_daily_v6/query.sql</code></a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/batch_view/clients_daily/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="clients-last-seen-reference"><a class="header" href="#clients-last-seen-reference">Clients Last Seen Reference</a></h1>
<ul>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#content">Content</a></li>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#background-and-caveats">Background and Caveats</a></li>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#accessing-the-data">Accessing the Data</a></li>
</ul>
</li>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#field-descriptions">Field Descriptions</a>
<ul>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#activity-segmentuser-statecore-active-specific">Activity Segment/User State/Core Active Specific</a></li>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#usage-specific">Usage Specific</a></li>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#new-profile-specific">New Profile Specific</a></li>
</ul>
</li>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#example-queries">Example Queries</a>
<ul>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#compute-dau-for-non-windows-clients-for-the-last-week">Compute DAU for non-windows clients for the last week</a></li>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#compute-wau-by-channel-for-the-last-week">Compute WAU by Channel for the last week</a></li>
</ul>
</li>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#schema">Schema</a></li>
</ul>
</li>
<li><a href="datasets/bigquery/clients_last_seen/reference.html#code-reference">Code Reference</a></li>
</ul>
<h1 id="introduction-6"><a class="header" href="#introduction-6">Introduction</a></h1>
<p>The <code>clients_last_seen</code> dataset is useful for efficiently determining exact
user counts such as DAU and MAU.
It can also allow efficient calculation of other windowed usage metrics like retention via its
<a href="datasets/bigquery/clients_last_seen/../../../cookbooks/clients_last_seen_bits.html">bit pattern fields</a>.
It includes the most recent values in a 28 day window for all columns in the
<a href="datasets/bigquery/clients_last_seen//datasets/batch_view/clients_daily/reference.html"><code>clients_daily</code> dataset</a>.</p>
<h4 id="content"><a class="header" href="#content">Content</a></h4>
<p>For each <code>submission_date</code> this dataset contains one row per <code>client_id</code>
that appeared in <code>clients_daily</code> in a 28 day window including
<code>submission_date</code> and preceding days.</p>
<p>The <code>days_since_seen</code> column indicates the difference between <code>submission_date</code>
and the most recent <code>submission_date</code> in <code>clients_daily</code> where the <code>client_id</code>
appeared. A client observed on the given <code>submission_date</code> will have <code>days_since_seen = 0</code>.</p>
<p>Other <code>days_since_</code> columns use the most recent date in <code>clients_daily</code> where
a certain condition was met. If the condition was not met for a <code>client_id</code> in
a 28 day window <code>NULL</code> is used. For example <code>days_since_visited_5_uri</code> uses the
condition <code>scalar_parent_browser_engagement_total_uri_count_sum &gt;= 5</code>. These
columns can be used for user counts where a condition must be met on any day
in a window instead of using the most recent values for each <code>client_id</code>.</p>
<p>The <code>days_seen_bits</code> field stores the daily history of a client in the 28 day
window. The daily history is converted into a sequence of bits, with a <code>1</code> for
the days a client is in <code>clients_daily</code> and a <code>0</code> otherwise, and this sequence
is converted to an integer. A tutorial on how to use these bit patterns to
create filters in SQL can be found in
<a href="https://colab.research.google.com/drive/13AwwORpOtRsq22op_3rMSwPssQkJU1ok">this notebook</a>.</p>
<p>The rest of the columns use the most recent value in <code>clients_daily</code> where
the <code>client_id</code> appeared.</p>
<h4 id="background-and-caveats"><a class="header" href="#background-and-caveats">Background and Caveats</a></h4>
<p>User counts generated using <code>days_since_seen</code> only reflect the most recent
values from <code>clients_daily</code> for each <code>client_id</code> in a 28 day window.
This means Active MAU as defined cannot be efficiently calculated using <code>days_since_seen</code>
because if a given <code>client_id</code> appeared every day in February and only on February 1st had
<code>scalar_parent_browser_engagement_total_uri_count_sum &gt;= 5</code> then it would only
be counted on the 1st, and not the 2nd-28th. Active MAU can be efficiently and
correctly calculated using <code>days_since_visited_5_uri</code>.</p>
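<p>For example, a minimal sketch of such an Active-MAU-style count for a single day, using the view and columns described above:</p>
<pre><code class="language-sql">SELECT
  submission_date,
  -- days_since_visited_5_uri is NULL when the 5-URI condition was not met
  -- on any day in the 28-day window, so those clients are excluded
  COUNTIF(days_since_visited_5_uri IS NOT NULL) AS active_mau
FROM
  mozdata.telemetry.clients_last_seen
WHERE
  submission_date = '2020-04-20'
GROUP BY
  submission_date
</code></pre>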
<p>MAU can be calculated over a <code>GROUP BY submission_date[, ...]</code> clause using
<code>COUNT(*)</code>, because there is exactly one row in the dataset for each
<code>client_id</code> in the 28 day MAU window for each <code>submission_date</code>.</p>
<p>User counts generated using <code>days_since_seen</code> can use <code>SUM</code> to reduce groups,
because a given <code>client_id</code> will only be in one group per <code>submission_date</code>. So
if MAU were calculated by <code>country</code> and <code>channel</code>, summing those MAU values
across countries would give the same result as calculating MAU only by <code>channel</code>.</p>
<h4 id="accessing-the-data-2"><a class="header" href="#accessing-the-data-2">Accessing the Data</a></h4>
<p>The data is available in Re:dash and BigQuery. Take a look at this full running
<a href="https://sql.telemetry.mozilla.org/queries/62029/source#159510">example query in Re:dash (<code>STMO#159510</code>)</a>.</p>
<h1 id="data-reference-3"><a class="header" href="#data-reference-3">Data Reference</a></h1>
<h2 id="field-descriptions"><a class="header" href="#field-descriptions">Field Descriptions</a></h2>
<p>The <a href="datasets/bigquery/clients_last_seen/../../../cookbooks/clients_last_seen_bits.html"><code>*_bits</code> fields</a> store the relevant activity of a client in a 28 day
window, as a 28 bit integer.
For each bit, a 1 corresponds to the specific activity occurring on that day.</p>
<h3 id="activity-segmentuser-statecore-active-specific"><a class="header" href="#activity-segmentuser-statecore-active-specific">Activity Segment/User State/Core Active Specific</a></h3>
<p>Please see this <a href="datasets/bigquery/clients_last_seen/../../../concepts/segments.html">section</a> for descriptions regarding user states/segments.</p>
<ul>
<li><code>is_core_active_v1</code>: Boolean indicating if the client satisfies conditions of being core active on that day.</li>
<li><code>activity_segments_v1</code>: The activity segment applicable to the client that day.</li>
<li><code>is_regular_user_v3</code>: Boolean indicating if the client satisfies conditions of being a regular user on that day.</li>
<li><code>is_new_or_resurrected_v3</code>: Boolean indicating if the client satisfies conditions of being a new or resurrected user on that day.</li>
<li><code>is_weekday_regular_v1</code>: Boolean indicating if the client satisfies conditions of being a weekday regular user on that day.</li>
<li><code>is_allweek_regular_v1</code>: Boolean indicating if the client satisfies conditions of being an all-week regular user on that day.</li>
</ul>
<h3 id="usage-specific"><a class="header" href="#usage-specific">Usage Specific</a></h3>
<ul>
<li><code>days_visited_1_uri_bits</code>: Each bit field represents if a client browsed at least 1 URI on that day.</li>
<li><code>days_since_visited_1_uri</code>: Number of days since the client browsed at least 1 URI.</li>
<li><code>days_interacted_bits</code>: Each bit field represents if a client had at least 1 active tick on that day.
This is derived from the <code>active_hours_sum</code> in <code>clients_daily</code>.</li>
<li><code>days_since_interacted</code>: Number of days since the clients had at least 1 active tick.</li>
<li><code>days_had_8_active_ticks_bits</code>: Each bit field represents if a client had at least 8 active ticks on that day.
This can be used to approximate the threshold of 1 URI, and is useful for determining activity for clients using
Private Browsing Mode where URI counts are not recorded.</li>
<li><code>days_since_visited_8_active_ticks</code>: Number of days since the client had at least 8 active ticks.</li>
</ul>
<h3 id="new-profile-specific"><a class="header" href="#new-profile-specific">New Profile Specific</a></h3>
<ul>
<li><code>first_seen_date</code>: Date the client sent their first main ping.</li>
<li><code>second_seen_date</code>: Date the client sent their second main ping.</li>
<li><code>days_since_first_seen</code>: Number of days since <code>first_seen_date</code></li>
<li><code>days_since_second_seen</code>: Number of days since <code>second_seen_date</code></li>
<li><code>new_profile_5_day_activated_v1</code>: Boolean indicating if a new profile has sent a ping 5 out of their first 7 days.</li>
<li><code>new_profile_14_day_activated_v1</code>: Boolean indicating if a new profile has sent a ping 8 out of their first 14 days.</li>
<li><code>new_profile_21_day_activated_v1</code>: Boolean indicating if a new profile has sent a ping 12 out of their first 21 days.</li>
<li><code>days_since_created_profile</code>: Number of days since the profile creation date. This field is only populated when the
value is 27 days or less. Otherwise, it is NULL. <code>profile_age_in_days</code> can be used in the latter cases for all clients
who have a profile creation date.</li>
</ul>
<h2 id="example-queries-3"><a class="header" href="#example-queries-3">Example Queries</a></h2>
<h4 id="compute-dau-for-non-windows-clients-for-the-last-week"><a class="header" href="#compute-dau-for-non-windows-clients-for-the-last-week">Compute DAU for non-windows clients for the last week</a></h4>
<pre><code class="language-sql">SELECT
submission_date,
os,
COUNT(*) AS count
FROM
mozdata.telemetry.clients_last_seen
WHERE
submission_date &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 1 WEEK)
AND days_since_seen = 0
GROUP BY
submission_date,
os
HAVING
count &gt; 10 -- remove outliers
AND lower(os) NOT LIKE '%windows%'
ORDER BY
os,
submission_date DESC
</code></pre>
<h4 id="compute-wau-by-channel-for-the-last-week"><a class="header" href="#compute-wau-by-channel-for-the-last-week">Compute WAU by Channel for the last week</a></h4>
<pre><code class="language-sql">SELECT
submission_date,
normalized_channel,
COUNT(*) AS count
FROM
mozdata.telemetry.clients_last_seen
WHERE
submission_date &gt;= DATE_SUB(CURRENT_DATE, INTERVAL 1 WEEK)
AND days_since_seen &lt; 7
GROUP BY
submission_date,
normalized_channel
HAVING
count &gt; 10 -- remove outliers
ORDER BY
normalized_channel,
submission_date DESC
</code></pre>
<h2 id="scheduling-6"><a class="header" href="#scheduling-6">Scheduling</a></h2>
<p>This dataset is updated daily via the
<a href="https://github.com/mozilla/telemetry-airflow">telemetry-airflow</a>
infrastructure. The job runs as part of the
<a href="https://github.com/mozilla/bigquery-etl/blob/ad84a15d580333b41d36cfe8331e51238f3bafa1/dags/bqetl_main_summary.py#L104"><code>main_summary</code> DAG</a>.</p>
<h2 id="schema-3"><a class="header" href="#schema-3">Schema</a></h2>
<p>The data is partitioned by <code>submission_date</code>.</p>
<p>As of 2019-03-25, the current version of the <code>clients_last_seen</code> dataset is
<code>v1</code>, and the schema is visible in the BigQuery console
<a href="https://console.cloud.google.com/bigquery?p=mozdata&amp;d=telemetry&amp;t=clients_last_seen_v1&amp;page=table">here</a>.</p>
<h1 id="code-reference-4"><a class="header" href="#code-reference-4">Code Reference</a></h1>
<p>This dataset is generated by
<a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/telemetry_derived/clients_last_seen_v1/query.sql"><code>bigquery-etl</code></a>.
Refer to this repository for information on how to run or augment the dataset.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/bigquery/clients_last_seen/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="events"><a class="header" href="#events">Events</a></h1>
<p>The <code>telemetry.events</code> and <code>telemetry.events_1pct</code> derived datasets
make it easier to analyze the desktop Firefox <a href="datasets/batch_view/events/../../pings.html#event-ping">event ping</a>.
They have the following advantages over accessing the raw ping table (<code>telemetry.event</code>):</p>
<ul>
<li>There is no need to <code>UNNEST</code> the <code>events</code> column: this is already done for you.</li>
<li>You don't have to know which process type emitted your event. If you care, you can query the <code>event_process</code> column.</li>
<li>It is clustered on the <code>event_category</code> column, which can dramatically speed up your query.</li>
</ul>
<h2 id="data-reference-4"><a class="header" href="#data-reference-4">Data Reference</a></h2>
<p>The <code>events</code> dataset contains one row for each event submitted in an event ping for that day.</p>
<p>The <code>timestamp</code>, <code>category</code>, <code>method</code>, <code>object</code>, <code>value</code>, and <code>extra</code> fields of the event
are mapped to columns named <code>event_timestamp</code>, <code>event_category</code>, <code>event_method</code>, <code>event_object</code>,
<code>event_string_value</code>, and <code>event_map_values</code>.
To access the <code>event_map_values</code>, you can use the <code>mozfun.map.get_key</code> UDF,
like <code>SELECT mozfun.map.get_key(event_map_values, &quot;branch&quot;) AS branch FROM telemetry.events</code>.</p>
<p>Please note that <code>event_timestamp</code> refers to the time in milliseconds when the event was recorded <em>relative to the main process start time</em> (<code>session_start_time</code>), while the <code>timestamp</code> column refers to the time the ping was ingested. <code>event_timestamp</code> is useful for determining relative order of events within a single session. Adding <code>event_timestamp</code> to <code>session_start_time</code> will allow you to approximate the absolute time an event occurred, subject to client clock skew and other factors.</p>
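<p>A hedged sketch of that approximation, assuming <code>session_start_time</code> is exposed as a <code>TIMESTAMP</code> column alongside the event columns:</p>
<pre><code class="language-sql">SELECT
  event_category,
  event_method,
  -- offset the session start by the millisecond event timestamp;
  -- still subject to client clock skew
  TIMESTAMP_ADD(session_start_time, INTERVAL event_timestamp MILLISECOND) AS approx_event_time
FROM
  mozdata.telemetry.events
WHERE
  submission_date = '2020-04-20'
  AND event_category = 'pwmgr'
LIMIT
  100
</code></pre>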
<h3 id="sample-of-events-telemetryevents_1pct"><a class="header" href="#sample-of-events-telemetryevents_1pct">Sample of events: <code>telemetry.events_1pct</code></a></h3>
<p>The <code>telemetry.events_1pct</code> table is a consistent 1% sample from <code>telemetry.events</code>
(<code>sample_id = 0</code>) that includes 6 months of history. Using the sampled table can
be faster than hitting <code>telemetry.events</code>, particularly when iterating on a prototype
query.</p>
<p>BigQuery is also better able to estimate the amount of data it will scan when
querying <code>events_1pct</code>, so queries on <code>events_1pct</code> may be able to succeed where
the equivalent query on <code>events</code> with a <code>sample_id = 0</code> filter would be rejected
due to the query appearing to scan many TB of data.</p>
<h3 id="example-query"><a class="header" href="#example-query">Example Query</a></h3>
<p>This query gets the count of the number of times the user initiated the <code>dismiss_breach_alert</code>
and <code>learn_more_breach</code> actions. Note the use of the <code>event_category</code> to optimize the query:
for this example, this reduces the amount of data scanned from 450 GB to 52 MB.</p>
<pre><code class="language-sql">SELECT countif(event_method = 'dismiss_breach_alert') AS n_dismissing_breach_alert,
countif(event_method = 'learn_more_breach') AS n_learn_more
FROM mozdata.telemetry.events
WHERE event_category = 'pwmgr'
AND submission_date='2020-04-20'
AND sample_id=0
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/73401/source"><code>STMO#73401</code></a></p>
<h2 id="scheduling-7"><a class="header" href="#scheduling-7">Scheduling</a></h2>
<p>The events dataset is updated daily.
The job is scheduled on <a href="https://github.com/mozilla/telemetry-airflow">Airflow</a>.
The DAG is defined in <a href="https://github.com/mozilla/telemetry-airflow/blob/master/dags/copy_deduplicate.py"><code>dags/copy_deduplicate.py</code></a>.</p>
<h2 id="code-reference-5"><a class="header" href="#code-reference-5">Code Reference</a></h2>
<p>This dataset is generated by <a href="https://github.com/mozilla/bigquery-etl/">BigQuery ETL</a>. The query that generates the dataset is <a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/telemetry_derived/event_events_v1/query.sql"><code>sql/moz-fx-data-shared-prod/telemetry_derived/event_events_v1/query.sql</code></a>.</p>
<h2 id="more-information"><a class="header" href="#more-information">More Information</a></h2>
<p>Firefox has an API to record events, which are then submitted through the <code>event</code> ping.
The format and mechanism of event collection in Firefox is documented <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/events.html">in the Firefox source documentation</a>.</p>
<p>The full events data pipeline is <a href="datasets/batch_view/events/../../../concepts/pipeline/event_pipeline.html">documented in the event pipeline documentation</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/batch_view/events/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="events-daily"><a class="header" href="#events-daily">Events Daily</a></h1>
<ul>
<li><a href="datasets/bigquery/events_daily/reference.html#introduction">Introduction</a></li>
<li><a href="datasets/bigquery/events_daily/reference.html#contents">Contents</a></li>
<li><a href="datasets/bigquery/events_daily/reference.html#limitations">Limitations</a></li>
<li><a href="datasets/bigquery/events_daily/reference.html#accessing-the-data">Accessing the Data</a>
<ul>
<li><a href="datasets/bigquery/events_daily/reference.html#example-queries">Example Queries</a></li>
</ul>
</li>
<li><a href="datasets/bigquery/events_daily/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/bigquery/events_daily/reference.html#code-reference">Code Reference</a></li>
<li><a href="datasets/bigquery/events_daily/reference.html#background-and-caveats">Background and Caveats</a></li>
</ul>
<h2 id="introduction-7"><a class="header" href="#introduction-7">Introduction</a></h2>
<p>The <code>events_daily</code> datasets can help answer questions about sequences of events (sometimes called &quot;funnels&quot;).
It is used by the <a href="datasets/bigquery/events_daily/../../../cookbooks/looker/funnel_analysis_explore.html">Event Funnel Analysis Explore in Looker</a>, but can also be queried directly using BigQuery.</p>
<p>As of this writing, variations of the <code>events_daily</code> dataset are available for Firefox Desktop, Firefox for Android, and Mozilla VPN.</p>
<h2 id="contents-1"><a class="header" href="#contents-1">Contents</a></h2>
<p><code>events_daily</code> has one row per client per day, much the same as <code>clients_daily</code>. The table is created in a two-step process:</p>
<ol>
<li>An ancillary table, <code>event_types</code>, is updated with the new events seen on that day. Each event is mapped to a unique unicode character, and each event property (the <code>extras</code> fields) is also mapped to a unique unicode character.</li>
<li>For every user, that day's events are mapped to their associated unicode characters (including <code>event_properties</code>). The strings are aggregated and comma-separated, giving a single ordered string that represents all of that user's events on that day.</li>
</ol>
<p>For most products, only events in the Glean <a href="https://mozilla.github.io/glean/book/user/pings/events.html">events ping</a> are aggregated (Firefox Desktop currently aggregates events in the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/data/event-ping.html">legacy desktop &quot;event&quot; ping</a>).
If you're looking for events sent in other pings, you'll need to query them directly.</p>
<p>Included in this data is a set of dimensional information about the user, also derived from the events ping.
The full list of fields is available in the <a href="https://github.com/mozilla/bigquery-etl/blob/main/bigquery_etl/events_daily/query_templates/events_daily_v1/query.sql">templated source query</a>.</p>
<h2 id="limitations"><a class="header" href="#limitations">Limitations</a></h2>
<p>This approach makes some queries fast and easy, but has some limits:</p>
<ol>
<li>Each product is limited to at most 1 million unique event types</li>
<li>Each event property is limited to a set number of values (<a href="https://github.com/mozilla/bigquery-etl/blob/128083330cccf27923366109686aa83b5bb17e4d/bigquery_etl/events_daily/query_templates/event_types_history_v1/templating.yaml#L10">currently set to 1000 for most products</a>). As a result, some properties will not be accessible in the table.</li>
<li>Queries do not know the amount of time that passed between events, only that they occurred on the same day. This can be alleviated by sessionizing and splitting the events string using an event which indicates the start of a session. For example, for Firefox for Android, this could be <a href="https://dictionary.telemetry.mozilla.org/apps/fenix/metrics/events_app_opened"><code>events.app_opened</code></a>.</li>
</ol>
<h2 id="accessing-the-data-3"><a class="header" href="#accessing-the-data-3">Accessing the Data</a></h2>
<p>While it is possible to build queries that access this events data directly, the Data Platform instead recommends using a set of stored procedures we have available.
These procedures create BigQuery views that hide the complexity of the event representation.
The <a href="https://mozilla.github.io/bigquery-etl/mozfun/event_analysis/"><code>mozfun</code> library documentation</a> has information about these procedures and examples of their usage.</p>
<h3 id="example-queries-4"><a class="header" href="#example-queries-4">Example Queries</a></h3>
<p>This query gives the event count and client count per event per day.</p>
<pre><code class="language-sql">SELECT
submission_date,
category,
event,
COUNT(*) AS client_count,
SUM(count) AS event_count
FROM
`moz-fx-data-shared-prod`.fenix.events_daily
CROSS JOIN
UNNEST(mozfun.event_analysis.extract_event_counts(events))
JOIN
`moz-fx-data-shared-prod`.fenix.event_types
USING (index)
WHERE
submission_date &gt;= DATE_SUB(current_date, INTERVAL 28 DAY)
GROUP BY
submission_date,
category,
event
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/dashboard/fenix-events">Link to a dashboard using this query in STMO</a>.</p>
<h2 id="scheduling-8"><a class="header" href="#scheduling-8">Scheduling</a></h2>
<p>This dataset is scheduled on Airflow and updated daily.</p>
<h2 id="code-reference-6"><a class="header" href="#code-reference-6">Code Reference</a></h2>
<p>The source for events daily is defined inside bigquery-etl as a set of templated queries which generate the <code>events_daily</code> tables as well as the dependent <code>event_types</code> tables for each supported application.
You can find the source under <a href="https://github.com/mozilla/bigquery-etl/tree/main/sql_generators/events_daily"><code>bigquery_etl/sql_generators</code></a>.</p>
<h2 id="background-and-caveats-1"><a class="header" href="#background-and-caveats-1">Background and Caveats</a></h2>
<p>See <a href="https://docs.google.com/presentation/d/1hY82h_hP-pJd1j_7PsPPHn469XIQ7p4BfTH3aqRpYTk">this presentation</a> for background.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/bigquery/events_daily/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="firefox-android-clients-reference"><a class="header" href="#firefox-android-clients-reference">Firefox Android Clients reference</a></h1>
<ul>
<li><a href="datasets/bigquery/firefox_android_clients/reference.html#introduction">Introduction</a></li>
<li><a href="datasets/bigquery/firefox_android_clients/reference.html#contents">Contents</a></li>
<li><a href="datasets/bigquery/firefox_android_clients/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/bigquery/firefox_android_clients/reference.html#code-reference">Code Reference</a></li>
<li><a href="datasets/bigquery/firefox_android_clients/reference.html#how-to-query">How to query</a></li>
</ul>
<h2 id="introduction-8"><a class="header" href="#introduction-8">Introduction</a></h2>
<p>The table <code>fenix_derived.firefox_android_clients</code> contains the first observations
for Firefox Android clients, taken from whichever of the baseline,
<code>first_session</code>, and metrics pings reports first.</p>
<p>The goals of this table, as described in the
<a href="https://docs.google.com/document/d/12bj4DhCybelqHVgOVq8KJlzgtbbUw3f68palNrv-gaM/">proposal</a>:</p>
<ul>
<li>Enable client segmentation based on attribution dimensions,
e.g. <code>adjust_campaign</code> or install source.</li>
<li>Facilitate the investigation of data incidents and identification of the root cause
when one or more metrics deviate from expected values, by segmenting the data
along different dimensions.</li>
<li>Enable identifying bugs and data obtained via bots, e.g. BrowserStack.</li>
<li>Serve as the baseline to complement Glean's <code>first_session</code> ping for mobile browsers
in order to use it as a single source for first reported attributes.</li>
<li>Serve as a baseline to create a <code>first_session</code> ping for Firefox Desktop.</li>
</ul>
<h2 id="contents-2"><a class="header" href="#contents-2">Contents</a></h2>
<p>The table granularity is one row per <code>client_id</code>.</p>
<p>It contains the attribution, ISP, <code>os_version</code>, device, channel, and first
reported country for each client. The field descriptions are fully
documented in BigQuery.</p>
<p>This table contains data only for channel <code>release</code>, since it's the only
channel where data is available in the <code>first_session</code> ping at the time
of implementation and suffices for the goals. Also, data is available
since August 2020, when the migration from Fennec to Fenix took place.</p>
<h2 id="scheduling-9"><a class="header" href="#scheduling-9">Scheduling</a></h2>
<p>Incremental updates happen on a daily basis in the Airflow DAG
<a href="https://workflow.telemetry.mozilla.org/home?search=bqetl_analytics_tables"><code>bqetl_analytics_tables</code></a></p>
<p>The table is built and initialized using the <code>init.sql</code> file and
is incrementally updated using <code>query.sql</code>, including the update of
historical records when the attribution details are received from pings
that arrive at the server after the <code>first_seen</code> date.</p>
<h2 id="code-reference-7"><a class="header" href="#code-reference-7">Code Reference</a></h2>
<p>The query and metadata for the aggregates is defined in the corresponding
sub-folder in <code>bigquery-etl</code> under
<a href="https://github.com/mozilla/bigquery-etl/tree/main/sql/moz-fx-data-shared-prod/fenix_derived/firefox_android_clients_v1"><code>fenix_derived</code></a>.</p>
<h2 id="how-to-query"><a class="header" href="#how-to-query">How to query</a></h2>
<p>This table should be accessed through the user-facing view
<code>fenix.firefox_android_clients</code>, which implements additional
business logic for grouping attribution data. Join other tables to it
on <code>client_id</code>.</p>
<p>For analysis purposes, it's important to use the business date
<code>first_seen_date</code> when filtering. This date corresponds to when
the baseline ping is actually collected on the client side.</p>
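<p>A minimal sketch of such a join; the <code>adjust_network</code> column and the <code>fenix.baseline_clients_daily</code> table are assumptions for illustration, so check the documented schema in BigQuery before relying on them:</p>
<pre><code class="language-sql">-- Sketch: clients active on one day, grouped by first-observed attribution.
SELECT
  fac.first_seen_date,
  fac.adjust_network,
  COUNT(DISTINCT cd.client_id) AS n_active_clients
FROM
  `moz-fx-data-shared-prod`.fenix.firefox_android_clients AS fac
JOIN
  `moz-fx-data-shared-prod`.fenix.baseline_clients_daily AS cd
USING
  (client_id)
WHERE
  fac.first_seen_date &gt;= '2023-01-01'
  AND cd.submission_date = '2023-01-31'
GROUP BY
  fac.first_seen_date,
  fac.adjust_network
</code></pre>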
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/bigquery/firefox_android_clients/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="main-ping-tables"><a class="header" href="#main-ping-tables">Main Ping Tables</a></h1>
<p>As described in the <a href="https://docs.telemetry.mozilla.org/concepts/pipeline/schemas.html">pipeline schemas deployment docs</a>,
data engineering has a process of generating schemas for pings and deploying them to BigQuery. The Main Ping table (<code>telemetry.main</code>)
is one of those generated tables.</p>
<p>As the number of telemetry probes defined in Firefox grows, so does the number of columns in <code>telemetry.main</code>. As of January 2021, we have nearly 10,000 columns, and we ingest many terabytes of main ping data per day. This combination of a very wide schema and a high data volume means that <a href="https://console.cloud.google.com/support/cases/detail/25679061?project=moz-fx-data-shared-prod">BigQuery has to reference metadata for a very large number of files</a> each time it runs a query, even if it only ends up needing to read a small fraction of those files. This has led to a problematic experience for iterative analysis use cases.</p>
<p>To reduce the time for querying main ping data, we have included two new tables: <code>telemetry.main_1pct</code>, and <code>telemetry.main_nightly</code>. These can return results for simple queries in a matter of seconds where a logically equivalent query on <code>telemetry.main</code> may take minutes.</p>
<h2 id="main-ping-sample-telemetrymain_1pct"><a class="header" href="#main-ping-sample-telemetrymain_1pct">Main Ping Sample: <code>telemetry.main_1pct</code></a></h2>
<p>This table includes a 1% sample across all channels from <code>telemetry.main</code> (<code>sample_id = 0</code>).
It includes 6 months of history.</p>
<p>It includes an additional <code>subsample_id</code> field that is similar to <code>sample_id</code> and allows
efficient sampling for even smaller population sizes. The following query would reflect
a 0.01% sample (one ten-thousandth) of the total desktop Firefox population:</p>
<pre><code class="language-sql">SELECT
COUNT(*) AS n
FROM
mozdata.telemetry.main_1pct
WHERE
subsample_id = 0
AND DATE(submission_timestamp) = '2021-01-01'
</code></pre>
<p>The choice of implementation for <code>subsample_id</code> is not particularly well vetted;
it's simply chosen to be a hash that's stable, has a reasonable
avalanche effect, and is <em>different</em> from <code>sample_id</code>.
The definition is <code>MOD(ABS(FARM_FINGERPRINT(client_id)), 100) AS subsample_id</code>
which is the same approach we use for choosing <code>id_bucket</code> in Exact MAU tables.</p>
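<p>Because the definition is spelled out above, the same expression can be applied on the fly to <code>telemetry.main</code> when a sub-1% sample is needed outside the history retained in <code>main_1pct</code>; a sketch:</p>
<pre><code class="language-sql">-- Sketch: 0.01% sample computed on the fly against telemetry.main,
-- mirroring sample_id = 0 plus subsample_id = 0 in main_1pct.
SELECT
  COUNT(*) AS n
FROM
  mozdata.telemetry.main
WHERE
  sample_id = 0
  AND MOD(ABS(FARM_FINGERPRINT(client_id)), 100) = 0
  AND DATE(submission_timestamp) = '2021-01-01'
</code></pre>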
<h2 id="nightly-main-ping-data-telemetrymain_nightly"><a class="header" href="#nightly-main-ping-data-telemetrymain_nightly">Nightly Main Ping Data: <code>telemetry.main_nightly</code></a></h2>
<p>This table includes only data from the nightly release channel (<code>normalized_channel = 'nightly'</code>).
It includes 6 months of history.</p>
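<p>A minimal sketch of a quick iteration query against this table; it is logically equivalent to filtering <code>telemetry.main</code> on <code>normalized_channel = 'nightly'</code>, but far cheaper to scan:</p>
<pre><code class="language-sql">SELECT
  DATE(submission_timestamp) AS submission_date,
  COUNT(DISTINCT client_id) AS n_clients
FROM
  mozdata.telemetry.main_nightly
WHERE
  DATE(submission_timestamp) = '2021-01-01'
GROUP BY
  submission_date
</code></pre>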
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/main_ping_tables.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="main-summary-deprecated"><a class="header" href="#main-summary-deprecated">Main Summary (deprecated)</a></h1>
<blockquote>
<p><strong>⚠</strong> Since the introduction of BigQuery, we are able to represent the
full <code>main</code> ping structure in a table, available as <code>telemetry.main</code>.
As such, <code>main_summary</code> was discontinued as of 2023-10-05.</p>
</blockquote>
<p>The <code>main_summary</code> table contains one row for each ping.
Each column represents one field from the main ping payload,
though only a subset of all main ping fields are included.
This dataset <strong>does not include most histograms</strong>.</p>
<p>This table is massive, and due to its size, it can be difficult to work with.</p>
<p>Instead, we recommend using the <code>clients_daily</code> or <code>clients_last_seen</code> dataset
where possible.</p>
<p>If you do need to query this table, make use of the <code>sample_id</code> field and
limit to a short submission date range.</p>
<h2 id="table-of-contents-11"><a class="header" href="#table-of-contents-11">Table of Contents</a></h2>
<ul>
<li><a href="datasets/batch_view/main_summary/reference.html#accessing-the-data">Accessing the Data</a></li>
<li><a href="datasets/batch_view/main_summary/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/batch_view/main_summary/reference.html#example-queries">Example Queries</a></li>
<li><a href="datasets/batch_view/main_summary/reference.html#sampling">Sampling</a></li>
<li><a href="datasets/batch_view/main_summary/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/batch_view/main_summary/reference.html#schema">Schema</a></li>
<li><a href="datasets/batch_view/main_summary/reference.html#time-formats">Time formats</a></li>
<li><a href="datasets/batch_view/main_summary/reference.html#user-preferences">User Preferences</a></li>
</ul>
</li>
<li><a href="datasets/batch_view/main_summary/reference.html#code-reference">Code Reference</a></li>
</ul>
<h2 id="accessing-the-data-4"><a class="header" href="#accessing-the-data-4">Accessing the Data</a></h2>
<p>The <code>main_summary</code> table is accessible through STMO.
See <a href="https://sql.telemetry.mozilla.org/queries/4201/source"><code>STMO#4201</code></a> for an example.</p>
<h2 id="data-reference-5"><a class="header" href="#data-reference-5">Data Reference</a></h2>
<h3 id="example-queries-5"><a class="header" href="#example-queries-5">Example Queries</a></h3>
<p>Compare the search volume for different search source values:</p>
<pre><code class="language-sql">WITH search_data AS (
SELECT
s.source AS search_source,
s.count AS search_count
FROM
telemetry.main_summary
CROSS JOIN UNNEST(search_counts) AS s
WHERE
submission_date_s3 = '2019-11-11'
AND sample_id = 42
AND search_counts IS NOT NULL
)
SELECT
search_source,
sum(search_count) as total_searches
FROM search_data
GROUP BY search_source
ORDER BY sum(search_count) DESC
</code></pre>
<h3 id="sampling"><a class="header" href="#sampling">Sampling</a></h3>
<p>The <code>main_summary</code> dataset contains one record for each <code>main</code> ping
as long as the record contains a non-null value for
<code>documentId</code>, <code>submissionDate</code>, and <code>Timestamp</code>.
We do not ever expect nulls for these fields.</p>
<h3 id="scheduling-10"><a class="header" href="#scheduling-10">Scheduling</a></h3>
<p>This dataset is updated daily via the <a href="https://github.com/mozilla/telemetry-airflow">telemetry-airflow</a> infrastructure.
The DAG is defined in
<a href="https://github.com/mozilla/bigquery-etl/blob/master/dags/bqetl_main_summary.py"><code>dags/bqetl_main_summary.py</code></a></p>
<h3 id="schema-4"><a class="header" href="#schema-4">Schema</a></h3>
<p>As of 2019-11-28, the current version of the <code>main_summary</code> dataset is <code>v4</code>.</p>
<p>For more detail on where these fields come from in the
<a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/main-ping.html">raw data</a>,
please look <a href="https://github.com/mozilla/bigquery-etl/tree/ad84a15d580333b41d36cfe8331e51238f3bafa1/sql/moz-fx-data-shared-prod/telemetry_derived/main_summary_v4">in the <code>main_summary</code> ETL code</a>.</p>
<p>Most of the fields are simple scalar values, with a few notable exceptions:</p>
<ul>
<li>The <code>search_count</code> field is an array of structs, each item in the array representing
a 3-tuple of (<code>engine</code>, <code>source</code>, <code>count</code>). The <code>engine</code> field represents the name of
the search engine against which the searches were done. The <code>source</code> field represents
the part of the Firefox UI that was used to perform the search. It contains values
such as <code>abouthome</code>, <code>urlbar</code>, and <code>searchbar</code>. The <code>count</code> field contains the number
of searches performed against this engine+source combination during that subsession.
Any of the fields in the struct may be null (for example if the search key did not
match the expected pattern, or if the count was non-numeric).</li>
<li>The <code>loop_activity_counter</code> field is a simple struct containing inner fields for each
expected value of the <code>LOOP_ACTIVITY_COUNTER</code> Enumerated Histogram. Each inner field
is a count for that histogram bucket.</li>
<li>The <code>popup_notification_stats</code> field is a map of <code>String</code> keys to struct values,
each field in the struct being a count for the expected values of the
<code>POPUP_NOTIFICATION_STATS</code> Keyed Enumerated Histogram.</li>
<li>The <code>places_bookmarks_count</code> and <code>places_pages_count</code> fields contain the <strong>mean</strong>
value of the corresponding Histogram, which can be interpreted as the average number
of bookmarks or pages in a given subsession.</li>
<li>The <code>active_addons</code> field contains an array of structs, one for each entry in
the <code>environment.addons.activeAddons</code> section of the payload. More detail in
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1290181">Bug 1290181</a>.</li>
<li>The <code>disabled_addons_ids</code> field contains an array of strings, one for each entry in
the <code>payload.addonDetails</code> which is not already reported in the <code>environment.addons.activeAddons</code>
section of the payload. More detail in
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1390814">Bug 1390814</a>.
Please note that while using this field is generally OK, this was introduced to support
the <a href="https://github.com/mozilla/taar/pulls">TAAR</a> project and you should not count on it
in the future. The field can stay in the <code>main_summary</code>, but we might need to slightly change
the ping structure to something better than <code>payload.addonDetails</code>.</li>
<li>The <code>theme</code> field contains a single struct in the same shape as the items in the
<code>active_addons</code> array. It contains information about the currently active browser
theme.</li>
<li>The <code>user_prefs</code> field contains a struct with values for preferences of interest.</li>
<li>The <code>events</code> field contains an array of event structs.</li>
<li>Dynamically-included histogram fields are present as key-&gt;value maps,
or key-&gt;(key-&gt;value) nested maps for keyed histograms.</li>
</ul>
<h3 id="time-formats"><a class="header" href="#time-formats">Time formats</a></h3>
<p>Columns in <code>main_summary</code> may use one of a handful of time formats with different precisions:</p>
<div class="table-wrapper"><table><thead><tr><th>Column Name</th><th>Origin</th><th>Description</th><th>Example</th><th>Spark</th><th>Presto</th></tr></thead><tbody>
<tr><td><code>timestamp</code></td><td>stamped at ingestion</td><td>nanoseconds since epoch</td><td><code>1504689165972861952</code></td><td><code>from_unixtime(timestamp/1e9)</code></td><td><code>from_unixtime(timestamp/1e9)</code></td></tr>
<tr><td><code>submission_date_s3</code></td><td>derived from timestamp</td><td><code>YYYYMMDD</code> date string of timestamp in UTC</td><td><code>20170906</code></td><td><code>from_unixtime(unix_timestamp(submission_date, 'yyyyMMdd'))</code></td><td><code>date_parse(submission_date, '%Y%m%d')</code></td></tr>
<tr><td><code>client_submission_date</code></td><td>derived from HTTP header: <code>Fields[Date]</code></td><td>HTTP date header string sent with the ping</td><td><code>Tue, 27 Sep 2016 16:28:23 GMT</code></td><td><code>unix_timestamp(client_submission_date, 'EEE, dd M yyyy HH:mm:ss zzz')</code></td><td><code>date_parse(substr(client_submission_date, 1, 25), '%a, %d %b %Y %H:%i:%s')</code></td></tr>
<tr><td><code>creation_date</code></td><td><code>creationDate</code></td><td>time of ping creation ISO8601 at UTC+0</td><td><code>2017-09-06T08:21:36.002Z</code></td><td><code>to_timestamp(creation_date, &quot;yyyy-MM-dd'T'HH:mm:ss.SSSXXX&quot;)</code></td><td><code>from_iso8601_timestamp(creation_date) AT TIME ZONE 'GMT'</code></td></tr>
<tr><td><code>timezone_offset</code></td><td><code>info.timezoneOffset</code></td><td>timezone offset in minutes</td><td><code>120</code></td><td></td><td></td></tr>
<tr><td><code>subsession_start_date</code></td><td><code>info.subsessionStartDate</code></td><td>hourly precision, ISO8601 date in local time</td><td><code>2017-09-06T00:00:00.0+02:00</code></td><td></td><td><code>from_iso8601_timestamp(subsession_start_date) AT TIME ZONE 'GMT'</code></td></tr>
<tr><td><code>subsession_length</code></td><td><code>info.subsessionLength</code></td><td>subsession length in seconds</td><td><code>599</code></td><td></td><td><code>date_add('second', subsession_length, subsession_start_date)</code></td></tr>
<tr><td><code>profile_creation_date</code></td><td><code>environment.profile.creationDate</code></td><td>days since epoch</td><td><code>15,755</code></td><td></td><td><code>from_unixtime(profile_creation_date * 86400)</code></td></tr>
</tbody></table>
</div>
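<p>For analyses run in BigQuery, a hedged sketch of equivalent conversions; the column types are assumed from the table above, so verify them against the live table schema:</p>
<pre><code class="language-sql">-- Hedged sketch: column types assumed from the time-formats table above.
SELECT
  -- `timestamp`: nanoseconds since epoch, converted to a TIMESTAMP
  TIMESTAMP_MICROS(DIV(`timestamp`, 1000)) AS ingestion_time,
  -- profile_creation_date: days since epoch, converted to a DATE
  DATE_FROM_UNIX_DATE(CAST(profile_creation_date AS INT64)) AS profile_created,
  subsession_start_date,
  subsession_length
FROM
  telemetry.main_summary
WHERE
  submission_date_s3 = '2019-11-11'
  AND sample_id = 42
LIMIT
  10
</code></pre>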
<h3 id="user-preferences"><a class="header" href="#user-preferences">User Preferences</a></h3>
<p>These are added in the <a href="https://github.com/mozilla/bigquery-etl/blob/ad84a15d580333b41d36cfe8331e51238f3bafa1/sql/moz-fx-data-shared-prod/telemetry_derived/main_summary_v4/part1.sql#L476-L501">Main Summary ETL code</a>.
They must be available in the <a href="http://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/environment.html">ping environment</a> to be included here.</p>
<p>Once added, they will show as top-level fields, with the string <code>user_pref</code> prepended.
For example, <code>dom.ipc.processCount</code> becomes <code>user_pref_dom_ipc_processcount</code>.</p>
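<p>A minimal sketch of querying one of these prepended columns, using the example preference named above (<code>main_summary</code> is deprecated, so this is shown for historical reference only):</p>
<pre><code class="language-sql">SELECT
  user_pref_dom_ipc_processcount,
  COUNT(*) AS n_pings
FROM
  telemetry.main_summary
WHERE
  submission_date_s3 = '2019-11-11'
  AND sample_id = 42
GROUP BY
  user_pref_dom_ipc_processcount
ORDER BY
  n_pings DESC
</code></pre>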
<h2 id="code-reference-8"><a class="header" href="#code-reference-8">Code Reference</a></h2>
<p>This dataset is generated by <a href="https://github.com/mozilla/bigquery-etl/tree/ad84a15d580333b41d36cfe8331e51238f3bafa1/sql/moz-fx-data-shared-prod/telemetry_derived/main_summary_v4">bigquery-etl</a>.
Refer to this repository for information on how to run or augment the dataset.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/batch_view/main_summary/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="socorro-crash-reports"><a class="header" href="#socorro-crash-reports">Socorro Crash Reports</a></h1>
<ul>
<li><a href="datasets/other/socorro_crash/reference.html#introduction">Introduction</a></li>
<li><a href="datasets/other/socorro_crash/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/other/socorro_crash/reference.html#example">Example</a></li>
<li><a href="datasets/other/socorro_crash/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/other/socorro_crash/reference.html#schema">Schema</a></li>
<li><a href="datasets/other/socorro_crash/reference.html#code-reference">Code Reference</a></li>
</ul>
</li>
</ul>
<h2 id="introduction-9"><a class="header" href="#introduction-9">Introduction</a></h2>
<p>Public crash statistics for Firefox are available through the Data Platform in a <code>socorro_crash</code> dataset.
The crash data in <a href="https://wiki.mozilla.org/Socorro">Socorro</a> is sanitized and made available to STMO.
A nightly import job converts batches of JSON documents into a columnar format using the associated JSON Schema.</p>
<h2 id="data-reference-6"><a class="header" href="#data-reference-6">Data Reference</a></h2>
<h3 id="example-1"><a class="header" href="#example-1">Example</a></h3>
<p>The dataset can be queried using SQL.
For example, we can aggregate the number of crashes and summarize up-time by date and reason.</p>
<pre><code class="language-sql">SELECT crash_date,
reason,
count(*) as n_crashes,
avg(uptime) as avg_uptime,
stddev(uptime) as stddev_uptime,
approx_percentile(uptime, ARRAY [0.25, 0.5, 0.75]) as qntl_uptime
FROM socorro_crash
WHERE crash_date='20180520'
GROUP BY 1,
2
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/53884/source"><code>STMO#53884</code></a></p>
<h3 id="scheduling-11"><a class="header" href="#scheduling-11">Scheduling</a></h3>
<p>The job is scheduled on a nightly basis on Airflow.
The DAG is available under <a href="https://github.com/mozilla/telemetry-airflow/blob/master/dags/socorro_import.py"><code>mozilla/telemetry-airflow:/dags/socorro_import.py</code></a>.</p>
<h3 id="schema-5"><a class="header" href="#schema-5">Schema</a></h3>
<p>The source schema is available on the <a href="https://raw.githubusercontent.com/mozilla-services/socorro/main/socorro/schemas/telemetry_socorro_crash.json"><code>mozilla-services/socorro</code> GitHub repository</a>.
This schema is transformed into a Spark-SQL structure and serialized to parquet after transforming column names from <code>camelCase</code> to <code>snake_case</code>.</p>
<h3 id="code-reference-9"><a class="header" href="#code-reference-9">Code Reference</a></h3>
<p>The code is <a href="https://github.com/mozilla-services/data-pipeline/blob/master/reports/socorro_import/ImportCrashData.ipynb">a notebook in the <code>mozilla-services/data-pipeline</code> repository</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/other/socorro_crash/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="ssl-ratios"><a class="header" href="#ssl-ratios">SSL Ratios</a></h1>
<ul>
<li><a href="datasets/other/ssl/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/other/ssl/reference.html#content">Content</a></li>
<li><a href="datasets/other/ssl/reference.html#background-and-caveats">Background and Caveats</a></li>
<li><a href="datasets/other/ssl/reference.html#accessing-the-data">Accessing the Data</a></li>
</ul>
</li>
<li><a href="datasets/other/ssl/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/other/ssl/reference.html#combining-rows">Combining Rows</a></li>
<li><a href="datasets/other/ssl/reference.html#schema">Schema</a></li>
<li><a href="datasets/other/ssl/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/other/ssl/reference.html#public-data">Public Data</a></li>
<li><a href="datasets/other/ssl/reference.html#code-reference">Code Reference</a></li>
</ul>
</li>
</ul>
<h1 id="introduction-10"><a class="header" href="#introduction-10">Introduction</a></h1>
<p>The public SSL dataset publishes the percentage of page loads Firefox users have performed
that were conducted over SSL. This dataset is used to produce graphs like
<a href="https://letsencrypt.org/stats/">Let's Encrypt's</a> to determine SSL adoption on the Web
over time.</p>
<h4 id="content-1"><a class="header" href="#content-1">Content</a></h4>
<p>The public SSL dataset is a table where each row is a distinct set of dimensions, with their
associated SSL statistics. The dimensions are <code>submission_date</code>, <code>os</code>, and <code>country</code>. The
statistics are <code>reporting_ratio</code>, <code>normalized_pageloads</code>, and <code>ratio</code>.</p>
<h4 id="background-and-caveats-2"><a class="header" href="#background-and-caveats-2">Background and Caveats</a></h4>
<ul>
<li>We're using normalized values in <code>normalized_pageloads</code> to obscure absolute page load counts.</li>
<li>This is across the entirety of release, not per-version, because we're looking at Web health,
not Firefox user health.</li>
<li>Any dimension tuple (any given combination of <code>submission_date</code>, <code>os</code>, and <code>country</code>) with
fewer than 5000 page loads is omitted from the dataset.</li>
<li>This is hopefully just a temporary dataset to stopgap release aggregates going away
until we can come up with a better way to publicly publish datasets.</li>
</ul>
<h4 id="accessing-the-data-5"><a class="header" href="#accessing-the-data-5">Accessing the Data</a></h4>
<p>For details on accessing the data, please look at
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1414839">bug 1414839</a>.</p>
<h1 id="data-reference-7"><a class="header" href="#data-reference-7">Data Reference</a></h1>
<h2 id="combining-rows"><a class="header" href="#combining-rows">Combining Rows</a></h2>
<p>This is a dataset of ratios. You can't combine ratios if they have different bases. For example,
if 50% of 10 loads (5 loads) were SSL and 5% of 20 loads (1 load) were SSL, you cannot calculate
that 20% (6 loads) of the total loads (30 loads) were SSL unless you know that the 50% was for
10 and the 5% was for 20.</p>
<p>If you're reluctant, for product reasons, to share the numbers 10 and 20, this gets tricky.</p>
<p>So what we've done is normalize the whole batch of 30 down to 1.0. That means we tell you that
50% of one-third of the loads (0.333...) was SSL and 5% of the other two-thirds of the loads
(0.666...) was SSL. Then you can figure out the overall 20% figure by this calculation:</p>
<p><code>(0.5 * 0.333 + 0.05 * 0.666) / (0.333 + 0.666) = 0.2</code></p>
<p>Notice that you must divide by the sum of the normalized pageloads (0.333 + 0.666) in order to
&quot;unnormalize&quot; the result into the true ratio. (In this toy example we're summing across all
dimensions so the sum of all included normalized pageloads was 1.0.)</p>
<p>For this dataset the same system applies. To combine rows' ratios (to, for example, see what the
SSL ratio was across all <code>os</code> and <code>country</code> for a given <code>submission_date</code>), you must first
multiply them by the rows' <code>normalized_pageloads</code> values. Then you must divide them by the sum
of the rows' <code>normalized_pageloads</code> values to &quot;unnormalize&quot; and get the true ratio.</p>
<p>Or, in JavaScript:</p>
<pre><code class="language-js">let rows = query_result.data.rows;
let normalizedRatioForDateInQuestion = rows
.filter((row) =&gt; row.submission_date == dateInQuestion)
.reduce((row, acc) =&gt; acc + row.normalized_pageloads * row.ratio, 0);
let normalizedPageloadSumForDateInQuestion = rows
.filter((row) =&gt; row.submission_date == dateInQuestion)
.reduce((row, acc) =&gt; acc + row.normalized_pageloads, 0);
let trueRatio = normalizedRatioForDateInQuestion / normalizedPageloadSumForDateInQuestion;
</code></pre>
<p>Remember that the normalization in this dataset is done across all dimensions
(<code>os</code>, <code>country</code>) per <code>submission_date</code>. Summing <code>ratio</code> (or <code>reporting_ratio</code>)
across different <code>submission_date</code> values will not give correct information.</p>
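<p>The same &quot;unnormalize&quot; calculation can be written in SQL against the public BigQuery table listed under Public Data below; a sketch that combines all <code>os</code> and <code>country</code> rows per day:</p>
<pre><code class="language-sql">SELECT
  submission_date,
  -- multiply each row's ratio by its normalized_pageloads, then divide by
  -- the sum of normalized_pageloads to recover the true ratio
  SAFE_DIVIDE(
    SUM(normalized_pageloads * ratio),
    SUM(normalized_pageloads)
  ) AS overall_ssl_ratio
FROM
  `mozilla-public-data`.telemetry_derived.ssl_ratios_v1
GROUP BY
  submission_date
ORDER BY
  submission_date
</code></pre>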
<h2 id="schema-6"><a class="header" href="#schema-6">Schema</a></h2>
<p>The data is output in STMO API format:</p>
<pre><code>&quot;query_result&quot;: {
&quot;retrieved_at&quot;: &lt;timestamp&gt;,
&quot;query_hash&quot;: &lt;hash&gt;,
&quot;query&quot;: &lt;SQL&gt;,
&quot;runtime&quot;: &lt;number of seconds&gt;,
&quot;id&quot;: &lt;an id&gt;,
&quot;data_source_id&quot;: 26, // Athena
&quot;data_scanned&quot;: &lt;some really large number, as a string&gt;,
&quot;data&quot;: {
&quot;data_scanned&quot;: &lt;some really large number, as a number&gt;,
&quot;columns&quot;: [
{&quot;friendly_name&quot;: &quot;submission_date&quot;, &quot;type&quot;: &quot;datetime&quot;, &quot;name&quot;: &quot;submission_date&quot;},
{&quot;friendly_name&quot;: &quot;os&quot;, &quot;type&quot;: &quot;string&quot;, &quot;name&quot;: &quot;os&quot;},
{&quot;friendly_name&quot;: &quot;country&quot;, &quot;type&quot;: &quot;string&quot;, &quot;name&quot;: &quot;country&quot;},
{&quot;friendly_name&quot;: &quot;reporting_ratio&quot;, &quot;type&quot;: &quot;float&quot;, &quot;name&quot;: &quot;reporting_ratio&quot;},
{&quot;friendly_name&quot;: &quot;normalized_pageloads&quot;, &quot;type&quot;: &quot;float&quot;, &quot;name&quot;: &quot;normalized_pageloads&quot;},
{&quot;friendly_name&quot;: &quot;ratio&quot;, &quot;type&quot;: &quot;float&quot;, &quot;name&quot;: &quot;ratio&quot;}
],
&quot;rows&quot;: [
{
&quot;submission_date&quot;: &quot;2017-10-24T00:00:00&quot;, // date string, day resolution
&quot;os&quot;: &quot;Windows_NT&quot;, // operating system family of the clients reporting the pageloads. One of &quot;Windows_NT&quot;, &quot;Linux&quot;, or &quot;Darwin&quot;.
&quot;country&quot;: &quot;CZ&quot;, // ISO 3166 two-character country code, or &quot;??&quot; if we have no idea. Determined by performing a geo-IP lookup of the clients that submitted the pings.
&quot;reporting_ratio&quot;: 0.006825266611977031, // the ratio of pings that reported any pageloads at all. A number between 0 and 1. See [bug 1413258](https://bugzilla.mozilla.org/show_bug.cgi?id=1413258).
&quot;normalized_pageloads&quot;: 0.00001759145263985348, // the proportion of total pageloads in the dataset that are represented by this row. Provided to allow combining rows. A number between 0 and 1.
&quot;ratio&quot;: 0.6916961976822144 // the ratio of the pageloads that were performed over SSL. A number between 0 and 1.
}, ...
]
}
}
</code></pre>
<h2 id="scheduling-12"><a class="header" href="#scheduling-12">Scheduling</a></h2>
<p>The dataset updates every 24 hours.</p>
<h2 id="public-data-1"><a class="header" href="#public-data-1">Public Data</a></h2>
<p>The data is publicly available on BigQuery: <code>mozilla-public-data.telemetry_derived.ssl_ratios_v1</code>.
Data can also be accessed through the public HTTP endpoint: <a href="https://public-data.telemetry.mozilla.org/api/v1/tables/telemetry_derived/ssl_ratios/v1/files">https://public-data.telemetry.mozilla.org/api/v1/tables/telemetry_derived/ssl_ratios/v1/files</a></p>
<h2 id="code-reference-10"><a class="header" href="#code-reference-10">Code Reference</a></h2>
<p>You can find the query that generates the SSL dataset at
<a href="https://sql.telemetry.mozilla.org/queries/49323/source#table"><code>STMO#49323</code></a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/other/ssl/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="telemetry-aggregates-reference"><a class="header" href="#telemetry-aggregates-reference">Telemetry Aggregates Reference</a></h1>
<ul>
<li><a href="datasets/batch_view/telemetry_aggregates/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/batch_view/telemetry_aggregates/reference.html#rows-and-columns">Rows and Columns</a></li>
<li><a href="datasets/batch_view/telemetry_aggregates/reference.html#accessing-the-data">Accessing the Data</a></li>
</ul>
</li>
<li><a href="datasets/batch_view/telemetry_aggregates/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/batch_view/telemetry_aggregates/reference.html#example-queries">Example Queries</a></li>
<li><a href="datasets/batch_view/telemetry_aggregates/reference.html#sampling">Sampling</a>
<ul>
<li><a href="datasets/batch_view/telemetry_aggregates/reference.html#invalid-pings">Invalid Pings</a></li>
</ul>
</li>
<li><a href="datasets/batch_view/telemetry_aggregates/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/batch_view/telemetry_aggregates/reference.html#schema">Schema</a></li>
</ul>
</li>
</ul>
<h1 id="introduction-11"><a class="header" href="#introduction-11">Introduction</a></h1>
<p>The <code>telemetry_aggregates</code> dataset is a daily aggregation of telemetry pings that
aggregates histograms across a set of dimensions.</p>
<h4 id="rows-and-columns"><a class="header" href="#rows-and-columns">Rows and Columns</a></h4>
<p>There is one column for each dimension and one for the histogram; each row
represents a distinct combination of dimensions, along with its associated histogram.</p>
<h4 id="accessing-the-data-6"><a class="header" href="#accessing-the-data-6">Accessing the Data</a></h4>
<p>This dataset is accessible via STMO by selecting from <code>telemetry_aggregates</code>.</p>
<p>The data is stored as a parquet table in S3 at the following address.</p>
<pre><code>s3://telemetry-parquet/aggregates_poc/v1/
</code></pre>
<h1 id="data-reference-8"><a class="header" href="#data-reference-8">Data Reference</a></h1>
<h2 id="example-queries-6"><a class="header" href="#example-queries-6">Example Queries</a></h2>
<p>Here's an example query that shows the number of pings received per
<code>submission_date</code> for the dimensions provided.</p>
<pre><code class="language-sql">SELECT
submission_date,
SUM(count) AS pings
FROM
telemetry_aggregates
WHERE
channel = 'nightly'
AND metric = 'GC_MS'
AND aggregate_type = 'build_id'
AND period = '201901'
GROUP BY
submission_date
ORDER BY
submission_date
;
</code></pre>
<h2 id="sampling-1"><a class="header" href="#sampling-1">Sampling</a></h2>
<h3 id="invalid-pings"><a class="header" href="#invalid-pings">Invalid Pings</a></h3>
<p>We ignore invalid pings in our processing. Invalid pings are defined as those where:</p>
<ul>
<li>The submission dates are invalid or missing.</li>
<li>The build ID is malformed.</li>
<li>The <code>docType</code> field is missing or unknown.</li>
<li>The build ID is older than a defined cutoff in days.
(See the <code>BUILD_ID_CUTOFFS</code> variable in the
<a href="https://github.com/mozilla/python_mozaggregator/">code</a> for the maximum number of days per channel.)</li>
</ul>
<h2 id="scheduling-13"><a class="header" href="#scheduling-13">Scheduling</a></h2>
<p>The <code>telemetry_aggregates</code> job is run daily, at midnight UTC.
The job is scheduled on <a href="https://github.com/mozilla/telemetry-airflow">Airflow</a>.
The DAG is defined <a href="https://github.com/mozilla/telemetry-airflow/blob/831fe84a36347f440ede4f5a90e0bf83d4fa1e1e/dags/mozaggregator_parquet.py">here</a>.</p>
<h2 id="schema-7"><a class="header" href="#schema-7">Schema</a></h2>
<p>The <code>telemetry_aggregates</code> table has a set of dimensions and a set of
aggregates for those dimensions.</p>
<p>The table is partitioned by the following columns. Filtering on one of
these fields limits the number of rows scanned and can make queries run significantly
faster:</p>
<ul>
<li><code>metric</code> is the name of the metric, like <code>&quot;GC_MS&quot;</code>.</li>
<li><code>aggregate_type</code> is the type of aggregation, either <code>&quot;build_id&quot;</code> or
<code>&quot;submission_date&quot;</code>, representing how this aggregation was grouped.</li>
<li><code>period</code> is a string representing the month in <code>YYYYMM</code> format that a ping
was submitted, like <code>'201901'</code>.</li>
</ul>
<p>The rest of the dimensions are:</p>
<ul>
<li><code>submission_date</code> is the date pings were submitted for a particular aggregate.</li>
<li><code>channel</code> is the channel, like <code>release</code> or <code>beta</code>.</li>
<li><code>version</code> is the program version, like <code>46.0a1</code>.</li>
<li><code>build_id</code> is the <code>YYYYMMDDhhmmss</code> timestamp the program was built, like
<code>20190123192837</code>.</li>
<li><code>application</code> is the program name, like <code>Firefox</code> or <code>Fennec</code>.</li>
<li><code>architecture</code> is the architecture that the program was built for (not
necessarily the one it is running on).</li>
<li><code>os</code> is the name of the OS the program is running on, like <code>Darwin</code> or <code>Windows_NT</code>.</li>
<li><code>os_version</code> is the version of the OS the program is running on.</li>
<li><code>key</code> is the key of a keyed metric. This will be empty if the underlying
metric is not a keyed metric.</li>
<li><code>process_type</code> is the process the histogram was recorded in, like <code>content</code>
or <code>parent</code>.</li>
</ul>
<p>The aggregates are:</p>
<ul>
<li><code>count</code> is the aggregate sum of the number of pings per combination of dimensions.</li>
<li><code>sum</code> is the aggregate sum of the histogram values per combination of dimensions.</li>
<li><code>histogram</code> is the aggregated histogram per combination of dimensions (see the sketch after this list for combining <code>sum</code> and <code>count</code>).</li>
</ul>
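<p>For example, <code>sum</code> and <code>count</code> can be combined to approximate the average per-ping histogram sum for a metric. This is a minimal sketch using the same filters as the example query above:</p>
<pre><code class="language-sql">SELECT
  submission_date,
  SUM(sum) / SUM(count) AS mean_sum_per_ping
FROM
  telemetry_aggregates
WHERE
  channel = 'nightly'
  AND metric = 'GC_MS'
  AND aggregate_type = 'build_id'
  AND period = '201901'
GROUP BY
  submission_date
ORDER BY
  submission_date
;
</code></pre>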
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/batch_view/telemetry_aggregates/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="glam-datasets"><a class="header" href="#glam-datasets">GLAM datasets</a></h1>
<p><a href="https://glam.telemetry.mozilla.org">GLAM</a> aims to answer a majority of the &quot;easy&quot; questions of how a probe or metric has changed over time.
The GLAM aggregation tables are useful for accessing the data that drives GLAM if more exploration is required.
Exploring the GLAM tables could take that a little farther, but still has some limitations.
If you need to dive deeper or aggregate on a field that isn't included here, consider reading <a href="https://docs.telemetry.mozilla.org/cookbooks/main_ping_exponential_histograms.html">Visualizing Percentiles of a Main Ping Exponential Histogram</a>.</p>
<p>The GLAM tables:</p>
<ul>
<li>Are aggregated at the client level, not the submission ping level</li>
<li>Provide a set of dimensions for subsets: channel, OS, process or ping type</li>
<li>Are aggregated by build ID and version</li>
<li>For each aggregation, the distribution and percentiles over time are calculated</li>
<li>Have the last 3 versions of data aggregated every day</li>
<li>Retain data for up to 10 major versions</li>
</ul>
<h2 id="firefox-desktop"><a class="header" href="#firefox-desktop">Firefox Desktop</a></h2>
<h3 id="data-source-table"><a class="header" href="#data-source-table">Data source table</a></h3>
<ul>
<li><code>moz-fx-data-shared-prod.telemetry.client_probe_counts</code></li>
</ul>
<h3 id="data-reference-9"><a class="header" href="#data-reference-9">Data reference</a></h3>
<ul>
<li><code>os</code>: One of Windows, Mac, Linux, or NULL for all OSes</li>
<li><code>app_version</code>: Integer representing the major version</li>
<li><code>app_build_id</code>: The full build ID, or NULL if aggregated by major version</li>
<li><code>channel</code>: One of nightly, beta, or release</li>
<li><code>metric</code>: The name of the metric</li>
<li><code>metric_type</code>: The type of metric, e.g. <code>histogram-enumerated</code></li>
<li><code>key</code>: The key if the metric is a keyed metric</li>
<li><code>process</code>: The process</li>
<li><code>client_agg_type</code>: The type of client aggregation used, e.g. <code>summed_histogram</code></li>
<li><code>agg_type</code>: One of histogram or percentiles representing what data will be in the <code>aggregates</code> column</li>
<li><code>total_users</code>: The number of users that submitted data for the combination of dimensions</li>
<li><code>aggregates</code>: The data as a key/value record, either percentiles or histogram (see the unpacking sketch after the sample query below)</li>
</ul>
<h3 id="sample-query-1"><a class="header" href="#sample-query-1">Sample query</a></h3>
<pre><code class="language-sql">SELECT
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key,
process,
client_agg_type,
agg_type,
total_users,
mozfun.glam.histogram_cast_json(aggregates) AS aggregates
FROM
`moz-fx-data-shared-prod.telemetry.client_probe_counts`
WHERE
metric=&quot;checkerboard_severity&quot;
AND channel=&quot;nightly&quot;
AND os IS NULL
AND process=&quot;parent&quot;
AND app_build_id IS NULL
</code></pre>
<p>Notes:</p>
<ul>
<li>To query all OSes, use: <code>WHERE os IS NULL</code></li>
<li>To query by build ID, use: <code>WHERE app_build_id IS NOT NULL</code></li>
<li>To query by version, use: <code>WHERE app_build_id IS NULL</code></li>
</ul>
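<p>The <code>aggregates</code> column itself is a repeated key/value field. If you want the raw buckets rather than the JSON string produced by <code>mozfun.glam.histogram_cast_json</code>, a sketch like the following can unpack it (this assumes the array-of-structs layout implied by that UDF):</p>
<pre><code class="language-sql">SELECT
  app_version,
  a.key AS bucket,
  a.value AS value
FROM
  `moz-fx-data-shared-prod.telemetry.client_probe_counts`
CROSS JOIN
  UNNEST(aggregates) AS a
WHERE
  metric = 'checkerboard_severity'
  AND channel = 'nightly'
  AND os IS NULL
  AND process = 'parent'
  AND app_build_id IS NULL
</code></pre>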
<h2 id="firefox-for-android"><a class="header" href="#firefox-for-android">Firefox for Android</a></h2>
<h3 id="data-source-tables"><a class="header" href="#data-source-tables">Data source tables</a></h3>
<ul>
<li><code>org_mozilla_fenix_glam_release__view_probe_counts_v1</code></li>
<li><code>org_mozilla_fenix_glam_beta__view_probe_counts_v1</code></li>
<li><code>org_mozilla_fenix_glam_nightly__view_probe_counts_v1</code></li>
</ul>
<h2 id="data-reference-10"><a class="header" href="#data-reference-10">Data reference</a></h2>
<ul>
<li><code>os</code>: Just &quot;Android&quot; for now</li>
<li><code>app_version</code>: Integer representing the major version</li>
<li><code>app_build_id</code>: The full build ID, or &quot;*&quot; if aggregated by major version</li>
<li><code>channel</code>: Always &quot;*&quot;, use the different source tables to select a channel</li>
<li><code>metric</code>: The name of the metric</li>
<li><code>metric_type</code>: The type of metric, e.g. <code>timing_distribution</code></li>
<li><code>key</code>: The key if the metric is a keyed metric</li>
<li><code>ping_type</code>: The type of ping, or &quot;*&quot; for all ping types</li>
<li><code>client_agg_type</code>: The type of client aggregation used, e.g. <code>summed_histogram</code></li>
<li><code>agg_type</code>: One of histogram or percentiles representing what data will be in the <code>aggregates</code> column</li>
<li><code>total_users</code>: The number of users that submitted data for the combination of dimensions</li>
<li><code>aggregates</code>: The data as a key/value record, either percentiles or histogram</li>
</ul>
<h3 id="sample-query-2"><a class="header" href="#sample-query-2">Sample query</a></h3>
<pre><code class="language-sql">SELECT
ping_type,
os,
app_version,
app_build_id,
metric,
metric_type,
key,
client_agg_type,
agg_type,
total_users,
mozfun.glam.histogram_cast_json(aggregates) AS aggregates,
FROM
`moz-fx-data-shared-prod.glam_etl.org_mozilla_fenix_glam_release__view_probe_counts_v1`
WHERE
metric=&quot;performance_time_dom_complete&quot;
AND os=&quot;Android&quot;
AND ping_type=&quot;*&quot;
AND app_build_id!=&quot;*&quot;
</code></pre>
<p>Notes:</p>
<ul>
<li>To query all ping types, use: <code>WHERE ping_type = &quot;*&quot;</code></li>
<li>To query by build ID, use: <code>WHERE app_build_id != &quot;*&quot;</code></li>
<li>To query by version, use: <code>WHERE app_build_id = &quot;*&quot;</code></li>
</ul>
<h2 id="glam-intermediate-tables"><a class="header" href="#glam-intermediate-tables">GLAM Intermediate Tables</a></h2>
<p>In addition to the above tables, the GLAM ETL produces intermediate tables that can be useful outside of the GLAM ETL in some cases.
These tables include the client ID and can therefore be joined with other tables to filter on client-based data (e.g. specific hardware).</p>
<h3 id="firefox-desktop-1"><a class="header" href="#firefox-desktop-1">Firefox Desktop</a></h3>
<p>Data sources:</p>
<ul>
<li><code>moz-fx-data-shared-prod.telemetry.clients_daily_histogram_aggregates</code></li>
<li><code>moz-fx-data-shared-prod.telemetry.clients_daily_scalar_aggregates</code></li>
</ul>
<p>These tables are:</p>
<ul>
<li>Preprocessed from main telemetry into intermediate data with one row per client per metric per day, which is then aggregated, normalizing across clients (see the join sketch after this list).</li>
<li>Clients-daily aggregates, analogous to <code>clients_daily</code>, with:
<ul>
<li>all metrics aggregated</li>
<li>each scalar includes min, max, average, sum, and count aggregations</li>
<li>each histogram aggregated over all client data per day</li>
<li>each date is further aggregated over the dimensions: channel, os, version, build ID</li>
</ul>
</li>
</ul>
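<p>Below is a minimal sketch of such a join, counting clients whose daily histogram aggregates come from machines with at least 8 GB of RAM. The <code>memory_mb</code> field and the threshold are illustrative assumptions, not part of the GLAM ETL:</p>
<pre><code class="language-sql">-- Sketch: count clients per day in the intermediate histogram aggregates
-- that also report at least 8 GB of RAM in clients_daily.
-- memory_mb and the 8192 threshold are illustrative assumptions.
SELECT
  agg.submission_date,
  COUNT(DISTINCT agg.client_id) AS clients
FROM
  `moz-fx-data-shared-prod.telemetry.clients_daily_histogram_aggregates` AS agg
JOIN
  `moz-fx-data-shared-prod.telemetry.clients_daily` AS cd
ON
  agg.client_id = cd.client_id
  AND agg.submission_date = cd.submission_date
WHERE
  agg.submission_date = DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY)
  AND cd.memory_mb &gt;= 8192
GROUP BY
  agg.submission_date
</code></pre>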
<h2 id="etl-pipeline"><a class="header" href="#etl-pipeline">ETL Pipeline</a></h2>
<h3 id="scheduling-14"><a class="header" href="#scheduling-14">Scheduling</a></h3>
<p>GLAM is scheduled to run daily via Airflow. There are separate ETL pipelines for computing GLAM datasets for <a href="https://workflow.telemetry.mozilla.org/dags/glam/grid">Firefox Desktop legacy</a>, <a href="https://workflow.telemetry.mozilla.org/dags/glam_fenix/grid">Fenix</a> and <a href="https://workflow.telemetry.mozilla.org/dags/glam_fog/grid">Firefox on Glean</a>.</p>
<h3 id="source-code"><a class="header" href="#source-code">Source Code</a></h3>
<p>The ETL code base lives in the <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl repository</a> and is partially generated. The scripts for generating ETL queries for Firefox Desktop Legacy currently live <a href="https://github.com/mozilla/bigquery-etl/tree/main/script/glam">here</a> while the GLAM logic for Glean apps lives <a href="https://github.com/mozilla/bigquery-etl/tree/main/bigquery_etl/glam">here</a>.</p>
<h3 id="steps-1"><a class="header" href="#steps-1">Steps</a></h3>
<p>GLAM has a separate set of steps and intermediate tables to aggregate scalar and histogram probes.</p>
<h4 id="latest_versions"><a class="header" href="#latest_versions"><code>latest_versions</code></a></h4>
<ul>
<li>This task pulls in the most recent version for each channel from https://product-details.mozilla.org/1.0/firefox_versions.json</li>
</ul>
<h4 id="clients_daily_histogram_aggregates_process"><a class="header" href="#clients_daily_histogram_aggregates_process"><code>clients_daily_histogram_aggregates_&lt;process&gt;</code></a></h4>
<ul>
<li>The set of steps that load data to this table are divided into different processes (<code>parent</code>, <code>content</code>, <code>gpu</code>) plus a keyed step for keyed histograms.</li>
<li>The parent job creates or overwrites the partition corresponding to the <code>logical_date</code>, and other processes append data to that partition.</li>
<li>The process uses <code>telemetry.buildhub2</code> to select rows with valid <code>build_ids</code>.</li>
<li>Aggregations are done per client, per day, and include a line for each <code>submission_date</code>, <code>client_id</code>, <code>os</code>, <code>app_version</code>, <code>build_id</code>, and <code>channel</code>.</li>
<li>The aggregation is done by adding histogram values with the same key for the dimensions listed above.</li>
<li>The queries for the different steps are generated and run as part of each step.</li>
<li>The &quot;keyed&quot; step includes all Keyed Histogram probes, regardless of process (<code>parent</code>, <code>content</code>, <code>gpu</code>).</li>
<li>As a result of the subdivisions in this step, it generates different rows for each process and keyed/non-keyed metric, which will be grouped together in the <code>clients_histogram_aggregates</code> step.</li>
<li>Clients on the release channel running the Windows operating system are sampled to reduce the data size.</li>
<li>The partitions are set to expire after 7 days.</li>
</ul>
<h4 id="clients_histogram_aggregates_new"><a class="header" href="#clients_histogram_aggregates_new"><code>clients_histogram_aggregates_new</code></a></h4>
<ul>
<li>This step groups together all rows that have the same <code>submission_date</code> and <code>logical_date</code> from different processes and keyed and non-keyed sources, and combines them into a single row in the <code>histogram_aggregates</code> column. It sums the histogram values with the same key.</li>
<li>This process is only applied to the last three versions.</li>
<li>The table is overwritten at every execution of this step.</li>
</ul>
<h4 id="clients_histogram_aggregates"><a class="header" href="#clients_histogram_aggregates"><code>clients_histogram_aggregates</code></a></h4>
<ul>
<li>New entries from <code>clients_histogram_aggregates_new</code> are merged with the last 3 versions from the previous day's partition and written to the current day's partition.</li>
<li>The most recent partition contains the current snapshot of the last three versions of data.</li>
<li>The partitions expire in 7 days.</li>
</ul>
<h4 id="clients_histogram_buckets_counts"><a class="header" href="#clients_histogram_buckets_counts"><code>clients_histogram_buckets_counts</code></a></h4>
<ul>
<li>This process starts by creating wildcards for <code>os</code> and <code>app_build_id</code> which are needed for aggregating values across os and build IDs later on.</li>
<li>It then filters out builds that have less than 0.5% of WAU (which can vary per channel). This is referenced in https://github.com/mozilla/glam/issues/1575#issuecomment-946880387.</li>
<li>The process then normalizes histograms per client - it sets the sum of histogram values for each client for a given metric to 1.</li>
<li>Finally, it removes the <code>client_id</code> dimension by aggregating all histograms for a given metric and adding the clients' histogram values.</li>
</ul>
<h4 id="clients_histogram_probe_counts"><a class="header" href="#clients_histogram_probe_counts"><code>clients_histogram_probe_counts</code></a></h4>
<ul>
<li>This process generates buckets - which can be linear or exponential - based on the <code>metric_type</code>.</li>
<li>It then aggregates metrics per wildcards (<code>os</code>, <code>app_build_id</code>).</li>
<li>Finally, it rebuilds histograms using the Dirichlet Distribution, normalized using the number of clients that contributed to that histogram in the <code>clients_histogram_buckets_counts</code> step.</li>
</ul>
<h4 id="histogram_percentiles"><a class="header" href="#histogram_percentiles"><code>histogram_percentiles</code></a></h4>
<ul>
<li>Uses the <code>mozfun.glam.percentile</code> UDF to build histogram percentiles ranging from 0.1 to 99.9.</li>
</ul>
<hr />
<h4 id="clients_daily_scalar_aggregates"><a class="header" href="#clients_daily_scalar_aggregates"><code>clients_daily_scalar_aggregates</code></a></h4>
<ul>
<li>The set of steps that load data to this table are divided into non-keyed <code>scalar</code>, <code>keyed_boolean</code> and <code>keyed_scalar</code>. The non-keyed scalar job creates or overwrites the partition corresponding to the <code>logical_date</code>, and other processes append data to that partition.</li>
<li>The process uses <code>telemetry.buildhub2</code> to select rows with valid <code>build_ids</code>.</li>
<li>Aggregations are done per client, per day and include a line for each <code>client</code>, <code>os</code>, <code>app_version</code>, <code>build_id</code>, and <code>channel</code>.</li>
<li>The queries for the different steps are generated and run as part of each step. All steps include probes regardless of process (<code>parent</code>, <code>content</code>, <code>gpu</code>).</li>
<li>As a result of the subdivisions in this step, it generates different rows for each keyed/non-keyed, boolean/scalar metric, which will be grouped together in <code>clients_scalar_aggregates</code>.</li>
<li>Clients on the release channel running the Windows operating system are sampled to reduce the data size.</li>
<li>Partitions expire in 7 days.</li>
</ul>
<h4 id="clients_scalar_aggregates"><a class="header" href="#clients_scalar_aggregates"><code>clients_scalar_aggregates</code></a></h4>
<ul>
<li>The process starts by taking the <code>clients_daily_scalar_aggregates</code> as the primary source.</li>
<li>It then groups all rows that have the same <code>submission_date</code> and <code>logical_date</code> from the keyed and non-keyed, scalar and boolean sources, and combines them into a single row in the <code>scalar_aggregates</code> column.</li>
<li>If the <code>agg_type</code> is <code>count</code>, <code>sum</code>, <code>true</code>, or <code>false</code>, the process will sum the values.</li>
<li>If the <code>agg_type</code> is <code>max</code>, it will take the maximum value, and if it is <code>min</code>, it will take the minimum value.</li>
<li>This process is only applied to the last three versions.</li>
<li>The partitions expire in 7 days.</li>
</ul>
<h4 id="scalar_percentiles"><a class="header" href="#scalar_percentiles"><code>scalar_percentiles</code></a></h4>
<ul>
<li>This process produces a user count and percentiles for scalar metrics.</li>
<li>It generates wildcard combinations of <code>os</code> and <code>app_build_id</code> and merges all submissions from a client for the same <code>os</code>, <code>app_version</code>, <code>app_build_id</code> and channel into the <code>scalar_aggregates</code> column.</li>
<li>The <code>user_count</code> column is computed taking sampling into account.</li>
<li>Finally, it splits the aggregates into percentiles ranging from 0.1 to 99.9.</li>
</ul>
<h4 id="client_scalar_probe_counts"><a class="header" href="#client_scalar_probe_counts"><code>client_scalar_probe_counts</code></a></h4>
<ul>
<li>This step processes booleans and scalars, although booleans are not supported by GLAM.
<ul>
<li>For boolean metrics the process aggregates their values with the following rule: &quot;never&quot; if all values for a metric are false, &quot;always&quot; if all values are true, and &quot;sometimes&quot; if there's a mix.</li>
<li>For scalar and <code>keyed_scalar</code> probes the process starts by building the buckets per metric, then it generates wildcards for os and <code>app_build_id</code>. It then aggregates all submissions from the same <code>client_id</code> under one row and assigns the <code>user_count</code> column to it with the following rule: 10 if os is &quot;Windows&quot; and channel is &quot;release&quot;, 1 otherwise. After that it finishes by aggregating the rows per metric, placing the scalar values in their appropriate buckets and summing up all <code>user_count</code> values for that metric.</li>
</ul>
</li>
</ul>
<hr />
<h4 id="glam_user_counts"><a class="header" href="#glam_user_counts"><code>glam_user_counts</code></a></h4>
<ul>
<li>Combines both aggregated scalar and histogram values.</li>
<li>This process produces a user count for each combination of <code>os</code>, <code>app_version</code>, <code>app_build_id</code>, channel.</li>
<li>It builds a client count from the union of histograms and scalars, including all combinations in which <code>os</code>, <code>app_version</code>, <code>app_build_id</code>, and <code>channel</code> are wildcards.</li>
</ul>
<h4 id="glam_sample_counts"><a class="header" href="#glam_sample_counts"><code>glam_sample_counts</code></a></h4>
<ul>
<li>This process calculates the <code>total_sample</code> column by adding up all the <code>aggregates</code> values.</li>
<li>This works because, in the primary sources, the values also represent a count of the samples that registered their respective keys.</li>
</ul>
<h4 id="extract_user_counts"><a class="header" href="#extract_user_counts"><code>extract_user_counts</code></a></h4>
<ul>
<li>This step exports user counts in its final shape to GCS as a CSV.</li>
<li>It first copies a deduplicated version of the primary source to a temporary table, removes the previously exported CSV files from GCS, then exports the temporary table to GCS as CSV files.</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/glam.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="experiment-datasets"><a class="header" href="#experiment-datasets">Experiment datasets</a></h1>
<p>See <a href="datasets/../concepts/experiments.html">Experimentation</a> for an overview of experimentation in Firefox.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/experiments.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="jetstream-datasets"><a class="header" href="#jetstream-datasets">Jetstream datasets</a></h1>
<ul>
<li><a href="datasets/jetstream.html#statistics-tables">Statistics tables</a>
<ul>
<li><a href="datasets/jetstream.html#examples">Examples</a></li>
</ul>
</li>
<li><a href="datasets/jetstream.html#client-window-aggregate-tables">Client-window aggregate tables</a></li>
<li><a href="datasets/jetstream.html#enrollment-tables">Enrollment tables</a></li>
<li><a href="datasets/jetstream.html#scheduling">Scheduling</a></li>
<li><a href="datasets/jetstream.html#code-reference">Code reference</a></li>
<li><a href="datasets/jetstream.html#documentation">Documentation</a></li>
</ul>
<p>Statistical summaries of telemetry data from experiments run in Mozilla
products are provided by <a href="https://github.com/mozilla/jetstream">Jetstream</a>. These summaries are published to
BigQuery and serve both as the substrate for the result visualization
platform and as a resource for data scientists.</p>
<p>Jetstream runs as part of the nightly ETL job (see <a href="datasets/jetstream.html#scheduling">Scheduling</a> below).
Jetstream is also run after pushes to the <a href="https://github.com/mozilla/jetstream-config"><code>jetstream-config</code></a> repository.
Jetstream publishes tables to the dataset <code>moz-fx-data-experiments.mozanalysis</code>.</p>
<p>Experiments are analyzed using the concept of analysis windows. Analysis
windows describe an interval marked from each client's day of
enrollment. The &quot;day 0&quot; analysis window aggregates data from the day
that each client enrolled in the experiment. Because the intervals are
demarcated from enrollment, they are not calendar dates; for some
clients in an experiment, day 0 could be a Tuesday, and for others a
Saturday.</p>
<p>The week 0 analysis window aggregates data from each client's days 0
through 6, the week 1 window aggregates data from days 7 through 13, and
so on.</p>
<p>Clients are given a fixed amount of time, specified in Experimenter and
often a week long, to enroll. Final day 0 results are available for
reporting at the end of the enrollment period, after the last eligible
client has enrolled, and week 0 results are available a week after the
enrollment period closes. Results for each window are published as soon
as complete data is available for all enrolled clients.</p>
<p>The &quot;overall&quot; window, published after the experiment has ended, is a
window beginning on each client's day 0 that spans the longest period
for which all clients have complete data.</p>
<p>Jetstream computes statistics over several metrics by default, including
for any features associated with the experiment in Experimenter. Data
scientists can provide configuration to add additional metrics. Advice
on configuring Jetstream can be found at the <a href="https://github.com/mozilla/jetstream-config"><code>jetstream-config</code></a> repository.</p>
<h2 id="statistics-tables"><a class="header" href="#statistics-tables">Statistics tables</a></h2>
<p>The statistics tables contain statistical summaries of their
corresponding aggregate tables. These tables are suitable for plotting
directly without additional transformations.</p>
<p>Statistics tables are named like:</p>
<p><code>statistics_&lt;slug&gt;_{day, week, overall}_&lt;index&gt;</code></p>
<p>A view is also created that concatenates all statistics tables for an
experiment of a given period type, named like:</p>
<p><code>statistics_&lt;slug&gt;_{daily, weekly, overall}</code></p>
<p>Statistics tables have the schema:</p>
<div class="table-wrapper"><table><thead><tr><th>Column name</th><th>Type</th><th>Description</th></tr></thead><tbody>
<tr><td><code>segment</code></td><td><code>STRING</code></td><td>The segment of the population being analyzed. &quot;all&quot; for the entire population.</td></tr>
<tr><td><code>metric</code></td><td><code>STRING</code></td><td>The slug of the metric, like <code>active_ticks</code> or <code>retained</code></td></tr>
<tr><td><code>statistic</code></td><td><code>STRING</code></td><td>The slug of the statistic that was used to summarize the metric, like &quot;mean&quot; or &quot;deciles&quot;</td></tr>
<tr><td><code>parameter</code></td><td><code>NUMERIC</code> (decimal)</td><td>A statistic-dependent quantity. For two-dimensional statistics like &quot;decile,&quot; this represents the x axis of the plot. For one-dimensional statistics, this is NULL.</td></tr>
<tr><td><code>comparison</code></td><td><code>STRING</code></td><td>If this row represents a comparison between two branches, this row describes what kind of comparison, like <code>difference</code> or <code>relative_uplift</code>. If this row represents a measurement of a single branch, then this column is NULL.</td></tr>
<tr><td><code>comparison_to_branch</code></td><td><code>STRING</code></td><td>If this row represents a comparison between two branches, this row describes which branch is being compared to. For simple A/B tests, this will be &quot;control.&quot;</td></tr>
<tr><td><code>ci_width</code></td><td><code>FLOAT64</code></td><td>A value between 0 and 1 describing the width of the confidence interval represented by the lower and upper columns. Valued at 0.95 for 95% confidence intervals.</td></tr>
<tr><td><code>point</code></td><td><code>FLOAT64</code></td><td>The point estimate of the statistic for the metric given the parameter.</td></tr>
<tr><td><code>lower</code></td><td><code>FLOAT64</code></td><td>The lower bound of the confidence interval for the estimate.</td></tr>
<tr><td><code>upper</code></td><td><code>FLOAT64</code></td><td>The upper bound of the confidence interval for the estimate.</td></tr>
<tr><td><code>window_index</code></td><td><code>INT64</code></td><td>(views only) A base-1 index reflecting the analysis window from which the row is drawn (i.e. day 1, day 2, …).</td></tr>
<tr><td><code>analysis_basis</code></td><td><code>STRING</code></td><td>Analysis basis statistic result is based on. Currently, <code>analysis_basis</code> can be either <code>enrollments</code> or <code>exposures</code>.</td></tr>
</tbody></table>
</div>
<p>Each combination of <code>(segment, metric, statistic, parameter, comparison, comparison_to_branch, ci_width, analysis_basis)</code> uniquely describes a single data
point.</p>
<p>The available segments in a table should be derived from inspection of
the table.</p>
<p><a href="https://github.com/mozilla/jetstream/wiki">Jetstream's Github wiki</a> has a description of each statistic and
comparison.</p>
<h3 id="examples"><a class="header" href="#examples">Examples</a></h3>
<p>To extract the mean of <code>active_hours</code> for each branch from a weekly
statistics view with a name like <code>statistics_bug_12345_slug_weekly</code>,
you could run the query:</p>
<pre><code class="language-sql">SELECT
segment,
window_index AS week,
branch,
point,
lower,
upper
FROM `moz-fx-data-experiments`.mozanalysis.statistics_bug_12345_slug_weekly
WHERE
metric = &quot;active_hours&quot;
AND statistic = &quot;mean&quot;
AND comparison IS NULL
</code></pre>
<p>This query would return a row for each user segment, for each week of
the experiment, for each branch, with the mean of the <code>active_hours</code>
metric.</p>
<p>To see whether the absolute difference of the mean of <code>active_hours</code> was
different between the control and treatment branches, you could run:</p>
<pre><code class="language-sql">SELECT
window_index AS week,
branch,
point,
lower,
upper
FROM `moz-fx-data-experiments`.mozanalysis.statistics_bug_12345_slug_weekly
WHERE
metric = &quot;active_hours&quot;
AND statistic = &quot;mean&quot;
AND comparison = &quot;difference&quot;
AND branch = &quot;treatment&quot;
AND comparison_to_branch = &quot;control&quot;
AND segment = &quot;all&quot;
</code></pre>
<p>This query would return a row for each week of the experiment containing
an estimate of the absolute difference between the treatment and control
branches for the segment containing all users.</p>
<h2 id="client-window-aggregate-tables"><a class="header" href="#client-window-aggregate-tables">Client-window aggregate tables</a></h2>
<p>The aggregate tables contain one row per enrolled <code>client_id</code>. An
aggregate table is written for each analysis window. The statistics
tables are derived from the aggregate tables. The aggregate tables are
less useful without additional processing but they may be useful for
diagnostics.</p>
<p>Aggregate tables are named like:</p>
<p><code>&lt;slug&gt;_&lt;analysis_basis&gt;_{day,week,overall}_&lt;index&gt;</code></p>
<p>Aggregate tables have flexible schemas. Every table contains the
columns:</p>
<div class="table-wrapper"><table><thead><tr><th>Column name</th><th>Type</th><th>Description</th></tr></thead><tbody>
<tr><td><code>client_id</code></td><td><code>STRING</code></td><td>Client's telemetry <code>client_id</code></td></tr>
<tr><td><code>branch</code></td><td><code>STRING</code></td><td>Branch client enrolled in</td></tr>
<tr><td><code>enrollment_date</code></td><td><code>DATE</code></td><td>First date that the client enrolled in the branch</td></tr>
<tr><td><code>exposure_date</code></td><td><code>DATE</code></td><td>First date that the client saw the exposure event (Optional)</td></tr>
<tr><td><code>num_enrollment_events</code></td><td><code>INT64</code></td><td>Number of times a client enrolled in the given branch</td></tr>
<tr><td><code>num_exposure_events</code></td><td><code>INT64</code></td><td>Number of times a client has seen the exposure event</td></tr>
<tr><td><code>analysis_window_start</code></td><td><code>INT64</code></td><td>The day after enrollment that this analysis window began; day 0 is the day of enrollment</td></tr>
<tr><td><code>analysis_window_end</code></td><td><code>INT64</code></td><td>The day after enrollment that this analysis window terminated (inclusive)</td></tr>
</tbody></table>
</div>
<p>The combination of <code>(client_id, branch)</code> is unique.</p>
<p>Each metric associated with the experiment defines an additional
(arbitrarily-typed) column.</p>
<p>Each data source associated with the experiment defines additional
<code>&lt;data_source&gt;_has_contradictory_branch</code> and
<code>&lt;data_source&gt;_has_non_enrolled_data</code> columns, which respectively
indicate whether <code>client_id</code> reported data from more than one branch or
without any tagged branch in that dataset over that analysis window.</p>
<p>Each segment associated with the experiment defines an additional boolean column.</p>
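<p>For example, a diagnostic sketch like the following counts enrolled clients per branch in a week 0 aggregate table, along with how many of them reported contradictory branches in a hypothetical <code>main</code> data source (the slug and the data source name are placeholders):</p>
<pre><code class="language-sql">SELECT
  branch,
  COUNT(*) AS enrolled_clients,
  COUNTIF(main_has_contradictory_branch) AS contradictory_branch_clients
FROM `moz-fx-data-experiments`.mozanalysis.bug_12345_slug_enrollments_week_0
GROUP BY
  branch
ORDER BY
  branch
</code></pre>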
<h2 id="enrollment-tables"><a class="header" href="#enrollment-tables">Enrollment tables</a></h2>
<p>Enrollment tables contain enrollment information per <code>client_id</code> for which
an <code>enroll</code> event has been received. An enrollment table for a specific experiment
is created once after the enrollment period has completed. The enrollment table
is then re-used in subsequent analysis runs.</p>
<p>Enrollment tables are named like:</p>
<p><code>enrollments_&lt;slug&gt;</code></p>
<p>Enrollment tables have flexible schemas, but every table contains the columns:</p>
<div class="table-wrapper"><table><thead><tr><th>Column name</th><th>Type</th><th>Description</th></tr></thead><tbody>
<tr><td><code>client_id</code></td><td><code>STRING</code></td><td>Client's telemetry <code>client_id</code></td></tr>
<tr><td><code>branch</code></td><td><code>STRING</code></td><td>Branch client enrolled in</td></tr>
<tr><td><code>enrollment_date</code></td><td><code>DATE</code></td><td>First date that the client enrolled in the branch</td></tr>
<tr><td><code>num_enrollment_events</code></td><td><code>INT64</code></td><td>Number of times a client enrolled in the given branch</td></tr>
</tbody></table>
</div>
<p>The combination of <code>(client_id, branch)</code> is unique.</p>
<p>Each segment defines an additional non-NULL boolean column, which is set to
<code>true</code> if the client is in the segment and <code>false</code> otherwise.</p>
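<p>A quick way to sanity-check enrollment is to count clients and enrollment events per branch directly from the enrollment table (the slug below is a placeholder):</p>
<pre><code class="language-sql">SELECT
  branch,
  COUNT(*) AS enrolled_clients,
  SUM(num_enrollment_events) AS enrollment_events
FROM `moz-fx-data-experiments`.mozanalysis.enrollments_bug_12345_slug
GROUP BY
  branch
</code></pre>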
<h2 id="scheduling-15"><a class="header" href="#scheduling-15">Scheduling</a></h2>
<p>Jetstream is updated nightly by telemetry-airflow.
It is invoked by the <a href="https://github.com/mozilla/telemetry-airflow/blob/master/dags/jetstream.py"><code>jetstream</code> DAG</a>.</p>
<h2 id="code-reference-11"><a class="header" href="#code-reference-11">Code reference</a></h2>
<p>Jetstream's datasets are generated by invoking <a href="https://github.com/mozilla/jetstream">Jetstream</a>.
Data scientists can configure Jetstream or trigger a Jetstream invocation
by interacting with the <a href="https://github.com/mozilla/jetstream-config"><code>jetstream-config</code></a> repository.</p>
<h2 id="documentation"><a class="header" href="#documentation">Documentation</a></h2>
<p>Additional documentation about Jetstream can be found in the <a href="https://experimenter.info/deep-dives/jetstream/overview">Jetstream Experimenter Docs</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/jetstream.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="analyzing-data-from-shield-studies"><a class="header" href="#analyzing-data-from-shield-studies">Analyzing data from SHIELD studies</a></h1>
<p>This article introduces the datasets that are useful for analyzing studies in Firefox.
After reading this article,
you should understand how to answer questions about
study enrollment,
identify telemetry from clients enrolled in an experiment,
and locate telemetry from add-on studies.</p>
<h2 id="table-of-contents-12"><a class="header" href="#table-of-contents-12">Table of contents</a></h2>
<ul>
<li><a href="datasets/experiment_telemetry.html#dashboards">Dashboards</a></li>
<li><a href="datasets/experiment_telemetry.html#experiment-slugs">Experiment slugs</a></li>
<li><a href="datasets/experiment_telemetry.html#tables">Tables</a>
<ul>
<li><a href="datasets/experiment_telemetry.html#experiments-map-ping-tables"><code>experiments</code> map (ping tables)</a></li>
<li><a href="datasets/experiment_telemetry.html#experiments-column-some-derived-tables"><code>experiments</code> column (some derived tables)</a></li>
<li><a href="datasets/experiment_telemetry.html#events"><code>events</code></a></li>
<li><a href="datasets/experiment_telemetry.html#telemetryshield_study_addon"><code>telemetry.shield_study_addon</code></a></li>
<li><a href="datasets/experiment_telemetry.html#telemetryshield_study"><code>telemetry.shield_study</code></a></li>
</ul>
</li>
</ul>
<h2 id="dashboards-1"><a class="header" href="#dashboards-1">Dashboards</a></h2>
<p><a href="https://experimenter.services.mozilla.com/">Experimenter</a> is the place to find lists of live experiments.</p>
<h2 id="experiment-slugs"><a class="header" href="#experiment-slugs">Experiment slugs</a></h2>
<p>Each experiment is associated with a slug,
which is the label used to identify the experiment to Normandy clients.
The slug is also used to identify the experiment in most telemetry.
The slug for pref-flip experiments is defined in the recipe by a field named <code>slug</code>;
the slug for add-on experiments is defined in the recipe by a field named <code>name</code>.</p>
<p>You can find the slug associated with an experiment in Experimenter.</p>
<h2 id="tables-1"><a class="header" href="#tables-1">Tables</a></h2>
<h3 id="experiments-map-ping-tables"><a class="header" href="#experiments-map-ping-tables"><code>experiments</code> map (ping tables)</a></h3>
<p>Ping tables and some derived tables include an <code>experiments</code> column
which is a mapping from an experiment slug to a struct of information
about the client's state in an experiment in which they are enrolled.</p>
<p>The struct will include the fields <code>branch</code> and <code>enrollment_id</code>,
the latter of which is a unique identifier computed at the time of enrollment
to allow counting the number of physical clients that enroll,
even in the presence of <code>client_id</code> sharing.</p>
<p>You can collect rows from enrolled clients using syntax like:</p>
<pre><code class="language-sql">SELECT
... some fields ...,
mozfun.map.get_key(experiments, 'some-experiment-slug-12345').branch
FROM
telemetry.main
WHERE
mozfun.map.get_key(experiments, 'some-experiment-slug-12345') IS NOT NULL
</code></pre>
<h3 id="experiments-column-some-derived-tables"><a class="header" href="#experiments-column-some-derived-tables"><code>experiments</code> column (some derived tables)</a></h3>
<p><a href="datasets/batch_view/main_summary/reference.html"><code>main_summary</code></a>,
<a href="datasets/batch_view/clients_daily/reference.html"><code>clients_daily</code></a>,
and some other tables
include an <code>experiments</code> column
which is a mapping from experiment slug to branch.</p>
<p>You can collect rows from enrolled clients using query syntax like:</p>
<pre><code class="language-sql">SELECT
... some fields ...,
mozfun.map.get_key(experiments, 'some-experiment-slug-12345') AS branch
FROM
telemetry.clients_daily
WHERE
mozfun.map.get_key(experiments, 'some-experiment-slug-12345') IS NOT NULL
</code></pre>
<h3 id="events-1"><a class="header" href="#events-1"><code>events</code></a></h3>
<p>The <a href="datasets/batch_view/events/reference.html"><code>events</code> table</a> includes
Normandy and Nimbus enrollment and unenrollment events
for all kinds of studies.</p>
<p>Normandy and Nimbus events both have event category <code>normandy</code>.
The event value will contain the experiment slug.</p>
<p>The event schema is described
<a href="https://hg.mozilla.org/mozilla-central/file/tip/toolkit/components/normandy/lib/TelemetryEvents.sys.mjs">in the Firefox source tree</a>.</p>
<p>The <code>events</code> table is updated daily.</p>
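<p>For example, a sketch like the following counts daily enrollment events for a given slug over the last four weeks. The <code>enroll</code> method name is an assumption here; check the event schema linked above for the exact methods and objects emitted:</p>
<pre><code class="language-sql">SELECT
  submission_date,
  COUNT(*) AS enroll_events
FROM
  telemetry.events
WHERE
  event_category = 'normandy'
  AND event_method = 'enroll'
  AND event_string_value = 'some-experiment-slug-12345'
  AND submission_date &gt;= DATE_SUB(CURRENT_DATE(), INTERVAL 28 DAY)
GROUP BY
  submission_date
ORDER BY
  submission_date
</code></pre>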
<h3 id="telemetryshield_study_addon"><a class="header" href="#telemetryshield_study_addon"><code>telemetry.shield_study_addon</code></a></h3>
<p>The <code>telemetry.shield_study_addon</code> table contains SHIELD telemetry from legacy add-on experiments,
i.e. key-value pairs sent with the
<code>browser.study.sendTelemetry()</code> method from the
<a href="https://github.com/mozilla/shield-studies-addon-utils/">SHIELD study add-on utilities</a>
library.</p>
<p>The <code>study_name</code> attribute of the <code>payload</code> column will contain the identifier
registered with the SHIELD add-on utilities.
This is set by the add-on; sometimes it takes the value of
<code>applications.gecko.id</code> from the add-on's <code>manifest.json</code>.
This is often not the same as the Normandy slug.</p>
<p>The schema for shield-study-addon pings is described in the
<a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/tree/master/schemas/telemetry/shield-study-addon"><code>mozilla-pipeline-schemas</code> repository</a>.</p>
<p>The key-value pairs are present in <code>data</code> attribute of the <code>payload</code> column.</p>
<p>The <code>telemetry.shield_study_addon</code> table contains only full days of data.
If you need access to data with lower latency, you can use the &quot;live&quot; table
<code>telemetry_live.shield_study_addon_v4</code> which should have latency significantly
less than 1 hour.</p>
<h3 id="telemetryshield_study"><a class="header" href="#telemetryshield_study"><code>telemetry.shield_study</code></a></h3>
<p>The <code>telemetry.shield_study</code> dataset includes
enrollment and unenrollment events for legacy add-on experiments only,
sent by the <a href="https://github.com/mozilla/shield-studies-addon-utils/">SHIELD study add-on utilities</a>.</p>
<p>The <code>study_name</code> attribute of the <code>payload</code> column will contain the identifier
registered with the SHIELD add-on utilities.
This is set by the add-on; sometimes it takes the value of
<code>applications.gecko.id</code> from the add-on's <code>manifest.json</code>.
This is often not the same as the Normandy slug.</p>
<p>Normandy also emits its own enrollment and unenrollment events for these studies,
which are available in the <code>events</code> table.</p>
<p>The <code>telemetry.shield_study</code> table contains only full days of data.
If you need access to data with lower latency, you can use the &quot;live&quot; table
<code>telemetry_live.shield_study_v4</code> which should have latency significantly
less than 1 hour.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/experiment_telemetry.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="accessing-heartbeat-data"><a class="header" href="#accessing-heartbeat-data">Accessing Heartbeat data</a></h1>
<p><a href="datasets/../concepts/experiments.html#heartbeat">Heartbeat</a> survey studies return telemetry on user engagement with the survey prompt.
The heartbeat pings do not contain the survey responses themselves,
which are stored by SurveyGizmo.</p>
<p>The telemetry is received using the <code>heartbeat</code> document type,
which is <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/heartbeat-ping.html">described in the Firefox source tree docs</a>.</p>
<h2 id="linking-heartbeat-responses-to-telemetry"><a class="header" href="#linking-heartbeat-responses-to-telemetry">Linking Heartbeat responses to telemetry</a></h2>
<p>Heartbeat responses may be linked to Firefox telemetry
if there is a <code>&quot;includeTelemetryUUID&quot;: true</code> key in the <code>arguments</code> object
of the <a href="https://mozilla.github.io/normandy/user/actions/show-heartbeat.html"><code>show-heartbeat</code> recipe</a>.</p>
<p>Heartbeat never reports telemetry <code>client_id</code>s to SurveyGizmo, but,
when <code>includeTelemetryUUID</code> is true,
the Normandy <code>user_id</code> is reported to SurveyGizmo
as the <code>userid</code> URL variable.
Simultaneously, a <code>heartbeat</code> ping is sent to Mozilla,
containing both the telemetry <code>client_id</code> and the Normandy <code>userid</code> that was reported to SurveyGizmo.</p>
<p>The <code>userid</code> is reported by appending it to the <code>surveyId</code> field of the ping, like:</p>
<pre><code>hb-example-slug::e87bcae5-bb63-4829-822a-85ba41ee5d53
</code></pre>
<p>These can be extracted from the ping table for analysis using expressions like:</p>
<pre><code class="language-sql">SPLIT(payload.survey_id,'::')[OFFSET(1)] AS surveygizmo_userid
</code></pre>
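<p>Putting this together, a sketch like the following links heartbeat pings for one survey back to telemetry <code>client_id</code>s (the slug prefix is the example value from above):</p>
<pre><code class="language-sql">SELECT
  client_id,
  SPLIT(payload.survey_id, '::')[OFFSET(1)] AS surveygizmo_userid
FROM
  telemetry.heartbeat
WHERE
  STARTS_WITH(payload.survey_id, 'hb-example-slug::')
</code></pre>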
<h2 id="data-reference-11"><a class="header" href="#data-reference-11">Data reference</a></h2>
<p>Heartbeat data is available in the <code>telemetry.heartbeat</code> table in BigQuery.</p>
<p>Its structure matches the <a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/blob/8b0641ebb8aad570b79e811ae10fd81c718af48f/schemas/telemetry/heartbeat/heartbeat.4.schema.json">heartbeat ping schema</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/heartbeat.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="dynamic-telemetry"><a class="header" href="#dynamic-telemetry">Dynamic telemetry</a></h1>
<p>Add-on studies may choose to implement new
<a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/collection/scalars.html">scalar</a> or <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/collection/events.html">event</a> telemetry probes.
These probes are not described in
the probe metadata files in the Firefox source tree
and are not described in the <a href="https://probes.telemetry.mozilla.org/">probe dictionary</a>.
Often, they are documented in the repositories
associated with the add-on studies instead.</p>
<p>There is no complete central reference for these.
This page is intended as a partial historical reference
for these probes.</p>
<div class="table-wrapper"><table><thead><tr><th>Start date</th><th>Study</th><th>Probe type</th><th>Probe names</th><th>Documentation</th></tr></thead><tbody>
<tr><td>2020-11</td><td><a href="https://github.com/mozilla-extensions/dnssec-interference">DNSSEC interference study</a></td><td>custom ping</td><td><code>dnssec-study-v1</code> ping</td><td>https://github.com/mozilla-extensions/dnssec-interference/blob/master/TELEMETRY.md</td></tr>
<tr><td>2020-08</td><td><a href="https://github.com/mozilla-extensions/doh-resolver-usage-study">DoH Resolver Usage Study</a></td><td>event</td><td><code>doh.study.resolverusage#resolve.domains</code></td><td>https://github.com/mozilla-extensions/doh-resolver-usage-study/blob/master/docs/TELEMETRY.md</td></tr>
<tr><td>2020-06</td><td><a href="https://github.com/mozilla-extensions/login-study">Google Accounts Login Check</a></td><td>custom ping</td><td><a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/pull/561"><code>normandy-login-study</code></a> ping ingested to the <code>telemetry.normandy_login_study</code> table</td><td>https://github.com/mozilla-extensions/login-study/blob/master/login-check-metrics.md</td></tr>
<tr><td>2020-04</td><td><a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1623996">HTTP Upgrade</a></td><td>scalar</td><td><code>httpsUpgradeStudy.https</code>, <code>httpsUpgradeStudy.nonupgradable</code>, <code>httpsUpgradeStudy.upgradable</code></td><td>https://bugzilla.mozilla.org/show_bug.cgi?id=1629585</td></tr>
<tr><td>2020-02</td><td><a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1564506">Search interventions</a></td><td>scalar</td><td><code>urlbarInterventionsExperiment.tipShownCount</code>, <code>.tipPickedCount</code></td><td>missing</td></tr>
<tr><td>2019-10</td><td><a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1594926">Delegated credentials</a></td><td>event</td><td><code>delegatedcredentials#connectDC</code>, <code>#connectNoDC</code></td><td>https://github.com/kjacobs-moz/dc-experiment-addon</td></tr>
<tr><td>2019-10</td><td><a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1573840">DNS over HTTPS heuristics</a></td><td>event</td><td><code>doh#evaluate.heuristics</code>, <code>doh#state</code></td><td>https://github.com/mozilla/doh-rollout/blob/6787458a6901ef3b2a8fef86a179899213809534/docs/telemetry.md</td></tr>
</tbody></table>
</div><footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/dynamic_telemetry.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="experiment-monitoring-datasets"><a class="header" href="#experiment-monitoring-datasets">Experiment monitoring datasets</a></h1>
<ul>
<li><a href="datasets/experiment_monitoring.html#experiment-enrollment-data">Experiment enrollment data</a></li>
<li><a href="datasets/experiment_monitoring.html#experiment-search-metrics-data">Experiment search metrics data</a></li>
<li><a href="datasets/experiment_monitoring.html#derived-dataset">Derived dataset</a></li>
<li><a href="datasets/experiment_monitoring.html#gcs-data-export">GCS data export</a></li>
<li><a href="datasets/experiment_monitoring.html#implementation">Implementation</a></li>
<li><a href="datasets/experiment_monitoring.html#code-reference">Code reference</a></li>
</ul>
<p>Experiment monitoring datasets are designed to power dashboards, such as the <a href="https://mozilla.cloud.looker.com/dashboards/216">Experiment Enrollment Grafana dashboard</a>, for monitoring experiments in real time. Currently, datasets are available for monitoring the number of enrollments and the number of searches performed by clients enrolled in experiments.</p>
<h2 id="experiment-enrollment-data"><a class="header" href="#experiment-enrollment-data">Experiment enrollment data</a></h2>
<p><code>moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_aggregates_live_v1</code> provides enrollment, unenrollment, graduate, update and failure aggregates for experiments and branches over 5-minute intervals for Fenix and desktop experiments. This live view is also the basis of several derived views:</p>
<div class="table-wrapper"><table><thead><tr><th>Dataset name</th><th>Description</th></tr></thead><tbody>
<tr><td><code>mozdata.telemetry.experiment_unenrollment_overall</code></td><td>Overall number of clients that unenrolled from experiments</td></tr>
<tr><td><code>mozdata.telemetry.experiment_enrollment_other_events_overall</code></td><td>Number of events other than <code>enroll</code> and <code>unenroll</code> sent by clients</td></tr>
<tr><td><code>mozdata.telemetry.experiment_enrollment_cumulative_population_estimate</code></td><td>Cumulative number of clients enrolled in experiments</td></tr>
<tr><td><code>mozdata.telemetry.experiment_enrollment_overall</code></td><td>Overall number of clients enrolled in experiments</td></tr>
<tr><td><code>mozdata.telemetry.experiment_enrollment_daily_active_population</code></td><td>Number of daily active clients enrolled in experiments</td></tr>
</tbody></table>
</div>
<h2 id="experiment-search-metrics-data"><a class="header" href="#experiment-search-metrics-data">Experiment search metrics data</a></h2>
<p><code>moz-fx-data-shared-prod.telemetry_derived.experiment_search_aggregates_live_v1</code> provides aggregated search metrics of clients enrolled in Fenix and desktop experiments, such as the number of searches performed, the number of searches with ads and the number of ad clicks. This live view is also the basis of several derived views:</p>
<div class="table-wrapper"><table><thead><tr><th>Dataset name</th><th>Description</th></tr></thead><tbody>
<tr><td><code>mozdata.telemetry.experiment_cumulative_ad_clicks</code></td><td>Cumulative number of ad clicks by clients enrolled in experiments</td></tr>
<tr><td><code>mozdata.telemetry.experiment_cumulative_search_count</code></td><td>Cumulative number of searches by clients enrolled in experiments</td></tr>
<tr><td><code>mozdata.telemetry.experiment_cumulative_search_with_ads_count</code></td><td>Cumulative number of searches with ads by clients enrolled in experiments</td></tr>
</tbody></table>
</div>
<h2 id="derived-dataset-1"><a class="header" href="#derived-dataset-1">Derived dataset</a></h2>
<p>The derived views have the following schema:</p>
<div class="table-wrapper"><table><thead><tr><th>Column name</th><th>Type</th><th>Description</th></tr></thead><tbody>
<tr><td><code>time</code></td><td><code>TIMESTAMP</code></td><td>Timestamp when value was recorded</td></tr>
<tr><td><code>branch</code></td><td><code>STRING</code></td><td>Experiment branch</td></tr>
<tr><td><code>experiment</code></td><td><code>STRING</code></td><td>Experiment slug</td></tr>
<tr><td><code>value</code></td><td><code>INT64</code></td><td>Aggregated value</td></tr>
</tbody></table>
</div>
<p>As an example of how these views can be used, the following query determines the cumulative number of clients enrolled to date
in each branch of the <code>multi-stage-aboutwelcome-set-default-as-first-screen</code> experiment:</p>
<pre><code class="language-sql">SELECT
branch,
SUM(value) AS total_enrolled
FROM `mozdata.telemetry.experiment_enrollment_cumulative_population_estimate`
WHERE experiment = 'multi-stage-aboutwelcome-set-default-as-first-screen'
GROUP BY 1
ORDER BY 2
</code></pre>
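<p>The search metrics views share the same schema, so a time series of cumulative ad clicks per branch for a single experiment could be pulled with a query like the following sketch (the experiment slug here is a placeholder):</p>
<pre><code class="language-sql">SELECT
  time,
  branch,
  value AS cumulative_ad_clicks
FROM `mozdata.telemetry.experiment_cumulative_ad_clicks`
WHERE experiment = 'my-experiment-slug'
ORDER BY time
</code></pre>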
<h2 id="gcs-data-export"><a class="header" href="#gcs-data-export">GCS data export</a></h2>
<p>As some dashboard solutions, such as the Experimenter console, might not have access to BigQuery, data from derived experiment monitoring views is also exported as JSON to <code>monitoring/</code> in the <code>mozanalysis</code> bucket in <code>moz-fx-data-experiments</code>. JSON files are named like: <code>&lt;experiment_slug&gt;_&lt;monitoring_dataset_name&gt;.json</code>, for example: <code>gs://mozanalysis/monitoring/bug-1683348-rollout-tab-modal-print-ui-roll-out-release-84-85_experiment_unenrollment_overall.json</code></p>
<p>A script for exporting this data is <a href="https://github.com/mozilla/telemetry-airflow/blob/ad3d678cb45c7ac67cb96a46efb6b4e731b856f0/dags/experiments_live.py#L70">scheduled to run via Airflow</a> every 5 minutes.</p>
<h2 id="implementation"><a class="header" href="#implementation">Implementation</a></h2>
<p>To keep the cost low for retrieving live monitoring data, <a href="https://cloud.google.com/bigquery/docs/materialized-views-intro">BigQuery materialized views</a> have been set up. These materialized views read delta changes from the base live tables to compute up-to-date results every 5 minutes.</p>
<p>As materialized views do not support <code>UNION ALL</code>, separate materialized views are deployed for legacy desktop telemetry and for each Fenix-related dataset.</p>
<p>Materialized views for experiment enrollment events:</p>
<ul>
<li><code>org_mozilla_fenix_derived.experiment_events_live_v1</code></li>
<li><code>org_mozilla_firefox_derived.experiment_events_live_v1</code></li>
<li><code>org_mozilla_firefox_beta_derived.experiment_events_live_v1</code></li>
<li><code>telemetry_derived.experiment_events_live_v1</code></li>
</ul>
<p>The <code>moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_aggregates_live_v1</code> view combines data of the past 2 days from all of the materialized views for experiment enrollments with data older than 2 days from <code>telemetry_derived.experiment_enrollment_aggregates_v1</code>.</p>
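<p>The combining view itself is a regular (logical) view, where <code>UNION ALL</code> is allowed. A purely illustrative sketch of this pattern follows; the column names are placeholders, not the actual schema (see the <code>view.sql</code> files linked in the code reference below for the real definition):</p>
<pre><code class="language-sql">-- Illustrative only: recent rows come from the materialized views,
-- older rows from the historical aggregates table.
SELECT window_start, experiment, branch, enroll_count
FROM `moz-fx-data-shared-prod`.org_mozilla_fenix_derived.experiment_events_live_v1
WHERE DATE(window_start) &gt; DATE_SUB(CURRENT_DATE, INTERVAL 2 DAY)
-- ... one branch like the above per materialized view ...
UNION ALL
SELECT window_start, experiment, branch, enroll_count
FROM `moz-fx-data-shared-prod`.telemetry_derived.experiment_enrollment_aggregates_v1
WHERE DATE(window_start) &lt;= DATE_SUB(CURRENT_DATE, INTERVAL 2 DAY)
</code></pre>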
<p>Materialized views for search metrics:</p>
<ul>
<li><code>org_mozilla_fenix_derived.experiment_search_events_live_v1</code></li>
<li><code>org_mozilla_firefox_derived.experiment_search_events_live_v1</code></li>
<li><code>org_mozilla_firefox_beta_derived.experiment_search_events_live_v1</code></li>
<li><code>telemetry_derived.experiment_search_events_live_v1</code></li>
</ul>
<p>The <code>moz-fx-data-shared-prod.telemetry_derived.experiment_search_aggregates_live_v1</code> view combines data of the past 2 days from all of the materialized views for search metrics with data older than 2 days from <code>telemetry_derived.search_aggregates_v1</code>.</p>
<h2 id="code-reference-12"><a class="header" href="#code-reference-12">Code reference</a></h2>
<ul>
<li><a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/telemetry_derived/experiment_enrollment_aggregates_live_v1/view.sql"><code>moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_aggregates_live_v1</code></a></li>
<li><a href="https://github.com/mozilla/bigquery-etl/tree/master/sql/moz-fx-data-shared-prod/org_mozilla_fenix_derived/experiment_events_live_v1/init.sql"><code>org_mozilla_fenix_derived.experiment_events_live_v1</code></a></li>
<li><a href="https://github.com/mozilla/bigquery-etl/tree/master/sql/moz-fx-data-shared-prod/org_mozilla_firefox_derived/experiment_events_live_v1/init.sql"><code>org_mozilla_firefox_derived.experiment_events_live_v1</code></a></li>
<li><a href="https://github.com/mozilla/bigquery-etl/tree/master/sql/moz-fx-data-shared-prod/org_mozilla_firefox_beta_derived/experiment_events_live_v1/init.sql"><code>org_mozilla_firefox_beta_derived.experiment_events_live_v1</code></a></li>
</ul>
<ul>
<li><a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/telemetry_derived/experiment_search_aggregates_live_v1/view.sql"><code>moz-fx-data-shared-prod.telemetry_derived.experiment_search_aggregates_live_v1</code></a></li>
<li><a href="https://github.com/mozilla/bigquery-etl/tree/master/sql/moz-fx-data-shared-prod/org_mozilla_fenix_derived/experiment_search_events_live_v1/init.sql"><code>org_mozilla_fenix_derived.experiment_search_events_live_v1</code></a></li>
<li><a href="https://github.com/mozilla/bigquery-etl/tree/master/sql/moz-fx-data-shared-prod/org_mozilla_firefox_derived/experiment_search_events_live_v1/init.sql"><code>org_mozilla_firefox_derived.experiment_search_events_live_v1</code></a></li>
<li><a href="https://github.com/mozilla/bigquery-etl/tree/master/sql/moz-fx-data-shared-prod/org_mozilla_firefox_beta_derived/experiment_search_events_live_v1/init.sql"><code>org_mozilla_firefox_beta_derived.experiment_search_events_live_v1</code></a></li>
</ul>
<div style="break-before: page; page-break-before: always;"></div><h1 id="search-data"><a class="header" href="#search-data">Search Data</a></h1>
<h2 id="introduction-12"><a class="header" href="#introduction-12">Introduction</a></h2>
<p>This article introduces the datasets we maintain for search analyses:
<code>search_aggregates</code> and <code>search_clients_engines_sources_daily</code>. After reading this article,
you should understand the search datasets well enough to produce moderately
complex analyses.</p>
<p>Additionally, see the <code>Building Intuition</code> search dashboards for applied examples of the search datasets, listed in recommended order of consumption:</p>
<ul>
<li><a href="https://mozilla.cloud.looker.com/dashboards-next/312">Search Monetization</a>, an overview of how search partnerships generate revenue</li>
<li><a href="https://mozilla.cloud.looker.com/dashboards-next/314">Search Mechanics</a>, an in-depth look at the search types described below + searches with ads and ad clicks, with special attention to search behavior in important subgroups of the population</li>
<li><a href="https://mozilla.cloud.looker.com/dashboards-next/319">Search Analyses</a>, a mapping of the search datasets to corresponding Looker Explores</li>
<li><a href="https://mozilla.cloud.looker.com/dashboards-next/256">Search Access Points</a>, an overview of search, ad impressions, and ad clicks across the different search access points (SAPs) built into the browser</li>
<li><a href="https://mozilla.cloud.looker.com/dashboards/542">Regional Search Providers</a>, an introduction to regional search providers <code>Baidu</code>, <code>Yandex</code>, <code>Qwant</code>, <code>Ecosia</code>, and <code>Yahoo Japan</code>.</li>
</ul>
<h2 id="table-of-contents-13"><a class="header" href="#table-of-contents-13">Table of Contents</a></h2>
<ul>
<li><a href="datasets/search.html#terminology">Terminology</a>
<ul>
<li><a href="datasets/search.html#direct-vs-follow-on-search">Direct vs Follow-on Search</a></li>
<li><a href="datasets/search.html#tagged-vs-untagged-searches">Tagged vs Untagged Searches</a></li>
</ul>
</li>
<li><a href="datasets/search.html#standard-search-aggregates">Standard Search Aggregates</a>
<ul>
<li><a href="datasets/search.html#outlier-filtering">Outlier Filtering</a></li>
</ul>
</li>
<li><a href="datasets/search.html#in-content-telemetry-issues">In Content Telemetry Issues</a>
<ul>
<li><a href="datasets/search.html#relies-on-whitelists">Relies on whitelists</a></li>
<li><a href="datasets/search.html#limited-historical-data">Limited historical data</a></li>
<li><a href="datasets/search.html#adblocker-addon"><code>AdBlocker</code> Addon</a></li>
</ul>
</li>
<li><a href="datasets/search.html#address-bar--search-overviews">Address Bar &amp; Search Overviews</a></li>
</ul>
<h1 id="terminology-1"><a class="header" href="#terminology-1">Terminology</a></h1>
<h2 id="direct-vs-follow-on-search"><a class="header" href="#direct-vs-follow-on-search">Direct vs Follow-on Search</a></h2>
<p>Searches can be split into three major classes: <em>sap</em>, <em>follow-on</em>, and <em>organic</em>.</p>
<p>SAP searches result from a direct interaction with a <strong>search access point (SAP)</strong>: a location in the Firefox UI where clients can enter search queries. Searches that originate from these SAPs are called SAP searches. For the most recent list of search access points, see the <a href="https://firefox-source-docs.mozilla.org/browser/search/telemetry.html#browsersearchtelemetry-jsm">Search telemetry documentation</a>, as SAPs continue to evolve over time.</p>
<p>The Firefox browser has multiple SAPs available at the same time. For visuals noting the location of Firefox SAPs, see <a href="https://mozilla.cloud.looker.com/dashboards-next/256">here</a>. These SAPs are recorded in the <code>source</code> field in search tables, and include the following:</p>
<ul>
<li><code>urlbar</code> - entering a search query in the Awesomebar. Searches typed into the search bar in the middle of the browser window will also be recorded as <code>urlbar</code> searches.</li>
<li><code>urlbar-searchmode</code> - selecting a search partner icon while entering a search query in the Awesomebar, or tagging the search partner (e.g. <code>@duckduckgo</code>) before entering the search query in the Awesomebar (formerly called <code>alias</code>; <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1499193">added</a> in Firefox 64).</li>
<li><code>urlbar-handoff</code> - often referred to as new tab search. Starting in Firefox 94, searches typed into the search box in the middle of the browser window are attributed to <code>urlbar-handoff</code>. See the <a href="https://probes.telemetry.mozilla.org/?search=urlbar&amp;view=detail&amp;probeId=scalar%2Fbrowser.search.content.urlbar_handoff">probe dictionary</a> and <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1732429">this bug</a> for more details.</li>
<li><code>newtab</code> - searches from the search box on the <code>about:newtab</code> page.</li>
<li><code>abouthome</code> - searches from the search box on the <code>about:home</code> page.</li>
<li><code>searchbar</code> - the main search bar (on the top right corner of browser window); not present by default for new profiles on Firefox 57+. <code>Searchmode</code> searches via the <code>searchbar</code> are logged as regular <code>searchbar</code> searches.</li>
<li><code>contextmenu</code> - highlight text, right click, and select &quot;Search [search engine] for [highlighted text]&quot; from the context menu.</li>
<li><code>system</code> - starting Firefox from the command line with an option that immediately makes a search.</li>
<li><code>webextension</code> - initiated from a web extension (<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1492233">added</a> as of Firefox 63).</li>
</ul>
<p>Note: Search telemetry evolves so the actual <code>source</code> name used for a specific search access point may vary between different versions of Firefox. For example, to catch SAP searches performed in Awesomebar after Firefox 94, you will need to use <code>source in (&quot;urlbar&quot;, &quot;urlbar-searchmode&quot;, &quot;urlbar-handoff&quot;)</code>.</p>
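<p>For example, a query like the following (against the <code>search_aggregates</code> dataset described later in this document; the date range is arbitrary) counts daily Awesomebar SAP searches across all three <code>urlbar</code>-related sources:</p>
<pre><code class="language-sql">SELECT
  submission_date,
  SUM(sap) AS urlbar_sap_searches
FROM search.search_aggregates
WHERE
  submission_date BETWEEN '2022-01-01' AND '2022-01-07'
  AND source IN ('urlbar', 'urlbar-searchmode', 'urlbar-handoff')
GROUP BY submission_date
ORDER BY submission_date
</code></pre>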
<p>Users will often interact with the Search Engine Results Page (SERP) to create &quot;downstream&quot; queries. These queries are called <strong>follow-on queries</strong> but may also be referred to as <strong>in-content queries</strong> since they are initiated from the content of the page itself and not from the Firefox UI.</p>
<p>For example, follow-on queries can be caused by:</p>
<ul>
<li>Revising a query (<code>restaurants</code> becomes <code>restaurants near me</code>)</li>
<li>Clicking on the &quot;next&quot; button</li>
<li>Accepting spelling suggestions</li>
</ul>
<p>Finally, we track the number of <em>organic</em> searches. These searches are not via SAPs and are instead entered directly via a search engine provider, typically by visiting the provider's website and entering a search query through their website's interface.</p>
<h2 id="tagged-vs-untagged-searches"><a class="header" href="#tagged-vs-untagged-searches">Tagged vs Untagged Searches</a></h2>
<p>Our partners (search engines) attribute queries to Mozilla using <strong>partner codes</strong>. When a user issues a query through one of our SAPs, we include our partner code in the URL of the resulting search.</p>
<p><strong>Tagged queries</strong> are queries that <strong>include one of our partner codes</strong>. If a SAP query is tagged, any follow-on query should also be tagged.</p>
<p><strong>Untagged queries</strong> are queries that <strong>do not include one of our partner codes</strong>. If a query is untagged, it's usually because we do not have a partner deal for that search engine and region (or it is an organic search that did not start from an SAP).</p>
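<p>As a rough illustration (assuming <code>tagged_sap</code> counts the subset of SAP searches that carried a partner code), the tagged share of SAP searches per engine could be estimated from the <code>search_aggregates</code> dataset described below; the date is arbitrary:</p>
<pre><code class="language-sql">SELECT
  engine,
  SAFE_DIVIDE(SUM(tagged_sap), SUM(sap)) AS tagged_share_of_sap
FROM search.search_aggregates
WHERE submission_date = '2022-01-01'
GROUP BY engine
ORDER BY tagged_share_of_sap DESC
</code></pre>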
<h1 id="standard-search-aggregates"><a class="header" href="#standard-search-aggregates">Standard Search Aggregates</a></h1>
<p>We report nine types of searches in our search datasets: <code>sap</code>, <code>tagged-sap</code>, <code>tagged-follow-on</code>, <code>search_with_ads</code>, <code>search_with_ads_organic</code>, <code>ad_click</code>, <code>ad_click_organic</code>, <code>organic</code>, and <code>unknown</code> (although the earliest available date for each varies). These aggregates show up as columns in the <code>search_aggregates</code> and <code>search_clients_engines_sources_daily</code> datasets. Our search datasets are all derived from <code>main_summary</code>. The aggregate columns are derived from the <code>SEARCH_COUNTS</code> histogram.</p>
<p>The <strong><code>sap</code> column counts all SAP (or direct) searches</strong>. <code>sap</code> search counts are collected via <a href="https://firefox-source-docs.mozilla.org/browser/browser/BrowserUsageTelemetry.html#search-telemetry">probes</a> within the Firefox UI. These counts are <strong>very reliable, but do not count follow-on queries</strong>.</p>
<p>In 2017-06 we deployed the <a href="https://github.com/mozilla/followonsearch"><code>followonsearch</code> addon</a>, which adds probes for <code>tagged-sap</code> and <code>tagged-follow-on</code> searches. These columns <strong>attempt to count all tagged searches</strong> by looking for Mozilla partner codes in the URL of requests to partner search engines. These search counts are critical to understanding revenue since they exclude untagged searches and include follow-on searches. However, these search counts have <strong>important caveats affecting their reliability</strong>. See <a href="datasets/search.html#in-content-telemetry-issues">In Content Telemetry Issues</a> for more information.</p>
<p>In 2018, we <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1475571">incorporated</a> this code into the product (as of version 61) and also started tracking so-called &quot;organic&quot; searches that weren't initiated through a search access point (SAP). This data has the same caveats as those for follow-on searches, above.</p>
<p>We also started tracking &quot;unknown&quot; searches, which generally correspond to clients submitting random/unknown search data to our servers as part of their telemetry payload. This category can generally safely be ignored, unless its value is extremely high (which indicates a bug in either Firefox or the aggregation code which creates our datasets).</p>
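<p>A quick sanity check is to look at <code>unknown</code> as a share of all recorded searches over a recent period; a minimal sketch (dates arbitrary) might look like:</p>
<pre><code class="language-sql">SELECT
  submission_date,
  SAFE_DIVIDE(
    SUM(unknown),
    SUM(sap) + SUM(organic) + SUM(unknown)
  ) AS unknown_share
FROM search.search_aggregates
WHERE submission_date BETWEEN '2022-01-01' AND '2022-01-07'
GROUP BY submission_date
ORDER BY submission_date
</code></pre>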
<p>In <code>main_summary</code>, all of these searches are stored in <code>search_counts.count</code>, <strong>which makes it easy to overcount searches</strong>. However, in general, please avoid using <code>main_summary</code> for search analyses -- it's slow and you will need to duplicate much of the work done to make analyses of our search datasets tractable.</p>
<h2 id="outlier-filtering"><a class="header" href="#outlier-filtering">Outlier Filtering</a></h2>
<p>We remove search count observations representing more than 10,000 searches for a single search engine in a single ping.</p>
<h1 id="in-content-telemetry-issues"><a class="header" href="#in-content-telemetry-issues">In Content Telemetry Issues</a></h1>
<p>The search code module inside Firefox (formerly implemented as an addon until version 60) implements the probe used to measure <code>tagged-sap</code> and
<code>tagged-follow-on</code> searches and also tracks organic searches. This probe is critical to understanding our revenue. It's the only tool that gives us a view of follow-on searches and differentiates between tagged and untagged queries. However, it comes with some notable caveats.</p>
<h2 id="relies-on-whitelists"><a class="header" href="#relies-on-whitelists">Relies on whitelists</a></h2>
<p>Firefox's search module attempts to count all tagged searches by looking for Mozilla partner codes in the URL of requests to partner search engines. To do this, it relies on a whitelist of partner codes and URL formats. The list of partner codes is incomplete and only covers a few top partners. These codes also occasionally change so there will be gaps in the data.</p>
<p>Additionally, changes to search engine URL formats can cause problems with our data collection. See <a href="https://sql.telemetry.mozilla.org/queries/47631/source#128887">this query</a> for a notable example.</p>
<h2 id="limited-historical-data"><a class="header" href="#limited-historical-data">Limited historical data</a></h2>
<p>The <a href="https://github.com/mozilla/followonsearch"><code>followonsearch</code> addon</a> was first deployed in 2017-06. There is no <code>tagged-*</code> search data available before this.</p>
<p><code>default_private_search_engine</code> is only available starting 2019-11-19.</p>
<h2 id="adblocker-addon"><a class="header" href="#adblocker-addon"><code>AdBlocker</code> Addon</a></h2>
<p>When speaking of search metrics, people sometimes have <code>adblocker</code> addons in mind. This type of information is available in <code>telemetry.addons_daily</code> and <code>telemetry.clients_daily</code>. Two example queries are linked below; for more details, please see the documentation for these two tables.</p>
<ol>
<li><a href="https://sql.telemetry.mozilla.org/queries/84938/source"><code>AdBlockers</code> DAU, and ad click to sap ratio in last 7 days</a></li>
<li><a href="https://sql.telemetry.mozilla.org/queries/84939/source">Popular <code>AdBlockers</code> average DAU in last 7 days</a></li>
</ol>
<h1 id="address-bar--search-overviews"><a class="header" href="#address-bar--search-overviews">Address Bar &amp; Search Overviews</a></h1>
<ul>
<li>Address Bar Overview by Marco Bonardo [<a href="https://mozilla.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=3cfa519d-d8cc-4b9d-a432-adff012b7bb9">Presentation</a> / <a href="https://docs.google.com/presentation/d/1Li7uBp8HJ2trTLkj8Qx_bt7nIGQyU-QyniTCo9VSesY/edit#slide=id.g82d2da351e_5_3617">Slides</a>] - an overview of Firefox's address bar (aka: the Awesomebar)</li>
<li>Address Bar Results Ranking by Marco Bonardo [<a href="https://docs.google.com/presentation/d/1r3Y70Qhpdp5Cd51hdIiX9W2AE9Tq-W8CXfumyFaNdW4/edit#slide=id.g82d2da351e_5_3617">Slides</a>] - deep-dive on how results are ranked and displayed in Firefox's address bar</li>
<li>Search Engines Overview by Mark Banner [<a href="https://mozilla.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=c0dd7221-a31f-449c-a874-adfd012609de">Presentation</a> / <a href="https://docs.google.com/presentation/d/1ibE04t8dm1ZpxJVRpVDupPnG_UOEhgEWaePS2C5BHQY/edit#slide=id.g832b271044_1_1173">Slides</a>] - an overview on how search engines are included in Firefox</li>
<li>Search Engine Configuration by Mark Banner [<a href="https://mozilla.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=774320a1-cd71-49a4-bf36-ae210156dcd5">Presentation</a> / <a href="https://docs.google.com/presentation/d/1Jg7ct3G7IU7iqunuByOyLvIXwV8nYCOqyIPENTOpW9Y/edit#slide=id.g832b271044_1_1173">Slides</a>] - deep-dive on how search engines are configured and deployed in Firefox</li>
</ul>
<div style="break-before: page; page-break-before: always;"></div><h1 id="search-aggregates"><a class="header" href="#search-aggregates">Search Aggregates</a></h1>
<ul>
<li><a href="datasets/search/search_aggregates/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/search/search_aggregates/reference.html#contents">Contents</a></li>
<li><a href="datasets/search/search_aggregates/reference.html#gotcha">Gotcha</a></li>
</ul>
</li>
<li><a href="datasets/search/search_aggregates/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/search/search_aggregates/reference.html#example-queries">Example Queries</a>
<ul>
<li><a href="datasets/search/search_aggregates/reference.html#daily-us-sap-searches">Daily US sap searches</a></li>
</ul>
</li>
<li><a href="datasets/search/search_aggregates/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/search/search_aggregates/reference.html#schema">Schema</a></li>
</ul>
</li>
<li><a href="datasets/search/search_aggregates/reference.html#code-reference">Code Reference</a></li>
</ul>
<h1 id="introduction-13"><a class="header" href="#introduction-13">Introduction</a></h1>
<p><code>search_aggregates</code> is designed to power high level search dashboards.
It's quick and easy to query, but the data are coarse.
In particular, this dataset allows you to segment
by a limited number of client characteristics which are relevant to search markets.
However, it is not possible to normalize by client count.
If you need fine-grained data, consider using <code>search_clients_engines_sources_daily</code>
which breaks down search counts by client, engine, and source.</p>
<h2 id="contents-3"><a class="header" href="#contents-3">Contents</a></h2>
<p>Each row of <code>search_aggregates</code> contains
the standard search count aggregations
for each unique combination of the following columns.
Unless otherwise noted, these columns are taken directly from <code>main_summary</code>.</p>
<ul>
<li><code>submission_date</code> - <code>yyyymmdd</code></li>
<li><code>engine</code> - e.g. <code>google</code>, <code>bing</code>, <code>yahoo</code></li>
<li><code>source</code> - The UI component used to issue a search - e.g. <code>urlbar</code>, <code>abouthome</code></li>
<li><code>country</code></li>
<li><code>locale</code></li>
<li><code>addon_version</code> - The installed version of the <a href="https://github.com/mozilla/followonsearch"><code>followonsearch</code> addon</a> (before version 61)</li>
<li><code>app_version</code></li>
<li><code>distribution_id</code> - <code>NULL</code> means the standard Firefox build</li>
<li><code>search_cohort</code> - <code>NULL</code> except for small segments relating to search experimentation</li>
<li><code>default_search_engine</code></li>
<li><code>default_private_search_engine</code></li>
<li><code>os</code> - e.g. <code>Linux</code>, <code>Windows_NT</code>, <code>Darwin</code> ...</li>
<li><code>os_version</code></li>
<li><code>is_default_browser</code></li>
</ul>
<p>There are ten aggregation columns:
<code>sap</code>, <code>tagged-sap</code>, <code>tagged-follow-on</code>, <code>organic</code>, <code>unknown</code>, <code>ad_click</code>, <code>ad_click_organic</code>, <code>search_with_ads</code>, <code>search_with_ads_organic</code>, and <code>client_count</code>.
Each of these columns represents a different type of search.
For more details, see the <a href="datasets/search/search_aggregates/../../search.html">search data documentation</a>.</p>
<!--
#### Further Reading
-->
<h2 id="gotcha"><a class="header" href="#gotcha">Gotcha</a></h2>
<p>Although the <code>search_aggregates</code> table is created on top of <code>search_clients_engines_sources_daily</code>, you should not expect the total search metrics reported by the two to match exactly. If you notice that the totals reported in <code>search_aggregates</code> are higher than in <code>search_clients_engines_sources_daily</code>, it is most likely due to <a href="https://mana.mozilla.org/wiki/display/DATA/Shredder">Shredder</a>. The <code>search_aggregates</code> table is aggregated beyond the client level, so Shredder does not have to touch it. But <code>search_clients_engines_sources_daily</code> contains <code>client_id</code> and is subject to Shredder: it is expected to lose up to 1% of rows every month as we respond to clients' deletion requests, which reduces counts in <code>search_clients_engines_sources_daily</code> but not in <code>search_aggregates</code>. An example query showing such a difference can be found in <a href="https://sql.telemetry.mozilla.org/queries/84302/source"><code>STMO#84302</code></a>.</p>
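<p>A minimal sketch of such a comparison for a single (arbitrary) day is shown below; the linked query above is more complete:</p>
<pre><code class="language-sql">WITH agg AS (
  SELECT SUM(sap) AS sap_from_aggregates
  FROM search.search_aggregates
  WHERE submission_date = '2022-01-01'
),
clients AS (
  SELECT SUM(sap) AS sap_from_clients
  FROM search.search_clients_engines_sources_daily
  WHERE submission_date = '2022-01-01'
)
SELECT
  sap_from_aggregates,
  sap_from_clients,
  -- difference attributable mostly to shredder deletions
  sap_from_aggregates - sap_from_clients AS difference
FROM agg CROSS JOIN clients
</code></pre>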
<h1 id="data-reference-12"><a class="header" href="#data-reference-12">Data Reference</a></h1>
<h2 id="example-queries-7"><a class="header" href="#example-queries-7">Example Queries</a></h2>
<h3 id="daily-us-sap-searches"><a class="header" href="#daily-us-sap-searches">Daily US sap searches</a></h3>
<pre><code class="language-sql">SELECT
submission_date,
SUM(SAP) AS search_counts
FROM search.search_aggregates
WHERE
country = 'US'
AND submission_date BETWEEN '2019-01-01' AND '2019-01-07'
GROUP BY submission_date
ORDER BY submission_date
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/51140/source"><code>STMO#51140</code></a></p>
<h2 id="scheduling-16"><a class="header" href="#scheduling-16">Scheduling</a></h2>
<p>This job is
<a href="https://github.com/mozilla/bigquery-etl/blob/ad84a15d580333b41d36cfe8331e51238f3bafa1/dags/bqetl_search.py#L40">scheduled on airflow</a>
to run daily.</p>
<h2 id="schema-8"><a class="header" href="#schema-8">Schema</a></h2>
<p>As of 2021-04-29,
the current version of <code>search_aggregates</code> is <code>v8</code>,
and has a schema as follows.
The dataset is backfilled through 2016-03-11.</p>
<pre><code>root
|-- submission_date: date (nullable = true)
|-- submission_date_s3: date (nullable = true)
|-- country: string (nullable = true)
|-- engine: string (nullable = true)
|-- normalized_engine: string (nullable = true)
|-- source: string (nullable = true)
|-- app_version: string (nullable = true)
|-- distribution_id: string (nullable = true)
|-- locale: string (nullable = true)
|-- search_cohort: string (nullable = true)
|-- addon_version: string (nullable = true)
|-- tagged_sap: long (nullable = true)
|-- tagged_follow_on: long (nullable = true)
|-- sap: long (nullable = true)
|-- organic: long (nullable = true)
|-- search_with_ads: long (nullable = true)
|-- search_with_ads_organic: long (nullable = true)
|-- ad_click: long (nullable = true)
|-- ad_click_organic: long (nullable = true)
|-- unknown: long (nullable = true)
|-- client_count: long (nullable = true)
|-- default_search_engine: string (nullable = true)
|-- default_private_search_engine: string (nullable = true)
|-- os: string (nullable = true)
|-- os_version: string (nullable = true)
|-- is_default_browser: boolean (nullable = true)
</code></pre>
<h1 id="code-reference-13"><a class="header" href="#code-reference-13">Code Reference</a></h1>
<p>The <code>search_aggregates</code> job is
<a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/search_derived/search_aggregates_v8/query.sql">defined in <code>bigquery-etl</code></a></p>
<div style="break-before: page; page-break-before: always;"></div><h1 id="search-clients-engines-sources-daily"><a class="header" href="#search-clients-engines-sources-daily">Search Clients Engines Sources Daily</a></h1>
<ul>
<li><a href="datasets/search/search_clients_engines_sources_daily/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/search/search_clients_engines_sources_daily/reference.html#contents">Contents</a></li>
<li><a href="datasets/search/search_clients_engines_sources_daily/reference.html#background-and-caveats">Background and Caveats</a></li>
</ul>
</li>
<li><a href="datasets/search/search_clients_engines_sources_daily/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/search/search_clients_engines_sources_daily/reference.html#example-queries">Example Queries</a></li>
<li><a href="datasets/search/search_clients_engines_sources_daily/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/search/search_clients_engines_sources_daily/reference.html#schema">Schema</a></li>
</ul>
</li>
<li><a href="datasets/search/search_clients_engines_sources_daily/reference.html#code-reference">Code Reference</a></li>
</ul>
<h1 id="introduction-14"><a class="header" href="#introduction-14">Introduction</a></h1>
<p><code>search_clients_engines_sources_daily</code> is designed to enable client-level search analyses.
Querying this dataset can be slow;
consider using <code>search_aggregates</code> for coarse analyses.</p>
<h2 id="contents-4"><a class="header" href="#contents-4">Contents</a></h2>
<p><code>search_clients_engines_sources_daily</code> has one row for each unique combination of:
(<code>client_id</code>, <code>submission_date</code>, <code>engine</code>, <code>source</code>).</p>
<p>In addition to the standard search count aggregations,
this dataset includes some descriptive data for each client.
For example, we include <code>country</code> and <code>channel</code> for each row of data.
In the event that a client sends multiple pings on a given <code>submission_date</code>
we choose an arbitrary value from the pings for that (<code>client_id</code>, <code>submission_date</code>),
unless otherwise noted.</p>
<p>There were originally five standard search count aggregation columns:
<code>sap</code>, <code>tagged-sap</code>, <code>tagged-follow-on</code>, <code>organic</code>, and <code>unknown</code>. Over time, more search count aggregation columns were added, including <code>ad_click</code> and <code>search_with_ads</code> in late 2018 (<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1505411">bug</a>), and <code>ad_click_organic</code> and <code>search_with_ads_organic</code> in late 2021 (<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1664849">bug</a>).</p>
<p>Note that, if there were no such searches in a row's segment
(i.e. the count would be 0),
the column value is <code>null</code>.
Each of these columns represents a different type of search.
For more details, see the <a href="datasets/search/search_clients_engines_sources_daily/../../search.html">search data documentation</a>.</p>
<h2 id="background-and-caveats-3"><a class="header" href="#background-and-caveats-3">Background and Caveats</a></h2>
<p><code>search_clients_engines_sources_daily</code> does not include
(<code>client_id</code>, <code>submission_date</code>) pairs
if we did not receive a ping for that <code>submission_date</code>.</p>
<p>We impute a <code>NULL</code> <code>engine</code> and <code>source</code> for pings with no search counts.
This ensures users who never search are included in this dataset.</p>
<p>This dataset is large.
If you're querying this dataset from STMO,
heavily limit the data you read using <code>submission_date</code> or <code>sample_id</code>.</p>
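<p>For example, a minimal sketch that limits the scan to a single day and a single <code>sample_id</code> (each <code>sample_id</code> covers roughly a 1% sample; the values here are arbitrary):</p>
<pre><code class="language-sql">SELECT
  client_id,
  engine,
  source,
  SUM(sap) AS sap_searches
FROM search.search_clients_engines_sources_daily
WHERE
  submission_date = '2022-01-01'
  AND sample_id = 42
GROUP BY
  client_id,
  engine,
  source
</code></pre>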
<p>The <code>has_adblocker_addon</code> field is True if the client had an active addon that blocks Mozilla's ability to monetize the searches via a search engine partnership. The logic for identifying ad-blocking addons is <a href="https://github.com/mozilla/search-adhoc-analysis/blob/master/monetization-blocking-addons/Monetization%20blocking%20addons.ipynb">here</a> (private notebook).</p>
<!--
#### Further Reading
-->
<h1 id="data-reference-13"><a class="header" href="#data-reference-13">Data Reference</a></h1>
<h2 id="example-queries-8"><a class="header" href="#example-queries-8">Example Queries</a></h2>
<p><a href="https://sql.telemetry.mozilla.org/queries/51141/source"><code>STMO#51141</code></a>
calculates searches per <code>normalized_channel</code> for US clients on an arbitrary day.
If you have trouble viewing this query,
it's likely you don't have the proper permissions.
For more details see the <a href="datasets/search/search_clients_engines_sources_daily/../../search.html">search data documentation</a>.</p>
<h2 id="scheduling-17"><a class="header" href="#scheduling-17">Scheduling</a></h2>
<p>This dataset is scheduled on Airflow
(<a href="https://github.com/mozilla/bigquery-etl/blob/ad84a15d580333b41d36cfe8331e51238f3bafa1/dags/bqetl_search.py#L64">source</a>).</p>
<h2 id="schema-9"><a class="header" href="#schema-9">Schema</a></h2>
<p>As of 2022-03-25, the current version of the underlying <code>search_clients_daily</code> is <code>v8</code>,
with schema as follows.
Generally, see <a href="https://console.cloud.google.com/datacatalog?project=mozdata&amp;qSystems=BIGQUERY">Data Catalog in GCP</a> for the most up-to-date schema.
It's backfilled through 2016-03-12. </p>
<pre><code>root
|-- client_id: string (nullable = true)
|-- submission_date: date (nullable = true)
|-- submission_date_s3: date (nullable = true)
|-- engine: string (nullable = true)
|-- source: string (nullable = true)
|-- country: string (nullable = true)
|-- app_version: string (nullable = true)
|-- distribution_id: string (nullable = true)
|-- locale: string (nullable = true)
|-- search_cohort: string (nullable = true)
|-- addon_version: string (nullable = true)
|-- os: string (nullable = true)
|-- os_version: string (nullable = true)
|-- channel: string (nullable = true)
|-- profile_creation_date: long (nullable = true)
|-- default_search_engine: string (nullable = true)
|-- default_search_engine_data_load_path: string (nullable = true)
|-- default_search_engine_data_submission_url: string (nullable = true)
|-- default_private_search_engine: string (nullable = true)
|-- default_private_search_engine_data_load_path: string (nullable = true)
|-- default_private_search_engine_data_submission_url: string (nullable = true)
|-- sample_id: long (nullable = true)
|-- sessions_started_on_this_day: long (nullable = true)
|-- profile_age_in_days: integer (nullable = true)
|-- subsession_hours_sum: double (nullable = true)
|-- active_addons_count_mean: double (nullable = true)
|-- max_concurrent_tab_count_max: integer (nullable = true)
|-- tab_open_event_count_sum: long (nullable = true)
|-- active_hours_sum: double (nullable = true)
|-- total_uri_count: long (nullable = true)
|-- tagged_sap: long (nullable = true)
|-- tagged_follow_on: long (nullable = true)
|-- sap: long (nullable = true)
|-- organic: long (nullable = true)
|-- search_with_ads: long (nullable = true)
|-- search_with_ads_organic: long (nullable = true)
|-- ad_click: long (nullable = true)
|-- ad_click_organic: long (nullable = true)
|-- unknown: long (nullable = true)
|-- normalized_engine: string (nullable = true)
|-- user_pref_browser_search_region: string (nullable = true)
|-- is_default_browser: boolean (nullable = true)
|-- experiments: map (nullable = true)
| |-- key: string
| |-- value: string
|-- scalar_parent_urlbar_searchmode_bookmarkmenu_sum: map (nullable = true)
| |-- key: string
| |-- value: int64
|-- scalar_parent_urlbar_searchmode_handoff_sum: map (nullable = true)
| |-- key: string
| |-- value: int64
|-- scalar_parent_urlbar_searchmode_keywordoffer_sum: map (nullable = true)
| |-- key: string
| |-- value: int64
|-- scalar_parent_urlbar_searchmode_oneoff_sum: map (nullable = true)
| |-- key: string
| |-- value: int64
|-- scalar_parent_urlbar_searchmode_other_sum: map (nullable = true)
| |-- key: string
| |-- value: int64
|-- scalar_parent_urlbar_searchmode_shortcut_sum: map (nullable = true)
| |-- key: string
| |-- value: int64
|-- scalar_parent_urlbar_searchmode_tabmenu_sum: map (nullable = true)
| |-- key: string
| |-- value: int64
|-- scalar_parent_urlbar_searchmode_tabtosearch_sum: map (nullable = true)
| |-- key: string
| |-- value: int64
|-- scalar_parent_urlbar_searchmode_tabtosearch_onboard_sum: map (nullable = true)
| |-- key: string
| |-- value: int64
|-- scalar_parent_urlbar_searchmode_topsites_newtab_sum: map (nullable = true)
| |-- key: string
| |-- value: int64
|-- scalar_parent_urlbar_searchmode_topsites_urlbar_sum: map (nullable = true)
| |-- key: string
| |-- value: int64
|-- scalar_parent_urlbar_searchmode_touchbar_sum: map (nullable = true)
| |-- key: string
| |-- value: int64
|-- scalar_parent_urlbar_searchmode_typed_sum: map (nullable = true)
| |-- key: string
| |-- value: int64
</code></pre>
<h1 id="code-reference-14"><a class="header" href="#code-reference-14">Code Reference</a></h1>
<p>The <code>search_clients_engines_sources_daily</code> job is
<a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/search_derived/search_clients_daily_v8/query.sql">defined in <code>bigquery-etl</code></a></p>
<div style="break-before: page; page-break-before: always;"></div><h1 id="search-clients-last-seen"><a class="header" href="#search-clients-last-seen">Search Clients Last Seen</a></h1>
<ul>
<li><a href="datasets/search/search_clients_last_seen/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/search/search_clients_last_seen/reference.html#contents">Contents</a></li>
<li><a href="datasets/search/search_clients_last_seen/reference.html#utilizing-byte-columns">Utilizing BYTE columns</a></li>
<li><a href="datasets/search/search_clients_last_seen/reference.html#engine-searches">Engine Searches</a></li>
<li><a href="datasets/search/search_clients_last_seen/reference.html#background-and-caveats">Background and Caveats</a></li>
<li><a href="datasets/search/search_clients_last_seen/reference.html#accessing-the-data">Accessing the Data</a></li>
</ul>
</li>
<li><a href="datasets/search/search_clients_last_seen/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/search/search_clients_last_seen/reference.html#example-queries">Example Queries</a></li>
<li><a href="datasets/search/search_clients_last_seen/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/search/search_clients_last_seen/reference.html#schema">Schema</a></li>
</ul>
</li>
<li><a href="datasets/search/search_clients_last_seen/reference.html#code-reference">Code Reference</a></li>
</ul>
<h1 id="introduction-15"><a class="header" href="#introduction-15">Introduction</a></h1>
<p><code>search_clients_last_seen</code> is designed for individual search analysis for clients over the past year.
This can be useful when wondering about client search behavior over the past year.</p>
<p><strong>NOTE</strong>: <code>search_clients_last_seen</code> is currently a 1% sample, and the first day with a full year of
search activity is 2020-01-01.</p>
<h3 id="contents-5"><a class="header" href="#contents-5">Contents</a></h3>
<p><code>search_clients_last_seen</code> has just one row per client, for any client
who was active over the past year. Here we define active as &quot;sent a main
ping&quot;, so a client being present does not mean they searched.</p>
<p><strong>NOTE</strong>: Always choose a single <code>submission_date</code> when querying <code>search_clients_last_seen</code>.</p>
<p>The key pieces of this dataset are byte arrays that contain
daily information about client activity over the past year.
We have these for a variety of activity types:</p>
<ul>
<li><code>days_seen_bytes</code>: Days when we received a main ping from the client</li>
<li><code>days_searched_bytes</code>: Days that the client searched in any form</li>
<li><code>days_tagged_searched_bytes</code>: Days that the client performed a tagged search</li>
<li><code>days_searched_with_ads_bytes</code>: Days that the client performed a search that contained ads</li>
<li><code>days_clicked_ads_bytes</code>: Days that the client clicked an ad post-search</li>
</ul>
<p>See &quot;Utilizing BYTE columns&quot; below for how to use these fields.</p>
<p>There are some convenience columns around these that give the number of days
since the client was last active for that usage criterion:</p>
<ul>
<li><code>days_since_seen</code></li>
<li><code>days_since_searched</code></li>
<li><code>days_since_tagged_searched</code></li>
<li><code>days_since_searched_with_ads</code></li>
<li><code>days_since_clicked_ad</code></li>
<li><code>days_since_created_profile</code></li>
</ul>
<p>We also include a variety of dimension information (e.g. os,
country, channel, default_search) to aggregate on. The
<a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/search_derived/search_clients_last_seen_v1/query.sql#L37">query itself</a>
lays out all of the available dimensional fields.</p>
<p>There are, finally, a few fields with daily activity data.
These include <code>active_hours_sum</code>, <code>organic</code> for organic searches,
and <code>total_searches</code>. Please note that these are just for the current day,
and not over the entire year of history contained in the <code>days_*_bytes</code> columns.</p>
<h4 id="utilizing-byte-columns"><a class="header" href="#utilizing-byte-columns">Utilizing BYTE columns</a></h4>
<p>These are stored as the BigQuery <code>BYTES</code> type, so they can be a bit confusing
to use. We have a few convenience functions for working with them. For these functions,
anytime we say &quot;was active&quot;, we mean &quot;within the usage criteria defined by that
column&quot;; for example, it could be days that clients searched with ads:</p>
<ul>
<li><code>udf.bits_to_days_seen</code> - The number of days the user was active.</li>
<li><code>udf.bits_to_days_since_seen</code> - The number of days since the user was last active.</li>
<li><code>udf.bits_to_days_since_first_seen</code> - The number of days since the user's <em>first</em>
active day. Note that this will be at most 365 days, since that is the beginning
of history for this dataset.</li>
<li><code>udf.bits_to_active_n_weeks_ago</code> - Returns whether or not the user was active n weeks
ago for the given activity type.</li>
</ul>
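<p>For example, a sketch combining these functions with the <code>days_*_bytes</code> columns (the date is arbitrary, and the unqualified <code>udf</code> dataset assumes the default <code>moz-fx-data-shared-prod</code> project):</p>
<pre><code class="language-sql">SELECT
  -- share of clients whose most recent search was within the last 7 days
  COUNTIF(udf.bits_to_days_since_seen(days_searched_bytes) &lt; 7)
    / COUNT(*) AS share_searched_in_last_week,
  -- average number of days with an ad click over the past year
  AVG(udf.bits_to_days_seen(days_clicked_ads_bytes)) AS avg_ad_click_days_past_year
FROM
  search.search_clients_last_seen
WHERE
  submission_date = '2020-04-01'
</code></pre>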
<h4 id="engine-searches"><a class="header" href="#engine-searches">Engine Searches</a></h4>
<p>Warning: This column was designed specifically for use with the revenue data, and probably isn't good for other kinds of analysis.</p>
<p>For each search engine, we store an array that contains the number of searches the user
completed each month for the past 12 months. These are calendar months, so the periods cover
different numbers of days and are not directly comparable.</p>
<p>We have the same data for tagged searches, search with ads, and ad clicks.</p>
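<p>A sketch of unnesting these per-engine arrays follows; the engine key shown and the sample filter are illustrative assumptions:</p>
<pre><code class="language-sql">SELECT
  client_id,
  es.key AS engine,
  es.value.total_searches AS monthly_total_searches,  -- array of monthly counts
  es.value.ad_click AS monthly_ad_clicks
FROM
  search.search_clients_last_seen,
  UNNEST(engine_searches) AS es
WHERE
  submission_date = '2020-04-01'
  AND sample_id = 42
  AND es.key = 'google'
LIMIT 10
</code></pre>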
<h4 id="background-and-caveats-4"><a class="header" href="#background-and-caveats-4">Background and Caveats</a></h4>
<p><code>search_clients_last_seen</code> does include
(<code>client_id</code>, <code>submission_date</code>) pairs
even if we did not receive a ping for that <code>submission_date</code>.
Any client who was active over the past year will be included.</p>
<h4 id="accessing-the-data-7"><a class="header" href="#accessing-the-data-7">Accessing the Data</a></h4>
<p>Access the data at <code>search.search_clients_last_seen</code>.</p>
<!--
#### Further Reading
-->
<h1 id="data-reference-14"><a class="header" href="#data-reference-14">Data Reference</a></h1>
<h2 id="example-queries-9"><a class="header" href="#example-queries-9">Example Queries</a></h2>
<p><a href="https://sql.telemetry.mozilla.org/queries/70349/source#177176"><code>STMO#70349</code></a>
gives the WoW retention of users in different segments. Similar to GUD, a user is
considered retained if they do they same activity the next week. Note here the
outage from Armagaddon, and the outage for ad click telemetry in October.</p>
<p><a href="https://sql.telemetry.mozilla.org/queries/70348/source#177160"><code>STMO#70348</code></a>
shows the amount of different search activity taken by clients. We can use it
to determine the % of clients who partake in each activity, regardless of their
baseline amount of activity.</p>
<h2 id="scheduling-18"><a class="header" href="#scheduling-18">Scheduling</a></h2>
<p>This dataset is scheduled on Airflow
(<a href="https://github.com/mozilla/bigquery-etl/blob/ad84a15d580333b41d36cfe8331e51238f3bafa1/dags/bqetl_search.py#L52">source</a>).</p>
<h2 id="schema-10"><a class="header" href="#schema-10">Schema</a></h2>
<p>As of 2020-04-22, the current version of <code>search_clients_last_seen</code> is <code>v1</code>,
and has a schema as follows.
It's backfilled through 2020-01-01.</p>
<pre><code>root
|- submission_date: date
|- client_id: string
|- sample_id: integer
|- country: string
|- app_version: string
|- distribution_id: string
|- locale: string
|- search_cohort: string
|- addon_version: string
|- os: string
|- channel: string
|- profile_creation_date: integer
|- default_search_engine: string
|- default_search_engine_data_load_path: string
|- default_search_engine_data_submission_url: string
|- profile_age_in_days: integer
|- active_addons_count_mean: float
|- user_pref_browser_search_region: string
|- os_version: string
|- max_concurrent_tab_count_max: integer
|- tab_open_event_count_sum: integer
|- active_hours_sum: float
|- subsession_hours_sum: float
|- sessions_started_on_this_day: integer
|- organic: integer
|- sap: integer
|- unknown: integer
|- tagged_sap: integer
|- tagged_follow_on: integer
|- ad_click: integer
|- search_with_ads: integer
|- total_searches: integer
|- tagged_searches: integer
+- engine_searches: record (repeated)
| |- key: string
| +- value: record
| | |- total_searches: integer (repeated)
| | |- tagged_searches: integer (repeated)
| | |- search_with_ads: integer (repeated)
| | |- ad_click: integer (repeated)
|- days_seen_bytes: bytes
|- days_searched_bytes: bytes
|- days_tagged_searched_bytes: bytes
|- days_searched_with_ads_bytes: bytes
|- days_clicked_ads_bytes: bytes
|- days_created_profile_bytes: bytes
</code></pre>
<h1 id="code-reference-15"><a class="header" href="#code-reference-15">Code Reference</a></h1>
<p>The <code>search_clients_last_seen</code> job is
<a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/search_derived/search_clients_last_seen_v1/query.sql">defined in <code>bigquery-etl</code></a></p>
<div style="break-before: page; page-break-before: always;"></div><h1 id="normalized-client-lifetime-value-ltv"><a class="header" href="#normalized-client-lifetime-value-ltv">Normalized Client Lifetime Value (LTV)</a></h1>
<ul>
<li><a href="datasets/search/client_ltv/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/search/client_ltv/reference.html#contents">Contents</a></li>
<li><a href="datasets/search/client_ltv/reference.html#ltv">LTV</a></li>
<li><a href="datasets/search/client_ltv/reference.html#background-and-caveats">Background and Caveats</a></li>
</ul>
</li>
<li><a href="datasets/search/client_ltv/reference.html#example-queries">Example Queries</a></li>
<li><a href="datasets/search/client_ltv/reference.html#schema">Schema</a></li>
<li><a href="datasets/search/client_ltv/reference.html#code-references">Code References</a></li>
<li><a href="datasets/search/client_ltv/reference.html#model-performance">Model Performance</a></li>
</ul>
<h1 id="introduction-16"><a class="header" href="#introduction-16">Introduction</a></h1>
<p><code>client_ltv</code> is designed to enable relative user value estimates based on their past and expected search and ad click behavior. This behavior is revenue-generating for Firefox.</p>
<h2 id="contents-6"><a class="header" href="#contents-6">Contents</a></h2>
<p><code>client_ltv</code> has one row for each (<code>client_id</code>, <code>engine</code>, <code>submission_date</code>) triplet.</p>
<p>Each row captures a year's worth of history for the <code>client_id</code> on the given <code>engine</code>, so the values will not change much when looking at <code>submission_date</code> in 1-day (or even 1-month) sequences, since there is significant overlap. For <strong>most</strong> analyses, using yesterday's <code>submission_date</code> will be sufficient. To get users active in, for example, the last 7 days, a join with <code>clients_last_seen</code> is required. We plan to propagate the necessary fields into <code>client_ltv</code> in the future so such a join isn't necessary.</p>
<p>Using yesterday's date, a client's row will contain the total number of searches, ad clicks, etc. for the last 365 days, along with active search and ad click days (how many days a user searched or clicked an ad). Additionally, each row contains the predicted number of active search/ad click days for the <em>next</em> 365 days. See the schema at the bottom of this page for a full list of the fields.</p>
<h2 id="ltv"><a class="header" href="#ltv">LTV</a></h2>
<p>A client's &quot;lifetime&quot; is maxed out at 2 years given the structure of this dataset. Of course a client can exist for longer, but one year on either side of the date in question controls for seasonal trends and lets us easily produce annual estimates for, say, user acquisition ROI.</p>
<p>The procedure for calculating a user's LTV is as follows:</p>
<ul>
<li>Step 1: Determine the ad click value for the user's region/engine
<ul>
<li>(<em>Revenue in Country C for Engine E</em>) / (<em>Total Ad Clicks in Country C for Engine E</em>)</li>
</ul>
</li>
<li>Step 2: Determine the user's ad clicks per day for the past 365 days
<ul>
<li>(<em>Total Ad Clicks for User</em>) / (<em>Total Active Ad Click Days for User</em>)</li>
</ul>
</li>
<li>Step 3: Calculate a user's past LTV by multiplying the following:
<ul>
<li><em>Total Active Ad Click Days for User</em></li>
<li><em>Ad Clicks per Day for User</em> (derived in step 2)</li>
<li><em>Ad Click Value in Country C for Engine E</em> (derived from step 1)</li>
</ul>
</li>
<li>Step 4: Calculate a user's future LTV by multiplying the following:
<ul>
<li><em>Total <strong>Predicted</strong> Active Ad Click Days for User</em></li>
<li><em>Ad Clicks per Day for User</em> (derived in step 2)</li>
<li><em>Ad Click Value in Country C for Engine E</em> (derived from step 1)</li>
</ul>
</li>
<li>Step 5: Normalize the LTV values from (3) and (4)
<ul>
<li>(<em>User past LTV</em>) / (<em>Sum of all past user LTVs</em>)</li>
<li>(<em>User future LTV</em>) / (<em>Sum of all future user LTVs</em>)</li>
</ul>
</li>
</ul>
<p>The normalized LTV for a user can roughly be interpreted as a user's <strong>contribution</strong> to the collective value of our user-base. Note that the above procedure omits some outlier handling steps for simplicity.</p>
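<p>To make the arithmetic concrete, here is a small worked sketch of steps 1 through 4 with invented numbers (these are not real revenue figures):</p>
<pre><code class="language-sql">-- All inputs are hypothetical, for illustration only.
WITH inputs AS (
  SELECT
    1000000 / 5000000 AS ad_click_value,  -- step 1: revenue / total ad clicks for country+engine
    200 / 50 AS user_ad_clicks_per_day,   -- step 2: user's ad clicks / active ad-click days
    50 AS past_ad_click_days,             -- observed active ad-click days (past 365 days)
    40 AS pred_ad_click_days              -- predicted active ad-click days (next 365 days)
)
SELECT
  past_ad_click_days * user_ad_clicks_per_day * ad_click_value AS past_ltv,   -- step 3
  pred_ad_click_days * user_ad_clicks_per_day * ad_click_value AS future_ltv  -- step 4
FROM inputs
-- Step 5 would then divide each value by the sum of that value over all users.
</code></pre>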
<h2 id="background-and-caveats-5"><a class="header" href="#background-and-caveats-5">Background and Caveats</a></h2>
<p>The <code>normalized_ltv_ad_clicks_current</code> field, for example, does <strong>not</strong> represent a user's contribution to revenue directly. It should be treated as a rough proxy. It is not appropriate to multiply revenue by this number.</p>
<p>LTV is broken down by engine, so the LTV for a user who searches on multiple engines must be interpreted in context. <strong>LTV is only available for Google and Bing on Firefox Desktop</strong> at this time.</p>
<p>We <strong>do</strong> have the ability to calculate a dollar value per user, however the (unnormalized) table is restricted to those with proper revenue access. For more information, see <a href="datasets/search/client_ltv/../../../concepts/getting_help.html">Getting Help</a>.</p>
<h1 id="example-queries-10"><a class="header" href="#example-queries-10">Example Queries</a></h1>
<p><em>Percent of users we predict will click an ad in the <strong>next</strong> 365 days by Engine.</em> (<a href="https://sql.telemetry.mozilla.org/queries/74878/source"><code>STMO#74878</code></a>)</p>
<pre><code class="language-sql">SELECT
engine,
AVG(IF(pred_num_days_seeing_ads &gt; 0, 1, 0)) as pct_predicted_ad_viewers_next_year,
AVG(IF(pred_num_days_clicking_ads &gt; 0, 1, 0)) as pct_predicted_ad_clickers_next_year,
FROM
`moz-fx-data-shared-prod`.revenue.client_ltv
WHERE
submission_date = DATE_SUB(CURRENT_DATE, INTERVAL 1 DAY)
GROUP BY
1
</code></pre>
<p><em>LTV Value of Users Over Lifetime (by <code>days_since_created_profile</code>) of Users Active in Past 7 Days</em> (<a href="https://sql.telemetry.mozilla.org/queries/74867/source#187036"><code>STMO#187036</code></a>)</p>
<pre><code class="language-sql">SELECT
days_since_created_profile,
SUM(normalized_ltv_ad_clicks_current) AS sum_normalized_ltv_ad_clicks_current,
SUM(normalized_ltv_ad_clicks_future) AS normalized_ltv_ad_clicks_future,
SUM(normalized_ltv_ad_clicks_future) / COUNT(*) AS avg_normalized_ltv_ad_clicks_future,
FROM
`moz-fx-data-shared-prod`.revenue.client_ltv
JOIN
`moz-fx-data-shared-prod`.search.search_clients_last_seen
USING(submission_date, client_id)
WHERE
submission_date = '2020-09-16'
AND days_since_created_profile &lt;= 365
AND days_since_seen &lt;= 6
GROUP BY
days_since_created_profile
ORDER BY
days_since_created_profile DESC
</code></pre>
<h1 id="schema-11"><a class="header" href="#schema-11">Schema</a></h1>
<p>As of 2020-09-16,
the current version of <code>client_ltv</code> is <code>v1</code>,
and has a schema as follows.
The dataset is backfilled through 2020-09-14.</p>
<pre><code>root
|-- submission_date: date (nullable = true)
|-- engine: string (nullable = true)
|-- country: string (nullable = true)
|-- client_id: string (nullable = true)
|-- total_client_searches_past_year: long (nullable = true)
|-- total_client_tagged_searches_past_year: long (nullable = true)
|-- total_client_ad_clicks_past_year: long (nullable = true)
|-- total_client_searches_with_ads_past_year: long (nullable = true)
|-- ad_click_days: long (nullable = true)
|-- search_days: long (nullable = true)
|-- search_with_ads_days: long (nullable = true)
|-- tagged_search_days: long (nullable = true)
|-- active_days: long (nullable = true)
|-- pred_num_days_clicking_ads: double (nullable = true)
|-- pred_num_days_seeing_ads: double (nullable = true)
|-- pred_num_days_searching: double (nullable = true)
|-- pred_num_days_tagged_searching: double (nullable = true)
|-- ad_clicks_per_day: double (nullable = true)
|-- searches_with_ads_per_day: double (nullable = true)
|-- searches_per_day: double (nullable = true)
|-- tagged_searches_per_day: double (nullable = true)
|-- ad_clicks_cutoff: double (nullable = true)
|-- searches_with_ads_cutoff: double (nullable = true)
|-- searches_cutoff: double (nullable = true)
|-- tagged_searches_cutoff: double (nullable = true)
|-- ad_clicks_per_day_capped: double (nullable = true)
|-- searches_with_ads_per_day_capped: double (nullable = true)
|-- searches_per_day_capped: double (nullable = true)
|-- tagged_searches_per_day_capped: double (nullable = true)
|-- total_ad_clicks: long (nullable = true)
|-- normalized_ltv_ad_clicks_current: double (nullable = true)
|-- normalized_ltv_search_with_ads_current: double (nullable = true)
|-- normalized_ltv_search_current: double (nullable = true)
|-- normalized_ltv_tagged_search_current: double (nullable = true)
|-- normalized_ltv_ad_clicks_future: double (nullable = true)
|-- normalized_ltv_search_with_ads_future: double (nullable = true)
|-- normalized_ltv_search_future: double (nullable = true)
|-- normalized_ltv_tagged_search_future: double (nullable = true)
</code></pre>
<h1 id="code-references"><a class="header" href="#code-references">Code References</a></h1>
<ul>
<li><a href="https://github.com/mozilla/telemetry-airflow/blob/master/jobs/ltv_daily.py">LTV daily model fitting</a></li>
<li>Unnormalized <a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/revenue_derived/client_ltv_v1/query.sql"><code>client_ltv</code> query</a> (restricted query access)</li>
<li><a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/revenue_derived/client_ltv_normalized/query.sql"><code>client_ltv</code> view</a> (for broad use)</li>
</ul>
<h1 id="model-performance"><a class="header" href="#model-performance">Model Performance</a></h1>
<p>There is additionally a dataset, <code>ltv_daily_model_perf</code>, that tracks the LTV model's prediction performance each day it is re-trained. For a given day, one could check the performance with the following <a href="https://sql.telemetry.mozilla.org/queries/75244/source#187873">query (<code>STMO#187873</code>)</a>:</p>
<pre><code class="language-sql">SELECT
active_days,
actual,
model
FROM
`moz-fx-data-shared-prod.analysis.ltv_daily_model_perf`
WHERE
date = '2020-09-29'
AND
metric = 'days_clicked_ads'
ORDER BY
active_days
</code></pre>
<p>This produces a histogram for the observed user frequencies and the model's predicted frequencies, allowing a chart similar to the one shown in the <a href="https://lifetimes.readthedocs.io/en/latest/Quickstart.html#assessing-model-fit">&quot;assessing model fit&quot; example</a> in the <code>lifetimes</code> documentation. This table only checks performance for clients that the model expects to have, for example, clicked an ad on 0 to 28 days in the past year, since most of the distribution is contained in that interval.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/search/client_ltv/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><p><code>mobile_search_clients_engines_sources_daily</code> is designed to enable client-level search analyses for mobile.
Querying this dataset can be slow;
consider using <code>mobile_search_aggregates</code> for coarse analyses.</p>
<h2 id="contents-7"><a class="header" href="#contents-7">Contents</a></h2>
<p><code>mobile_search_clients_engines_sources_daily</code> has one row for each unique combination of:
(<code>client_id</code>, <code>submission_date</code>, <code>engine</code>, <code>source</code>).</p>
<p>Alongside standard search metrics, this dataset includes client-specific descriptive information as well.
For example, we include <code>normalized_app_name</code> and <code>normalized_app_name_os</code> for each row of data. <code>normalized_app_name</code> modifies the raw <code>app_name</code> data to align it more consistently with KPI reporting, while <code>normalized_app_name_os</code> combines the app name and OS used by each client. Refer to the table below for the complete mapping of these two fields.</p>
<div class="table-wrapper"><table><thead><tr><th><code>app_name</code></th><th><code>os</code></th><th><code>normalized_app_name_os</code></th><th><code>normalized_app_name</code></th></tr></thead><tbody>
<tr><td><code>Fenix</code></td><td>Android</td><td>Firefox Android</td><td>Firefox</td></tr>
<tr><td><code>Fennec</code></td><td>Other</td><td>Fennec Other</td><td>Fennec</td></tr>
<tr><td><code>Fennec</code></td><td>Android</td><td>Legacy Firefox Android</td><td>Fennec</td></tr>
<tr><td><code>Fennec</code></td><td>iOS</td><td>Firefox iOS</td><td>Firefox</td></tr>
<tr><td><code>Firefox Preview</code></td><td>Android</td><td>Firefox Preview</td><td>Firefox Preview</td></tr>
<tr><td><code>FirefoxConnect</code></td><td>Android</td><td>Firefox for Echo Show</td><td>Firefox for Echo Show</td></tr>
<tr><td><code>FirefoxForFireTV</code></td><td>Android</td><td>Firefox for FireTV</td><td>Firefox for FireTV</td></tr>
<tr><td><code>Focus Android Glean</code></td><td>Android</td><td>Focus Android</td><td>Focus</td></tr>
<tr><td><code>Focus iOS Glean</code></td><td>iOS</td><td>Focus iOS</td><td>Focus</td></tr>
<tr><td><code>Klar Android Glean</code></td><td>Android</td><td>Klar Android</td><td>Klar</td></tr>
<tr><td><code>Klar iOS Glean</code></td><td>iOS</td><td>Klar iOS</td><td>Klar</td></tr>
<tr><td><code>Other</code></td><td>iOS</td><td>Other iOS</td><td>Other</td></tr>
<tr><td><code>Other</code></td><td>Other</td><td>Other</td><td>Other</td></tr>
<tr><td><code>Other</code></td><td>Android</td><td>Other Android</td><td>Other</td></tr>
<tr><td><code>Zerda</code></td><td>Android</td><td>Firefox Lite Android</td><td>Firefox Lite</td></tr>
<tr><td><code>Zerda_cn</code></td><td>Android</td><td>Firefox Lite Android (China)</td><td>Firefox Lite (China)</td></tr>
</tbody></table>
</div>
<p>Note that, if there were no such searches in a row's segment
(i.e. the count would be 0),
the column value is <code>null</code>.
Each of these columns represents a different type of search.
For more details, see the <a href="datasets/search/mobile_search_clients_sources_daily/../../search.html">search data documentation</a>.</p>
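<p>For illustration, the <code>normalized_app_name</code> mapping above can be used directly in a grouped query. A minimal sketch, assuming the dataset is exposed as <code>mozdata.search.mobile_search_clients_engines_sources_daily</code> and carries a <code>search_count</code> column (verify both names against the schema in BigQuery before relying on them):</p>
<pre><code class="language-sql">-- Sketch: daily search volume by normalized app name.
-- The table path and the search_count column are assumptions, not confirmed on this page.
SELECT
  submission_date,
  normalized_app_name,
  SUM(search_count) AS searches
FROM
  mozdata.search.mobile_search_clients_engines_sources_daily
WHERE
  submission_date = '2024-08-01'
GROUP BY
  submission_date,
  normalized_app_name
ORDER BY
  searches DESC
</code></pre>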
<h2 id="background-and-caveats-6"><a class="header" href="#background-and-caveats-6">Background and Caveats</a></h2>
<p><code>mobile_search_clients_engines_sources_daily</code> does not include
(<code>client_id</code>, <code>submission_date</code>) pairs
if we did not receive a ping for that <code>submission_date</code>.</p>
<p>We impute a <code>NULL</code> <code>engine</code> and <code>source</code> for pings with no search counts.
This ensures users who never search are included in this dataset.</p>
<p>This dataset is large.
If you're querying this dataset from STMO,
heavily limit the data you read using <code>submission_date</code> or <code>sample_id</code>.</p>
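<p>For example, a sketch that keeps the scan small by reading a single day and roughly a 1% sample (this assumes the table exposes the standard <code>sample_id</code> column with values 0-99; check the schema before relying on it):</p>
<pre><code class="language-sql">-- Sketch: count clients on one day for a ~1% sample.
SELECT
  COUNT(DISTINCT client_id) AS clients
FROM
  mozdata.search.mobile_search_clients_engines_sources_daily
WHERE
  submission_date = '2024-08-01'
  AND sample_id = 42 -- one of 100 buckets; scale estimates accordingly
</code></pre>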
<p>As of August 1, 2024, the <code>mobile_search_clients_daily</code> table has been updated to extract data from the <code>baseline</code> ping tables instead of the original <code>metrics</code> ping tables. This shift maintains the same search totals with greater confidence in the mobile search engagement dates.</p>
<p>As <a href="https://docs.telemetry.mozilla.org/concepts/analysis_gotchas.html?highlight=submission#submission-date">noted</a>, the <code>submission_date</code> used throughout telemetry is the date Mozilla received that client's engagement, not necessarily the actual date on which that client engaged with Firefox. Mobile <code>metrics</code> pings are historically sent later than the actual date of activity: it takes roughly <a href="https://sql.telemetry.mozilla.org/queries/92717">4 days for Firefox to receive 95% of Fenix <code>metrics</code> pings</a> that originate from a given actual date. <code>baseline</code> pings are sent and received more frequently and serve as the basis of KPI DAU metrics. Therefore, this switch ensures a client's KPI DAU activity can be matched to search activity from that same active day.</p>
<p>All data prior to August 1, 2024 is powered by the <code>metrics</code> ping and has been moved to the <code>mobile_search_clients_daily_historical</code> table. The new derived table, <code>mobile_search_clients_daily_v2</code>, has data from August 1, 2024 onward and is powered by the <code>baseline</code> ping. Downstream views and tables, like <code>mozdata.search.mobile_search_clients_daily</code>, pull data from both <code>mobile_search_clients_daily_v2</code> and <code>mobile_search_clients_daily_historical</code> to ensure comprehensive data coverage. <code>mozdata.search.mobile_search_clients_daily</code> remains the client-level source of truth for mobile search analyses.</p>
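<p>To see the stitched view in action, a hedged sketch that aggregates across the August 1, 2024 cutover (assuming the view exposes <code>submission_date</code> and a <code>search_count</code> column):</p>
<pre><code class="language-sql">-- Sketch: daily totals spanning the metrics-to-baseline cutover.
SELECT
  submission_date,
  SUM(search_count) AS searches
FROM
  mozdata.search.mobile_search_clients_daily
WHERE
  submission_date BETWEEN '2024-07-29' AND '2024-08-04'
GROUP BY
  submission_date
ORDER BY
  submission_date
</code></pre>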
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/search/mobile_search_clients_sources_daily/intro.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="non-desktop-data"><a class="header" href="#non-desktop-data">Non-Desktop Data</a></h1>
<p>Non-Desktop data includes data for all mobile browser products as well as non-browser products such as Lockwise, Firefox for Echo Show, Mozilla VPN, and Web XR Viewer, among others.</p>
<h2 id="list-of-datasets"><a class="header" href="#list-of-datasets">List of Datasets</a></h2>
<ul>
<li><a href="datasets/./non_desktop/day_2_7_activation/reference.html">Day 2-7 Activation</a> - Used to calculate the <a href="datasets/../metrics/metrics.html#day-2-7-activation">Day 2-7 Activation metric</a>, a key result in 2020 for non-desktop products.</li>
<li><a href="datasets/./non_desktop/google_play_store/reference.html">Google Play Store Data</a> - Used to understand the acquisition performance for non-desktop products on the Google Play Store.</li>
<li><a href="datasets/./non_desktop/apple_app_store/reference.html">Apple App Store Data</a> - Used to understand the acquisition performance for the non-desktop products on the Apple App Store.</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/non_desktop.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="non-desktop-day-2-7-activation"><a class="header" href="#non-desktop-day-2-7-activation">Non-Desktop Day 2-7 Activation</a></h1>
<ul>
<li><a href="datasets/non_desktop/day_2_7_activation/reference.html#introduction">Introduction</a></li>
<li><a href="datasets/non_desktop/day_2_7_activation/reference.html#contents">Contents</a></li>
<li><a href="datasets/non_desktop/day_2_7_activation/reference.html#accessing-the-data">Accessing the Data</a></li>
<li><a href="datasets/non_desktop/day_2_7_activation/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/non_desktop/day_2_7_activation/reference.html#example-queries">Example Queries</a></li>
<li><a href="datasets/non_desktop/day_2_7_activation/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/non_desktop/day_2_7_activation/reference.html#schema">Schema</a></li>
</ul>
</li>
<li><a href="datasets/non_desktop/day_2_7_activation/reference.html#code-reference">Code Reference</a></li>
<li><a href="datasets/non_desktop/day_2_7_activation/reference.html#background-and-caveats">Background and Caveats</a></li>
</ul>
<h1 id="introduction-17"><a class="header" href="#introduction-17">Introduction</a></h1>
<p><code>firefox_nondesktop_day_2_7_activation</code> is designed for use in calculating the <a href="datasets/non_desktop/day_2_7_activation/../../../metrics/metrics.html#day-2-7-activation">Day 2-7 Activation metric</a>, a key result in 2020 for non-desktop products.</p>
<h1 id="contents-8"><a class="header" href="#contents-8">Contents</a></h1>
<p><code>firefox_nondesktop_day_2_7_activation</code> is a table with two key metrics: <code>new_profiles</code> and <code>day_2_7_activated</code>, aggregated over <code>submission_date</code>. It is derived from the <a href="https://docs.telemetry.mozilla.org/cookbooks/clients_last_seen_bits.html">non-desktop clients last seen table</a>.</p>
<ul>
<li><code>new_profiles</code>: Unique count of client ids with a given profile creation date. As not all initial pings are received exactly on the day of profile creation, we wait for 7 days after the profile creation date before establishing the New Profile cohort to ensure the data is complete.</li>
<li><code>day_2_7_activated</code>: Unique count of client ids who use the product at any point starting the day after they created a profile up to 6 days after.
We also include a variety of dimension information (e.g. <code>product</code>, <code>app_name</code>, <code>app_version</code>, <code>os</code>, <code>normalized_channel</code> and <code>country</code>) to aggregate on.</li>
</ul>
<p>This dataset is backfilled through 2017-01-01.</p>
<h1 id="accessing-the-data-8"><a class="header" href="#accessing-the-data-8">Accessing the Data</a></h1>
<p>Access the data at <a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-shared-prod&amp;p=moz-fx-data-shared-prod&amp;d=telemetry&amp;t=firefox_nondesktop_day_2_7_activation&amp;page=table"><code>moz-fx-data-shared-prod.telemetry.firefox_nondesktop_day_2_7_activation</code></a></p>
<h1 id="data-reference-15"><a class="header" href="#data-reference-15">Data Reference</a></h1>
<h2 id="example-queries-11"><a class="header" href="#example-queries-11">Example Queries</a></h2>
<p>This query gives the <code>day 2-7 activation</code> by product:</p>
<pre><code class="language-sql">SELECT
cohort_date,
product,
SUM(day_2_7_activated) as day_2_7_activated,
SUM(new_profiles) as new_profiles,
SAFE_DIVIDE(SUM(day_2_7_activated), SUM(new_profiles)) as day_2_7_activation
FROM
mozdata.telemetry.firefox_nondesktop_day_2_7_activation
WHERE
cohort_date = &quot;2020-03-01&quot;
GROUP BY 1,2
ORDER BY 1
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/72054"><code>STMO#72054</code></a></p>
<h2 id="scheduling-19"><a class="header" href="#scheduling-19">Scheduling</a></h2>
<p>This dataset is scheduled on Airflow (<a href="https://github.com/mozilla/telemetry-airflow/blob/59effc6ead0b764a9ef3d30f40fbdb4b0b3394ec/dags/copy_deduplicate.py#L337">source</a>).</p>
<h2 id="schema-12"><a class="header" href="#schema-12">Schema</a></h2>
<p>As of 2020-07-24, the current version of <code>firefox_nondesktop_day_2_7_activation</code> is v1, and has a schema as follows:</p>
<pre><code>root
|- submission_date: date
|- product: string
|- app_name: string
|- app_version: string
|- os: string
|- normalized_channel: string
|- country: string
|- new_profiles: integer
|- day_2_7_activated: integer
</code></pre>
<h1 id="code-reference-16"><a class="header" href="#code-reference-16">Code Reference</a></h1>
<p>The <code>firefox_nondesktop_day_2_7_activation</code> job is <a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/telemetry_derived/firefox_nondesktop_day_2_7_activation_v1/query.sql">defined in <code>bigquery-etl</code></a>.</p>
<h1 id="background-and-caveats-7"><a class="header" href="#background-and-caveats-7">Background and Caveats</a></h1>
<p>Due to the delay in receiving all the initial pings and the subsequent 7-day wait before establishing the new profile cohort, this table lags <code>nondesktop_clients_last_seen</code> by 7 days.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/non_desktop/day_2_7_activation/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="google-play-store"><a class="header" href="#google-play-store">Google Play Store</a></h1>
<ul>
<li><a href="datasets/non_desktop/google_play_store/reference.html#introduction">Introduction</a></li>
<li><a href="datasets/non_desktop/google_play_store/reference.html#contents">Contents</a>
<ul>
<li><a href="datasets/non_desktop/google_play_store/reference.html#background-and-caveats">Background and Caveats</a></li>
<li><a href="datasets/non_desktop/google_play_store/reference.html#accessing-the-data">Accessing the Data</a></li>
</ul>
</li>
<li><a href="datasets/non_desktop/google_play_store/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/non_desktop/google_play_store/reference.html#schema">Schema</a></li>
<li><a href="datasets/non_desktop/google_play_store/reference.html#example-queries">Example Queries</a>
<ul>
<li><a href="datasets/non_desktop/google_play_store/reference.html#calculate-google-play-store-activity-for-a-given-day-and-app-by-country">Calculate Google Play Store activity for a given day and app by country</a></li>
<li><a href="datasets/non_desktop/google_play_store/reference.html#calculate-google-play-store-activity-for-a-given-day-by-source-and-app">Calculate Google Play Store activity for a given day by source and app</a></li>
</ul>
</li>
<li><a href="datasets/non_desktop/google_play_store/reference.html#scheduling">Scheduling</a></li>
</ul>
</li>
</ul>
<h1 id="introduction-18"><a class="header" href="#introduction-18">Introduction</a></h1>
<p>The <a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;folder=&amp;organizationId=&amp;p=moz-fx-data-marketing-prod&amp;d=google_play_store&amp;page=dataset"><code>google_play_store</code></a> dataset is used to understand the acquisition performance for mobile products on the Google Play Store along key metrics and dimensions. Google's documentation for all metrics and dimensions can be found <a href="https://support.google.com/googleplay/android-developer/answer/6263332?hl=en">on the Play Store support portal</a>.</p>
<h1 id="contents-9"><a class="header" href="#contents-9">Contents</a></h1>
<p>The <a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;folder=&amp;organizationId=&amp;p=moz-fx-data-marketing-prod&amp;d=google_play_store&amp;page=dataset"><code>google_play_store</code></a> dataset contains a collection of tables and views exported from Google. The key tables currently being used for non-desktop reporting are:</p>
<ul>
<li><a href="https://console.cloud.google.com/bigquery?p=moz-fx-data-marketing-prod&amp;d=google_play_store&amp;t=Retained_installers_channel_v1&amp;page=table"><code>Retained_installers_channel_v1</code></a> - Store activity by acquisition channels. Acquisition channels include:
<ul>
<li><strong>Play Store (organic)</strong> - Unique users who saw the app's store listing by browsing or searching on the Play Store app.</li>
<li><strong>Third-party referrers</strong> - Unique users who visited the app's store listing on the Play Store app from an untagged deep link to the Play Store.</li>
<li><strong>Tracked Channels (UTM)</strong> - Unique users who visited the app's store listing on the Play Store app from a UTM-tagged link.</li>
<li><strong>Google Search (organic)</strong> - Unique users who visited the app's store listing on the Play Store app from a Google Search.</li>
<li><strong>Google Ads</strong> - Unique users who visited the apps store listing on the Play Store app from a Google Ads ad. Data between Google Ads and the Play Console can differ.</li>
<li><strong>Other</strong></li>
</ul>
</li>
<li><a href="https://console.cloud.google.com/bigquery?p=moz-fx-data-marketing-prod&amp;d=google_play_store&amp;t=Retained_installers_country_v1&amp;page=table"><code>Retained_installers_country_v1</code></a> - Store activity by country.</li>
<li><a href="https://console.cloud.google.com/bigquery?p=moz-fx-data-marketing-prod&amp;d=google_play_store&amp;t=Retained_installers_play_country_v1&amp;page=table"><code>Retained_installers_play_country_v1</code></a> - Store activity for the Play Store (organic) acquisition channel. Includes a country breakdown for the channel.</li>
<li><a href="https://console.cloud.google.com/bigquery?p=moz-fx-data-marketing-prod&amp;d=google_play_store&amp;t=Retained_installers_utm_tagged_v1&amp;page=table"><code>Retained_installers_utm_tagged_v1</code></a> - Store activity for the Tracked Channels (UTM) acquisition channel by campaign UTM.</li>
</ul>
<p>There are other tables available that are yet to be explored / incorporated into regular reporting. Some that may be of interest include:</p>
<ul>
<li>Crashes by app version</li>
<li>Device and OS</li>
<li>Installs</li>
<li>Ratings</li>
</ul>
<p>The metrics included in the <code>retained_installers</code> tables are:</p>
<ul>
<li><strong><code>Store_Listing_Visitors</code></strong> - Unique users who visited the app's store listing on the Play Store app but haven't installed the app.</li>
<li><strong><code>Installers</code></strong> - Unique users who installed the app after visiting the app's store listing on the Play Store app.</li>
<li><strong><code>Visitors_to_installer_conversion_rate</code></strong> - Percentage of <code>Store_Listing_Visitors</code> that install the app.</li>
<li><strong><code>Installers_retained_for_1_day</code></strong> - Installers who kept the app on at least one of their devices for 1 day. Installation doesn't mean the app was opened over this period.</li>
<li><strong><code>Installers_to_1_day_retention_rate</code></strong> - Percentage of installers who have the app on one of their devices 1 day after install.</li>
<li><strong><code>Installers_retained_for_7_days</code></strong> - Installers who kept the app on at least one of their devices for 7 days. Installation doesn't mean the app was opened over this period.</li>
<li><strong><code>Installers_to_7_days_retention_rate</code></strong> - Percentage of installers who have the app on one of their devices 7 days after install.</li>
<li><strong><code>Installers_retained_for_15_days</code></strong> - Installers who kept the app on at least one of their devices for 15 days. Installation doesn't mean the app was opened over this period.</li>
<li><strong><code>Installers_to_15_days_retention_rate</code></strong> - Percentage of installers who have the app on one of their devices 15 days after install.</li>
<li><strong><code>Installers_retained_for_30_days</code></strong> - Installers who kept the app on at least one of their devices for 30 days. Installation doesn't mean the app was opened over this period.</li>
<li><strong><code>Installers_to_30_days_retention_rate</code></strong> - Percentage of installers who have the app on one of their devices 30 days after install.</li>
</ul>
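<p>For example, the retention metrics above can be rolled up into daily retention rates per app. A sketch against the channel table, using the column names shown in the schema below:</p>
<pre><code class="language-sql">SELECT
  Date,
  Package_Name,
  SUM(Installers) AS installs,
  SAFE_DIVIDE(SUM(installers_retained_for_7_days), SUM(Installers)) AS day_7_retention_rate,
  SAFE_DIVIDE(SUM(installers_retained_for_30_days), SUM(Installers)) AS day_30_retention_rate
FROM
  `moz-fx-data-marketing-prod.google_play_store.p_Retained_installers_channel_v1`
WHERE
  Date = &quot;2020-08-01&quot;
GROUP BY
  Date, Package_Name
ORDER BY
  installs DESC
</code></pre>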
<h2 id="background-and-caveats-8"><a class="header" href="#background-and-caveats-8">Background and Caveats</a></h2>
<p>As of August 25, 2020, not all of the tables available in the dataset have been explored and vetted for accuracy by the data science team. The tables that have been fully reviewed and are documented here are the <code>Retained_installers</code> tables, whose primary use case is explaining acquisition.</p>
<p><strong>Note:</strong> Google does not make the Play Store data available for export every day. The export job checks for new files every day. However, having monitored the job, it appears the data is made available every 7-14 days, and seems to primarily be made available on weekends. Due to this lack of consistency, there will be delays in the data available for this dataset. The data currently in BigQuery is the most current data available from Google.</p>
<h2 id="accessing-the-data-9"><a class="header" href="#accessing-the-data-9">Accessing the Data</a></h2>
<p>Access the data at <code>moz-fx-data-marketing-prod.google_play_store</code></p>
<h1 id="data-reference-16"><a class="header" href="#data-reference-16">Data Reference</a></h1>
<h2 id="schema-13"><a class="header" href="#schema-13">Schema</a></h2>
<pre><code>Retained_installer tables schema
root
|- Date: date
|- Package_Name: string
|- [Acquisition_channel | country | UTM_source_campaign]: string
|- Store_Listing_Visitors: integer
|- Installers: integer
|- Visitor_to_installer_conversion_rate: float
|- installers_retained_for_1_day: integer
|- installers_to_1_day_retention_rate: float
|- installers_retained_for_7_days: integer
|- installers_to_7_days_retention_rate: float
|- installers_retained_for_15_days: integer
|- installers_to_15_days_retention_rate: float
|- installers_retained_for_30_days: integer
|- installers_to_30_days_retention_rate: float
</code></pre>
<h2 id="example-queries-12"><a class="header" href="#example-queries-12">Example Queries</a></h2>
<h3 id="calculate-google-play-store-activity-for-a-given-day-and-app-by-country"><a class="header" href="#calculate-google-play-store-activity-for-a-given-day-and-app-by-country">Calculate Google Play Store activity for a given day and app by country</a></h3>
<pre><code class="language-sql">SELECT
Date,
Package_Name,
Country,
SUM(Store_Listing_Visitors) as Store_Visits,
SUM(Installers) as installs,
SAFE_DIVIDE(SUM(Installers), SUM(Store_Listing_Visitors)) as install_rate
FROM
`moz-fx-data-marketing-prod.google_play_store.p_Retained_installers_country_v1`
WHERE
Date = &quot;2020-08-01&quot;
AND Package_Name = &quot;org.mozilla.firefox&quot;
GROUP BY
date, Package_name, Country
ORDER BY
Store_Visits DESC
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/74289/source"><code>STMO#74289</code></a></p>
<h3 id="calculate-google-play-store-activity-for-a-given-day-by-source-and-app"><a class="header" href="#calculate-google-play-store-activity-for-a-given-day-by-source-and-app">Calculate Google Play Store activity for a given day by source and app</a></h3>
<pre><code class="language-sql">SELECT
Date,
Package_Name,
Acquisition_Channel,
SUM(Store_Listing_Visitors) as Store_Visits,
SUM(Installers) as installs,
SAFE_DIVIDE(SUM(Installers), SUM(Store_Listing_Visitors)) as install_rate
FROM
`moz-fx-data-marketing-prod.google_play_store.p_Retained_installers_channel_v1`
WHERE
Date = &quot;2020-08-01&quot;
GROUP BY
date, Package_name, Acquisition_Channel
ORDER BY
package_name, Store_Visits
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/74288/source"><code>STMO#74288</code></a></p>
<h2 id="scheduling-20"><a class="header" href="#scheduling-20">Scheduling</a></h2>
<p>The job to retrieve the raw data from the Google Play Store can be found in <a href="https://github.com/mozilla/play-store-export">the <code>play-store-export</code> repository</a> and it is scheduled in <a href="https://github.com/mozilla/telemetry-airflow/blob/master/dags/play_store_export.py">airflow</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/non_desktop/google_play_store/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="apple-app-store"><a class="header" href="#apple-app-store">Apple App Store</a></h1>
<ul>
<li><a href="datasets/non_desktop/apple_app_store/reference.html#introduction">Introduction</a></li>
<li><a href="datasets/non_desktop/apple_app_store/reference.html#contents">Contents</a>
<ul>
<li><a href="datasets/non_desktop/apple_app_store/reference.html#background-and-caveats">Background and Caveats</a></li>
<li><a href="datasets/non_desktop/apple_app_store/reference.html#accessing-the-data">Accessing the Data</a></li>
</ul>
</li>
<li><a href="datasets/non_desktop/apple_app_store/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/non_desktop/apple_app_store/reference.html#schema">Schema</a></li>
<li><a href="datasets/non_desktop/apple_app_store/reference.html#example-queries">Example Queries</a>
<ul>
<li><a href="datasets/non_desktop/apple_app_store/reference.html#calculate-apple-app-store-activity-for-a-given-day-by-app">Calculate Apple App Store Activity for a given day by app</a></li>
<li><a href="datasets/non_desktop/apple_app_store/reference.html#calculate-apple-app-store-activity-for-a-given-day-and-app-by-source">Calculate Apple App Store Activity for a given day and app by source</a></li>
</ul>
</li>
<li><a href="datasets/non_desktop/apple_app_store/reference.html#scheduling">Scheduling</a></li>
</ul>
</li>
</ul>
<h1 id="introduction-19"><a class="header" href="#introduction-19">Introduction</a></h1>
<p>The <a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;folder=&amp;organizationId=&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store&amp;page=dataset"><code>apple_app_store</code></a> dataset is used to understand the acquisition performance for non-desktop products on the Apple App Store along key metrics and dimensions. Apple's documentation for all metrics and dimensions can be found <a href="https://help.apple.com/app-store-connect/#/itc21781223f">in the App Store Connect reference</a>.</p>
<h1 id="contents-10"><a class="header" href="#contents-10">Contents</a></h1>
<p>The <a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;folder=&amp;organizationId=&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store&amp;page=dataset"><code>apple_app_store</code></a> dataset contains a collection of aggregated tables by a singular dimension that explains the performance of acquisition activity through the Apple App Store.</p>
<p>The dimensions (saved as individual derived tables) include:</p>
<ul>
<li><a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store&amp;t=metrics_by_platform&amp;page=table"><code>moz-fx-data-marketing-prod.apple_app_store.metrics_by_platform</code></a> - The device type on which the app was downloaded or used. E.g. iPad, iPod, or iPhone.</li>
<li><a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store&amp;t=metrics_by_platform_version&amp;page=table"><code>moz-fx-data-marketing-prod.apple_app_store.metrics_by_platform_version</code></a> - The OS version on which the app was downloaded or used. App Units, In-App Purchases, and Sales are based on the version on which the app is downloaded. Active in Last 30 Days, Product Page Views, Retention, and Sessions are based on the iOS version on which the app is used.</li>
<li><a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store&amp;t=metrics_by_app_version&amp;page=table"><code>moz-fx-data-marketing-prod.apple_app_store.metrics_by_app_version</code></a> - The version of the app displayed on the App Store at the time of activity.</li>
<li><a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store&amp;t=metrics_by_region&amp;page=table"><code>moz-fx-data-marketing-prod.apple_app_store.metrics_by_region</code></a> - The App Store region in which purchases were made, based on the customer's billing address. Regions include USA and Canada, Europe, Latin America and the Caribbean, Asia Pacific, Africa, the Middle East, and India.</li>
<li><a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store&amp;t=metrics_by_storefront&amp;page=table"><code>moz-fx-data-marketing-prod.apple_app_store.metrics_by_storefront</code></a> - The App Store country in which purchases were made, based on the customer's billing address.</li>
<li><a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store&amp;t=metrics_by_source&amp;page=table"><code>moz-fx-data-marketing-prod.apple_app_store.metrics_by_source</code></a> - The source from which a customer tapped a link to your App Store product page to view your app or download it for the first time. You can view metrics based on the source from which users are finding your app. Source types include App Store Browse, App Store Search, App Referrers, and Web Referrers.</li>
<li><a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store&amp;t=metrics_by_app_referrer&amp;page=table"><code>moz-fx-data-marketing-prod.apple_app_store.metrics_by_app_referrer</code></a> - People landing on the App Store via links from within other apps. This also includes apps using the StoreKit API and excludes native Safari.</li>
<li><a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store&amp;t=metrics_by_web_referrer&amp;page=table"><code>moz-fx-data-marketing-prod.apple_app_store.metrics_by_web_referrer</code></a> - Previously called Top Websites. Shows the referring website for the app download. Web referrals must be from Safari on devices with iOS 8 or tvOS 9 or later. Taps from websites using web browsers like Chrome are attributed to that app.</li>
<li><a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store&amp;t=metrics_by_campaign&amp;page=table"><code>moz-fx-data-marketing-prod.apple_app_store.metrics_by_campaign</code></a> - Previously called Top Campaigns. Tracks app and website referrals to measure the impact of an advertising campaign. Tracked by adding two tokens to any App Store link to see results in App Analytics.</li>
<li><a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store&amp;t=metrics_total&amp;page=table"><code>moz-fx-data-marketing-prod.apple_app_store.metrics_total</code></a> - Total metric activity without any dimension breakdown.</li>
</ul>
<p>The metrics included in the aggregated tables are:</p>
<ul>
<li><strong><code>app_units</code></strong> - The number of first-time app purchases made on the App Store using iOS 8 and tvOS 9 or later. Updates, re-downloads, and downloads onto other devices are not counted. Family Sharing downloads are included for free apps, but not for paid apps.</li>
<li><strong><code>impressions</code></strong> - The number of times the app was viewed in the Featured, Categories, Top Charts and Search Sections of the App Store. Also includes views of the product page.</li>
<li><strong><code>impressions_unique_device</code></strong> - The number of times the app was viewed in the Featured, Categories, Top Charts and Search Sections of the App Store by unique device. Also includes views of the product page.</li>
<li><strong><code>product_page_views</code></strong> - Number of times the app's product page has been viewed on devices running iOS 8 and tvOS 9 or later. Includes views from both the App Store app and the StoreKit API.</li>
<li><strong><code>product_page_views_unique_device</code></strong> - Number of times the app's product page has been viewed on devices running iOS 8 and tvOS 9 or later, counted by unique device. Includes views from both the App Store app and the StoreKit API.</li>
<li><strong><code>active_devices_opt_in</code></strong> - The number of devices with at least one session during the selected period. Only devices with iOS 8 and tvOS 9 or later are included. Data for this metric is “opt-in” - collected only if users have agreed to share their diagnostics and usage information with app developers.</li>
<li><strong><code>active_devices_last_30_days_opt_in</code></strong> - The number of active devices with at least one session during the previous 30 days. Data for this metric is “opt-in” - collected only if users have agreed to share their diagnostics and usage information with app developers.</li>
<li><strong><code>deletions_opt_in</code></strong> - The number of times your app was deleted on devices running iOS 12.3 or tvOS 9 or later. This data includes deletions of the app from the Home Screen and deletions of the app through Manage Storage. Data from resetting or erasing a device's content and settings is not included. Data for this metric is “opt-in” - collected only if users have agreed to share their diagnostics and usage information with app developers.</li>
<li><strong><code>installations_opt_in</code></strong> - The total number of times your app has been installed on devices with iOS 8 or tvOS 9, or later. Re-downloads on the same device, downloads to multiple devices sharing the same Apple ID, and Family Sharing installations are included. Data for this metric is “opt-in” - collected only if users have agreed to share their diagnostics and usage information with app developers.</li>
<li><strong><code>sessions_opt_in</code></strong> - The number of times the app has been used for at least two seconds. If the app is in the background and is later used again, that counts as another session. Data for this metric is “opt-in” - collected only if users have agreed to share their diagnostics and usage information with app developers.</li>
</ul>
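<p>As an illustration of the opt-in metrics above, the following sketch computes sessions per active device by app for a single day, using the columns listed in the schema below (keep in mind these metrics only cover opted-in devices):</p>
<pre><code class="language-sql">SELECT
  date,
  app_name,
  SUM(sessions_opt_in) AS sessions,
  SUM(active_devices_opt_in) AS active_devices,
  SAFE_DIVIDE(SUM(sessions_opt_in), SUM(active_devices_opt_in)) AS sessions_per_active_device
FROM
  `moz-fx-data-marketing-prod.apple_app_store.metrics_total`
WHERE
  date = &quot;2020-08-20&quot;
GROUP BY
  date, app_name
</code></pre>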
<h2 id="background-and-caveats-9"><a class="header" href="#background-and-caveats-9">Background and Caveats</a></h2>
<p>The data is received from Apple with only one dimension per metric. As a result, we are unable to do multi-dimensional analysis; i.e., we can tell how each storefront is performing, but we can't see how specific platforms or sources are contributing to it.</p>
<h2 id="accessing-the-data-10"><a class="header" href="#accessing-the-data-10">Accessing the Data</a></h2>
<p>Access the data at <a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;folder=&amp;organizationId=&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store&amp;page=dataset"><code>moz-fx-data-marketing-prod.apple_app_store</code></a></p>
<h1 id="data-reference-17"><a class="header" href="#data-reference-17">Data Reference</a></h1>
<h2 id="schema-14"><a class="header" href="#schema-14">Schema</a></h2>
<pre><code>metrics_by_[dimension] tables
root
|- date: date
|- app_name: string
|- [app_referrer | app_version | campaign | platform | platform_version | region | source | storefront | web_referrer]: string
|- app_units: integer
|- impressions: integer
|- impressions_unique_device: integer
|- product_page_views: integer
|- product_page_views_unique_device: integer
|- active_devices_opt_in: integer
|- active_devices_last_30_days_opt_in: integer
|- deletions_opt_in: integer
|- installations_opt_in: integer
|- sessions_opt_in: integer
</code></pre>
<h2 id="example-queries-13"><a class="header" href="#example-queries-13">Example Queries</a></h2>
<h3 id="calculate-apple-app-store-activity-for-a-given-day-by-app"><a class="header" href="#calculate-apple-app-store-activity-for-a-given-day-by-app">Calculate Apple App Store Activity for a given day by app</a></h3>
<pre><code class="language-sql">SELECT
date,
app_name,
SUM(impressions_unique_device) as unique_device_impressions,
SUM(product_page_views_unique_device) as unique_device_page_views,
SUM(app_units) as installs,
SAFE_DIVIDE(SUM(product_page_views_unique_device), SUM(impressions_unique_device)) as unique_device_page_view_rate,
SAFE_DIVIDE(SUM(app_units), SUM(product_page_views_unique_device)) as install_rate
FROM
`moz-fx-data-marketing-prod.apple_app_store.metrics_total`
WHERE
date = &quot;2020-08-20&quot;
GROUP BY
date, app_name
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/74291/source"><code>STMO#74291</code></a></p>
<h3 id="calculate-apple-app-store-activity-for-a-given-day-and-app-by-source"><a class="header" href="#calculate-apple-app-store-activity-for-a-given-day-and-app-by-source">Calculate Apple App Store Activity for a given day and app by source</a></h3>
<pre><code class="language-sql">SELECT
date,
app_name,
source,
SUM(impressions_unique_device) as unique_device_impressions,
SUM(product_page_views_unique_device) as unique_device_page_views,
SUM(app_units) as installs,
SAFE_DIVIDE(SUM(product_page_views_unique_device), SUM(impressions_unique_device)) as unique_device_page_view_rate,
SAFE_DIVIDE(SUM(app_units), SUM(product_page_views_unique_device)) as install_rate
FROM
`moz-fx-data-marketing-prod.apple_app_store.metrics_by_source`
WHERE
date = &quot;2020-08-20&quot;
AND app_name = &quot;Firefox&quot;
GROUP BY
date, app_name, source
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/74290/source"><code>STMO#74290</code></a></p>
<h2 id="scheduling-21"><a class="header" href="#scheduling-21">Scheduling</a></h2>
<p>The job to retrieve the raw data from the Apple App Store can be found in <a href="https://github.com/mozilla/app-store-analytics-export">the <code>app-store-analytics-export</code> repository</a>. The exported results are individual metrics grouped by a single dimension. These exports are initially loaded into the <a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store_exported&amp;page=dataset"><code>apple_app_store_exported</code></a> data source. The exports are scheduled in <a href="https://github.com/mozilla/telemetry-airflow/blob/master/dags/app_store_analytics.py"><code>airflow</code></a>. The job to create the derived tables found in <a href="https://console.cloud.google.com/bigquery?project=moz-fx-data-marketing-prod&amp;p=moz-fx-data-marketing-prod&amp;d=apple_app_store&amp;page=dataset"><code>moz-fx-data-marketing-prod.apple_app_store</code></a> can be found in <a href="https://github.com/mozilla/bigquery-etl/tree/master/sql/moz-fx-data-marketing-prod/apple_app_store"><code>bigquery-etl</code> under <code>apple_app_store</code></a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/non_desktop/apple_app_store/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="other-datasets"><a class="header" href="#other-datasets">Other Datasets</a></h1>
<p>These datasets are for projects outside of the Firefox telemetry domain.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/other.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="hgpush"><a class="header" href="#hgpush">hgpush</a></h1>
<p>This dataset records facts about individual commits to the Firefox source tree
in the <a href="https://hg.mozilla.org/mozilla-central/"><code>mozilla-central</code></a> source
code repository.</p>
<h1 id="data-reference-18"><a class="header" href="#data-reference-18">Data Reference</a></h1>
<p>The dataset is accessible via <a href="https://sql.telemetry.mozilla.org"><code>STMO</code></a>.
Use the <code>eng_workflow_hgpush_parquet_v1</code> table with the <code>Athena</code> data source.
(The <code>Presto</code> data source is also available, but much slower.)</p>
<h2 id="field-types-and-descriptions"><a class="header" href="#field-types-and-descriptions">Field Types and Descriptions</a></h2>
<p>See the <a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/blob/master/schemas/eng-workflow/hgpush/hgpush.1.schema.json"><code>hgpush</code> ping schema</a>
for a description of available fields.</p>
<p>Be careful to:</p>
<ul>
<li>Use the latest schema version. e.g. <code>v1</code>. Browse the <a href="https://github.com/mozilla-services/mozilla-pipeline-schemas/tree/master/schemas/eng-workflow/hgpush"><code>hgpush</code> schema directory</a> in the GitHub repo to be sure.</li>
<li>Change dataset field names from <code>camelCaseNames</code> to <code>under_score_names</code> in STMO. e.g. <code>reviewSystemUsed</code> in the ping schema becomes <code>review_system_used</code> in STMO.</li>
</ul>
<h2 id="example-queries-14"><a class="header" href="#example-queries-14">Example Queries</a></h2>
<p>Select the number of commits with an 'unknown' review system in the last 7 days:</p>
<pre><code class="language-sql">select
count(1)
from
eng_workflow_hgpush_parquet_v1
where
review_system_used = 'unknown'
and date_diff('day', from_unixtime(push_date), now()) &lt; 7
</code></pre>
<h1 id="code-reference-17"><a class="header" href="#code-reference-17">Code Reference</a></h1>
<p>The dataset is populated via the <a href="https://github.com/mozilla-conduit/commit-telemetry-service">Commit Telemetry Service</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/other/hgpush/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="what-is-the-stub-installer-ping"><a class="header" href="#what-is-the-stub-installer-ping">What is the Stub Installer ping?</a></h1>
<p>When the stub installer completes with almost any result, it generates a ping containing some data about the system and about how the installation went. This ping isn't part of Firefox unified telemetry; it's a bespoke system, since we can't use the telemetry client code when Firefox isn't installed yet.</p>
<p>No ping is sent if the installer exits early because initial system requirements checks fail.</p>
<h2 id="how-its-processed"><a class="header" href="#how-its-processed">How its processed</a></h2>
<p>Installer pings are formed and sent from NSIS code (!) in the stub installer, in the <a href="https://searchfox.org/mozilla-central/source/browser/installer/windows/nsis/stub.nsi">SendPing subroutine</a>.</p>
<p>Like any other ping, they are processed into <a href="datasets/other/stub_installer/../../../cookbooks/bigquery/querying.html#structure-of-ping-tables-in-bigquery">ping tables</a>.</p>
<h2 id="how-to-access-the-data"><a class="header" href="#how-to-access-the-data">How to access the data</a></h2>
<p>You can access this data in BigQuery under <code>firefox_installer.install</code>.
The following query, for example, gives you the number of successful installs per normalized country code on April 20th, 2021:</p>
<pre><code class="language-sql">SELECT normalized_country_code,
succeeded,
count(*)
FROM firefox_installer.install
WHERE DATE(submission_timestamp) = '2021-04-20'
GROUP BY normalized_country_code,
succeeded
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/81648/source"><code>STMO#81648</code></a></p>
<p>Note about <code>os_version</code>: Previous versions of Windows have used a very small set of build numbers through their entire life cycle. However, Windows 10 gets a new build number with every major update (about every 6 months), and many more builds have been released on its insider channels. So, to prevent a huge amount of noise, queries using this field should generally filter out the build number and only use the major and minor version numbers to differentiate Windows versions, unless the build number is specifically needed.</p>
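<p>For example, a sketch that counts installer pings by major.minor Windows version only, assuming <code>os_version</code> is a dotted string such as <code>10.0.19042</code> (verify the exact format in the table before relying on the regular expression):</p>
<pre><code class="language-sql">-- Sketch: ignore the Windows build number, keep major.minor only.
SELECT
  REGEXP_EXTRACT(os_version, r'^[0-9]+\.[0-9]+') AS os_major_minor,
  COUNT(*) AS pings
FROM firefox_installer.install
WHERE DATE(submission_timestamp) = '2021-04-20'
GROUP BY os_major_minor
ORDER BY pings DESC
</code></pre>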
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/other/stub_installer/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="bmobugs"><a class="header" href="#bmobugs"><code>bmobugs</code></a></h1>
<p>This dataset mirrors bugs (defects, enhancements, tasks) reported in the
<a href="https://bugzilla.mozilla.org/"><code>Bugzilla</code></a> bug tracker.</p>
<h1 id="data-reference-19"><a class="header" href="#data-reference-19">Data Reference</a></h1>
<p>The dataset is accessible via <a href="https://sql.telemetry.mozilla.org"><code>STMO</code></a>.
Use the <code>eng_workflow.bmobugs</code> table with the <code>Telemetry (BigQuery)</code> data source.</p>
<h2 id="field-types-and-descriptions-1"><a class="header" href="#field-types-and-descriptions-1">Field Types and Descriptions</a></h2>
<p><code>bug_id</code></p>
<p>The unique ID of the bug. The bug can be accessed at
<code>https://bugzilla.mozilla.org/show_bug.cgi?id=&lt;bug_id&gt;</code></p>
<p><code>reporter</code></p>
<p>The <code>bugmail</code> of the user who filed the bug</p>
<p><code>assigned_to</code></p>
<p>The <code>bugmail</code> of the user the bug has been assigned to;
by default, this is <code>nobody@mozilla.org</code></p>
<p><code>qa_contact</code></p>
<p>The <code>bugmail</code> of the user who is responsible for answering
QA questions about the bug; this field is assigned on a
per-product::component basis</p>
<p><code>product</code></p>
<p>The product in which the bug was filed; see the
<a href="https://bugzilla.mozilla.org/describecomponents.cgi">Bugzilla product descriptions</a>
for details</p>
<p><code>component</code></p>
<p>The component of the product in which the bug was filed</p>
<p><code>bug_status</code></p>
<p>The status of the bug:</p>
<ul>
<li><code>UNCONFIRMED</code></li>
<li><code>NEW</code></li>
<li><code>ASSIGNED</code></li>
<li><code>RESOLVED</code></li>
<li><code>CLOSED</code></li>
</ul>
<p><code>keywords</code></p>
<p>Controlled vocabulary <a href="https://bugzilla.mozilla.org/describekeywords.cgi">keywords</a>
assigned to the bug</p>
<p><code>groups</code></p>
<p><code>Bugzilla</code> groups to which the bug has been assigned</p>
<p><code>flags</code></p>
<p>Flags requested by one user of another user</p>
<ul>
<li><code>name</code>: the name of the flag such as <code>needinfo</code></li>
<li><code>status</code>: <code>?</code>, <code>+</code>, <code>-</code></li>
<li><code>setter_id</code>: the <code>bugmail</code> of the user requesting the flag</li>
<li><code>requestee_id</code>: the <code>bugmail</code> of the user who the flag was requested of</li>
</ul>
<p><code>priority</code></p>
<p>The bug's priority, as set by the developers who triage the product::component</p>
<p><code>resolution</code></p>
<p>If not the default (<code>---</code>), the resolution of the bug, which can be one of <code>FIXED</code>, <code>VERIFIED</code>,
<code>WORKSFORME</code>, <code>DUPLICATE</code>, <code>INVALID</code>, <code>WONTFIX</code>, or <code>MOVED</code>.</p>
<p><code>blocked_by</code></p>
<p>Bugs which must be resolved before this bug can be worked on
or resolved</p>
<p><code>depends_on</code></p>
<p>Bugs depending on the resolution of this bug before they can be resolved.</p>
<p><code>duplicate_of</code></p>
<p>If the bug has been resolved as <code>DUPLICATE</code>, the id of the bug of which
this is a duplicate</p>
<p><code>duplicates</code></p>
<p>List of bugs which have been resolved as duplicates of this bug</p>
<p><code>target_milestone</code></p>
<p>The version number of the branch of Nightly in which the changeset
associated with the bug landed; note that version
numbers vary by product</p>
<p><code>version</code></p>
<p>The version of Firefox in which the bug was reported</p>
<p><code>delta_ts</code></p>
<p>The timestamp of when the bug was last modified</p>
<p><code>creation_ts</code></p>
<p>The timestamp of when the bug was filed</p>
<h2 id="example-queries-15"><a class="header" href="#example-queries-15">Example Queries</a></h2>
<p>Select the number of bugs in the Core product filed
in the past 7 days:</p>
<pre><code class="language-sql">SELECT
count(distinct bug_id)
FROM
eng_workflow.bmobugs
WHERE
product = 'Core'
AND date_diff(current_date(), date(parse_timestamp('%Y-%m-%d %H:%M:%S', creation_ts)), DAY) &lt;= 7
AND date(submission_timestamp) &gt;= '2019-01-01' -- required submission date filter
</code></pre>
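<p>The <code>flags</code> field described above appears to be a repeated record. Assuming it is, the following sketch counts outstanding flag requests (status <code>?</code>) by flag name; the <code>UNNEST</code> and sub-field names are inferred from the field list above, not from a confirmed schema:</p>
<pre><code class="language-sql">SELECT
  flag.name,
  count(*) AS open_requests
FROM
  eng_workflow.bmobugs
CROSS JOIN
  UNNEST(flags) AS flag
WHERE
  flag.status = '?'
  AND date(submission_timestamp) &gt;= '2019-01-01' -- required submission date filter
GROUP BY
  flag.name
ORDER BY
  open_requests DESC
</code></pre>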
<h1 id="code-reference-18"><a class="header" href="#code-reference-18">Code Reference</a></h1>
<p>The dataset is populated via the
<a href="https://github.com/mozilla-bteam/bmo/blob/master/Bugzilla/Report/Ping/Simple.pm">simple ping service on Bugzilla</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/other/bmobugs/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="build-metadata"><a class="header" href="#build-metadata">Build metadata</a></h1>
<p><a href="https://buildhub.moz.tools/">Buildhub</a> is a database of metadata for official Mozilla builds of
Firefox, Thunderbird, and Fennec (legacy Firefox for Android).
Support for the new Firefox for Android is being tracked in
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1622948">bug 1622948</a>.</p>
<p>It includes data on the <a href="datasets/../concepts/terminology.html#build-id">build id</a> and revision,
which can help you understand what changes went into a specific version
of the product you see in telemetry.</p>
<p>Buildhub data is exported to BigQuery as it becomes available (generally very soon after our software is released) at <code>mozdata.telemetry.buildhub2</code>.</p>
<p>The <code>build</code> column contains a struct value
that matches the structure of the <code>buildhub.json</code> files
built by <a href="https://searchfox.org/mozilla-central/source/toolkit/mozapps/installer/informulate.py"><code>informulate.py</code></a> during a Firefox build.</p>
<p>An example of a <code>build</code> record is:</p>
<pre><code class="language-json">{
&quot;build&quot;: {
&quot;as&quot;: &quot;/builds/worker/workspace/build/src/clang/bin/clang -std=gnu99&quot;,
&quot;cc&quot;: &quot;/builds/worker/workspace/build/src/clang/bin/clang -std=gnu99&quot;,
&quot;cxx&quot;: &quot;/builds/worker/workspace/build/src/clang/bin/clang++&quot;,
&quot;date&quot;: &quot;2019-06-03T18:14:08Z&quot;,
&quot;host&quot;: &quot;x86_64-pc-linux-gnu&quot;,
&quot;id&quot;: &quot;20190603181408&quot;,
&quot;target&quot;: &quot;x86_64-pc-linux-gnu&quot;
},
&quot;download&quot;: {
&quot;date&quot;: &quot;2019-06-03T20:49:46.559307+00:00&quot;,
&quot;mimetype&quot;: &quot;application/octet-stream&quot;,
&quot;size&quot;: 63655677,
&quot;url&quot;: &quot;https://archive.mozilla.org/pub/firefox/candidates/68.0b7-candidates/build1/linux-x86_64/en-US/firefox-68.0b7.tar.bz2&quot;
},
&quot;source&quot;: {
&quot;product&quot;: &quot;firefox&quot;,
&quot;repository&quot;: &quot;https://hg.mozilla.org/releases/mozilla-beta&quot;,
&quot;revision&quot;: &quot;ed47966f79228df65b6326979609fbee94731ef0&quot;,
&quot;tree&quot;: &quot;mozilla-beta&quot;
},
&quot;target&quot;: {
&quot;channel&quot;: &quot;beta&quot;,
&quot;locale&quot;: &quot;en-US&quot;,
&quot;os&quot;: &quot;linux&quot;,
&quot;platform&quot;: &quot;linux-x86_64&quot;,
&quot;version&quot;: &quot;68.0b7&quot;
}
}
</code></pre>
<p>The fields of <code>build</code> are documented in the <a href="https://github.com/mozilla-releng/buildhub2/blob/master/schema.yaml">schema</a>.</p>
<p>Notably, the timestamp in <code>build.download.date</code> reflects the publication date of the build on https://archive.mozilla.org.
This is earlier than the moment that the build would have been offered to clients through either the download website
or the in-product updater.</p>
<p>To find the earliest publication date for each release in the Firefox 82 series,
you can use a query like:</p>
<pre><code class="language-sql">SELECT
build.target.version,
MIN(build.download.date) AS published
FROM mozdata.telemetry.buildhub2
WHERE
build.target.version LIKE &quot;82.%&quot;
AND ENDS_WITH(build.source.repository, &quot;mozilla-release&quot;)
GROUP BY 1
ORDER BY 1
</code></pre>
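<p>Because the example record above exposes the build id under <code>build.build.id</code>, you can also go the other way and resolve a build id seen in telemetry to its version and source revision. A minimal sketch using only the fields shown above:</p>
<pre><code class="language-sql">SELECT
  build.target.version,
  build.target.channel,
  build.source.revision
FROM mozdata.telemetry.buildhub2
WHERE build.build.id = &quot;20190603181408&quot;
</code></pre>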
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/buildhub.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="release-information"><a class="header" href="#release-information">Release information</a></h1>
<p><a href="https://product-details.mozilla.org/1.0">Product Details</a> is a public JSON API which contains release information for Firefox desktop, Fenix and Thunderbird. The data contains release dates for different product versions.</p>
<p>Product Details data is exported to BigQuery daily. Data for Firefox Desktop releases is available at <code>mozdata.telemetry.releases</code> and for Fenix releases at <code>mozdata.org_mozilla_fenix.releases</code>.</p>
<p>As an example, the following query finds release dates for each non-dev release in the Firefox 82 series:</p>
<pre><code class="language-sql">SELECT
date,
version
FROM mozdata.telemetry.releases
WHERE version LIKE &quot;82.%&quot;
AND category != &quot;dev&quot;
ORDER BY date
</code></pre>
<h1 id="code-reference-19"><a class="header" href="#code-reference-19">Code reference</a></h1>
<ul>
<li><a href="https://github.com/mozilla/bigquery-etl/blob/main/sql/moz-fx-data-shared-prod/telemetry_derived/releases_v1/schema.yaml">BigQuery schema</a></li>
<li><a href="https://github.com/mozilla/bigquery-etl/blob/main/sql/moz-fx-data-shared-prod/telemetry_derived/releases_v1/query.py">Import script</a></li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/releases.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="firefox-suggest"><a class="header" href="#firefox-suggest">Firefox Suggest</a></h1>
<h2 id="introduction-20"><a class="header" href="#introduction-20">Introduction</a></h2>
<p>Firefox Suggest is a monetizable feature in the Firefox urlbar. Suggest provides real-time recommendations as users type in the urlbar. The recommendations include URLs from the user's browsing history, open tabs and bookmarks, as well as URLs for sponsored and non-sponsored content from third-party partners like Ad Marketplace and Wikipedia.</p>
<p>Firefox Suggestions compete directly with Search Engine Suggestions for user attention and clicks in the urlbar. A holistic analysis of the urlbar should include data from all the sources providing recommendations to the urlbar, including Firefox Suggestions, Search Engine Suggestions, auto-fill, etc.</p>
<p>This section will include information about Firefox Suggest data. See <a href="https://docs.telemetry.mozilla.org/datasets/search.html">Search Datasets</a> for documentation on Search Engine Data.</p>
<h2 id="data-collection"><a class="header" href="#data-collection">Data Collection</a></h2>
<p>Firefox Suggestions may be served to the urlbar from Firefox itself, or by a Mozilla-owned service called Merino. When users opt in to sharing their search query data, Firefox sends the search queries to Merino, and Merino responds with recommendations.</p>
<p>In addition to the search queries, we collect Category 1 and 2 telemetry data from Firefox about how users are interacting with Suggest.</p>
<h3 id="interactions-data-cat-1-and-2"><a class="header" href="#interactions-data-cat-1-and-2">Interactions data (Cat 1 and 2)</a></h3>
<p>Interactions data related to Firefox Suggest is collected in the following ways.</p>
<ol>
<li>
<p>Interactions with Firefox Suggestions in the urlbar (i.e., clicks, impressions, blocks, clicks on help links) are collected using the standard (legacy) Telemetry system as Scalars and Events.
Full documentation of the probes is <a href="https://firefox-source-docs.mozilla.org/browser/urlbar/firefox-suggest-telemetry.html">here</a>. The Scalars are available in <a href="https://docs.telemetry.mozilla.org/datasets/batch_view/clients_daily/reference.html">Clients Daily</a>.</p>
</li>
<li>
<p>Interactions with Firefox Suggestions in the urlbar (i.e., clicks, impressions, blocks) are collected using Custom Contextual Services Pings.
The Custom Contextual Services Pings contain additional information not available in the standard Scalars and Events, such as the advertiser that provided the recommendation, if any. This data has a much shorter retention period than the data collected in (1) above. It also does not contain the Firefox Desktop Client ID, and is not joinable by design to any datasets outside of Contextual Services. Full documentation of the probes is <a href="https://firefox-source-docs.mozilla.org/browser/urlbar/firefox-suggest-telemetry.html#contextual-services-pings">here</a>.</p>
</li>
<li>
<p><a href="about:preferences">Preference Settings</a> are collected using the standard (legacy) Telemetry system in the Environment.
Full documentation of the probes is <a href="https://firefox-source-docs.mozilla.org/browser/urlbar/firefox-suggest-telemetry.html#environment">here</a>. The Preferences are available in <a href="https://docs.telemetry.mozilla.org/datasets/batch_view/clients_daily/reference.html">Clients Daily</a>. Choices users made on opt-in modals (which propagate to <a href="about:preferences">Preferences</a>) are also recorded in the Environment.</p>
</li>
<li>
<p>Exposure Events for experiments are recorded using the standard Nimbus system.
Full documentation of the probes is <a href="https://firefox-source-docs.mozilla.org/browser/urlbar/firefox-suggest-telemetry.html#nimbus-exposure-event">here</a>.</p>
</li>
</ol>
<h3 id="search-queries-and-merino"><a class="header" href="#search-queries-and-merino">Search queries and Merino</a></h3>
<ol>
<li>
<p>Search queries sent to Merino by Firefox are logged by Merino.
Full documentation of the probes is <a href="https://firefox-source-docs.mozilla.org/browser/urlbar/firefox-suggest-telemetry.html#merino-search-queries">here</a>. For more information about the much shorter retention periods, security and access controls on this data, see the <a href="https://docs.google.com/document/d/11rOM3r5AOPUrqDnCAODY7gknxnqtjphgINSK5oAR9T4/edit#">Search Terms Data Access Policy</a> (Mozilla internal only).</p>
</li>
<li>
<p>Merino responses as seen from Firefox.
We collect data about Merino's response times and response types in Firefox using the standard (legacy) Telemetry system as Histograms. Full documentation of the probes is <a href="https://firefox-source-docs.mozilla.org/browser/urlbar/firefox-suggest-telemetry.html#histograms">here</a>.</p>
</li>
<li>
<p>Service and operational data on Merino.
We also collect data about Merino as a service from Merino directly. Full documentation of the data is <a href="https://mozilla-services.github.io/merino/data.html">here</a>.</p>
</li>
</ol>
<h2 id="big-query-tables-and-looker-explores"><a class="header" href="#big-query-tables-and-looker-explores">Big Query Tables and Looker Explores</a></h2>
<p>Note: gotchas with historical Suggest revenue data are outlined <a href="https://docs.google.com/spreadsheets/d/1g-DhmvDJd4lXE7xRUrovCpLV9agAVTdCLzD6YyzvPDk/edit?usp=sharing">here</a>.</p>
<div class="table-wrapper"><table><thead><tr><th>Access-restriction(s)</th><th>Big Query Table</th><th>Looker Explore</th><th>Description</th></tr></thead><tbody>
<tr><td>All Mozillians</td><td><code>telemetry.suggest_clients_daily</code></td><td>Firefox Desktop &gt; Suggest Clients Daily</td><td>Workhorse dataset for Suggest, includes desktop data. All new Suggest metrics are added to this table. Does not include advertiser data.</td></tr>
<tr><td>Contextual Services</td><td><code>contextual_services.event_aggregates</code></td><td>Contextual Services &gt; Event Aggregates</td><td>Dataset with Sponsored Tiles and Suggest analyses by advertiser. No longer maintained. Instead use the derived <code>event_aggregates_[product]</code> datasets.</td></tr>
<tr><td>Contextual Services</td><td><code>contextual_services.event_aggregates_suggest</code></td><td>Contextual Services &gt; Event Aggregates Suggest</td><td>Workhorse dataset for Suggest analyses by advertiser.</td></tr>
<tr><td>Contextual Services</td><td><code>contextual_services_derived.adm_forecasting</code></td><td>Contextual Services &gt; <code>AdM</code> Forecasting</td><td>Dataset with required components for Sponsored Tiles and Suggest revenue forecasts.</td></tr>
<tr><td>Contextual Services, Revenue</td><td><code>contextual_services.event_aggregates</code> x <code>revenue.revenue_data</code></td><td>Revenue &gt; <code>AdM</code> Revenue with Telemetry</td><td>Revenue information combined with usage metrics. This dataset is useful for <code>CPC</code> analyses.</td></tr>
</tbody></table>
</div><footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/other/suggest/suggest.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="sponsored-tiles"><a class="header" href="#sponsored-tiles">Sponsored Tiles</a></h1>
<h2 id="introduction-21"><a class="header" href="#introduction-21">Introduction</a></h2>
<p>Firefox Sponsored Tiles is an advertising-based feature available on the new tab of both desktop and mobile Firefox browsers. This feature populates the new tab's Top Sites section with up to two Sponsored Tiles. These Sponsored Tiles display a company logo and link to the company website. Mozilla's <code>Contile</code> serves the advertisements and may refresh the advertisers every 15 minutes. Each click on a Sponsored Tile generates revenue for Mozilla.</p>
<h2 id="data-usage-considerations"><a class="header" href="#data-usage-considerations">Data Usage Considerations</a></h2>
<p>Mozillians may access Sponsored Tiles data when advertiser information is <strong>not</strong> attached. This includes metrics such as Sponsored Tile impressions, clicks, dismissals, and disablement. Sponsored Tiles is an important component of new tab user behavior and a growing source of revenue. We therefore encourage monitoring Sponsored Tile engagement metrics in do-no-harm experiments that test changes to the new tab.</p>
<p>Access to Sponsored Tiles data by advertiser is restricted to members of the contextual services working group. These restrictions are designed to protect user privacy by preventing excessive access to data which links a given client to their interactions with different advertisers. For more information on requesting access, see the <a href="https://mana.mozilla.org/wiki/display/DATA/Data+Access+Policies">Data Access Policies</a>.</p>
<h2 id="big-query-tables-and-looker-explores-1"><a class="header" href="#big-query-tables-and-looker-explores-1">Big Query Tables and Looker Explores</a></h2>
<p>Note: gotchas with historical Sponsored Tiles revenue data are outlined <a href="https://docs.google.com/spreadsheets/d/1g-DhmvDJd4lXE7xRUrovCpLV9agAVTdCLzD6YyzvPDk/edit?usp=sharing">here</a>.</p>
<div class="table-wrapper"><table><thead><tr><th>Access-restriction(s)</th><th>Big Query Table</th><th>Looker Explore</th><th>Description</th></tr></thead><tbody>
<tr><td>All Mozillians</td><td><code>telemetry.sponsored_tiles_clients_daily</code></td><td>Firefox Desktop &gt; Sponsored Tiles Clients Daily</td><td>Workhorse dataset for Sponsored Tiles, includes desktop and mobile data. All new Sponsored Tiles metrics are added to this table. Does not include advertiser data.</td></tr>
<tr><td>All Mozillians</td><td><code>telemetry.newtab</code></td><td>Firefox Desktop &gt; <code>Newtab</code></td><td>Expanded <code>newtab</code> <strong>desktop</strong> dataset. Requires unnesting events.</td></tr>
<tr><td>All Mozillians</td><td><code>telemetry.newtab_interactions</code></td><td>Firefox Desktop &gt; New Tab Interactions</td><td>In-development dataset for basic analyses. Available metrics are limited to <strong>desktop</strong> clicks and impressions.</td></tr>
<tr><td>Contextual Services</td><td><code>contextual_services.event_aggregates</code></td><td>Contextual Services &gt; Event Aggregates</td><td>Dataset with Sponsored Tiles and Suggest analyses by advertiser. No longer maintained. Instead use the derived <code>event_aggregates_[product]</code> datasets.</td></tr>
<tr><td>Contextual Services</td><td><code>contextual_services.event_aggregates_spons_tiles</code></td><td>Contextual Services &gt; Event Aggregates <code>Spons</code> Tiles</td><td>Workhorse dataset for Sponsored Tiles analyses by advertiser.</td></tr>
<tr><td>Contextual Services</td><td><code>contextual_services_derived.adm_forecasting</code></td><td>Contextual Services &gt; <code>AdM</code> Forecasting</td><td>Dataset with required components for Sponsored Tiles and Suggest revenue forecasts.</td></tr>
<tr><td>Contextual Services, Revenue</td><td><code>contextual_services.event_aggregates</code> x <code>revenue.revenue_data</code></td><td>Revenue &gt; <code>AdM</code> Revenue with Telemetry</td><td>Revenue information combined with usage metrics. This dataset is useful for <code>CPC</code> analyses.</td></tr>
</tbody></table>
</div><footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/other/sponsored_tiles/sponsored_tiles.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="newtab-interactions"><a class="header" href="#newtab-interactions">Newtab Interactions</a></h1>
<h2 id="introduction-22"><a class="header" href="#introduction-22">Introduction</a></h2>
<p>The <code>telemetry.newtab_interactions</code> dataset is useful for analyzing data on the New Tab on Firefox Desktop. It provides a single source for user interactions with the New Tab and allows analysis at either the client or visit level. This table contains data from the Glean <a href="https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/pings/newtab">&quot;newtab&quot; ping</a>.</p>
<h2 id="content-2"><a class="header" href="#content-2">Content</a></h2>
<p>This dataset can contain more than one row for each <code>submission_date</code>, <code>client_id</code> and <code>visit_id</code>. Each unique tab opened is defined as a unique New Tab <code>visit_id</code>. We recommend aggregating by <code>client_id</code>, <code>submission_date</code> and <code>visit_id</code>.</p>
<p>The <code>client_id</code> is the Glean client identifier; the <code>legacy_telemetry_client_id</code> field can be used to join this data to non-Glean data tables.</p>
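<p>As an illustrative sketch (using the field names described under Field Descriptions below, and an arbitrary example date), the following query counts distinct New Tab visits per client per day:</p>
<pre><code class="language-sql">SELECT
  submission_date,
  client_id,
  COUNT(DISTINCT newtab_visit_id) AS newtab_visits
FROM mozdata.telemetry.newtab_interactions
WHERE submission_date = &quot;2023-10-01&quot;
GROUP BY
  submission_date,
  client_id
</code></pre>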
<h2 id="background-and-caveats-10"><a class="header" href="#background-and-caveats-10">Background and Caveats</a></h2>
<h3 id="scheduling-22"><a class="header" href="#scheduling-22">Scheduling:</a></h3>
<p>&quot;newtab&quot; pings can be sent for one of two reasons:</p>
<ul>
<li><code>component_init</code>: the newtab component was initialized and the newtab and homepage settings have been categorized. This is mostly to ensure we hear at least once from clients configured not to show a newtab UI.</li>
<li><code>newtab_session_end</code>: the newtab visit ended, for example by navigation or by the tab being closed.</li>
</ul>
<p>This dataset does not have a long history: the newtab ping went live in Fall 2022, so no data is available before that launch, and longer historical analyses will only become possible as more data accumulates.</p>
<p>Currently, telemetry is not yet instrumented for organic topsites tiles. This work is in progress and will be added to the dataset soon. For now, all total topsites metrics (impressions, clicks) only include sponsored topsites data.</p>
<p>Some of the preference settings enabling features like Pocket and Topsites can be set to <code>enabled</code> in regions where these features are not available. In these cases, it is best to filter by country to only include regions where these features are available. In Looker, there are pre-set country groups for Topsites Available and Pocket Available to make this easier.</p>
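<p>For example, a minimal sketch of such a country filter on Pocket metrics is shown below; the <code>country</code> column name and the country list are assumptions for illustration, and the pre-set Looker country groups remain the more reliable reference:</p>
<pre><code class="language-sql">SELECT
  submission_date,
  SUM(sponsored_pocket_impressions) AS sponsored_pocket_impressions,
  SUM(sponsored_pocket_clicks) AS sponsored_pocket_clicks
FROM mozdata.telemetry.newtab_interactions
-- Assumed column name and an illustrative, non-exhaustive market list.
WHERE country IN (&quot;US&quot;, &quot;CA&quot;, &quot;GB&quot;)
GROUP BY
  submission_date
ORDER BY
  submission_date
</code></pre>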
<h2 id="data-reference-20"><a class="header" href="#data-reference-20">Data Reference</a></h2>
<h3 id="field-descriptions-1"><a class="header" href="#field-descriptions-1">Field Descriptions</a></h3>
<h4 id="environment-specific"><a class="header" href="#environment-specific">Environment Specific:</a></h4>
<ul>
<li><code>newtab_visit_id</code>: the unique id for that New Tab visit</li>
<li><code>client_id</code>: the Glean client id</li>
<li><code>legacy_telemetry_client_id</code>: the client_id according to legacy Telemetry. This may be missing if it had not yet loaded when the ping was assembled.</li>
<li><code>newtab_visit_started_at</code>: the timestamp when the tab was opened</li>
<li><code>newtab_visit_ended_at</code>: the timestamp when the tab was closed</li>
<li><code>experiments</code>: field tagging any active experiments the user is enrolled in</li>
<li><code>pocket_is_signed_in</code>: boolean flag indicating whether the user is signed in to Pocket</li>
<li><code>pocket_enabled</code>: boolean flag indicating whether the user has Pocket enabled in settings. This can be enabled even in countries where Pocket is not available, so it is useful to add a country filter when using this.</li>
<li><code>pocket_sponsored_stories_enabled</code>: boolean flag indicating whether the setting to show Pocket Sponsored Stories in the New Tab is enabled. This can be enabled even in countries where Pocket is not available, so it is useful to add a country filter when using this.</li>
<li><code>topsites_enabled</code>: boolean flag for whether the client has topsites enabled in settings. This can be enabled even if topsites are not available in that country.</li>
<li><code>newtab_homepage_category</code>: the current setting of the homepage URL, classified into categories using <code>SiteClassifier</code></li>
<li><code>newtab_newtab_category</code>: the current setting of the new tab URL, classified into categories using <code>SiteClassifier</code></li>
<li><code>newtab_open_source</code>: describes the situation when the tab was opened. One of “about:Welcome”, “about:Home” or “about:NewTab”, reflecting whether the tab is the first for a new profile/new Firefox install, the first in a new window, or a new tab in an existing session window.</li>
<li><code>newtab_search_enabled</code>: boolean indicating whether the setting to enable search on the New Tab is enabled</li>
<li><code>is_new_profile</code>: flag indicating if the profile is new, pulled from the <code>unified_metrics</code> table</li>
<li><code>activity_segment</code>: the activity segment the client falls into on this day, pulled from the <code>unified_metrics</code> table</li>
</ul>
<h4 id="search-specific"><a class="header" href="#search-specific">Search Specific:</a></h4>
<ul>
<li><code>search_engine</code>: the search engine for this search</li>
<li><code>search_access_point</code>: the access point where the search originated. For New Tab, this is always the New Tab search bar (handoff searches).</li>
<li><code>searches</code>: count of searches</li>
<li><code>tagged_search_ad_clicks</code>: count of ad clicks that resulted from tagged New Tab searches</li>
<li><code>tagged_search_ad_impressions</code>: count of search engine results pages with ads which resulted from New Tab searches. This does not count the number of ads seen, but the number of pages seen which displayed ads.</li>
<li><code>follow_on_search_ad_clicks</code>: count of ad clicks which resulted from a follow-on search from a search that originated on the New Tab</li>
<li><code>follow_on_search_ad_impressions</code>: count of search engine results pages with ads which resulted from follow-on searches from searches which originated on the New Tab. This does not count the number of ads seen, but the number of pages seen which displayed ads.</li>
</ul>
<h4 id="pocket-specific"><a class="header" href="#pocket-specific">Pocket Specific:</a></h4>
<ul>
<li><code>pocket_impressions</code>: count of total impressions on Pocket tiles, including both organic and sponsored. Each tile displayed contributes to this count.</li>
<li><code>sponsored_pocket_impressions</code>: the count of sponsored Pocket impressions</li>
<li><code>organic_pocket_impressions</code>: the count of organic Pocket impressions</li>
<li><code>pocket_clicks</code>: count of total Pocket clicks, including both organic and sponsored</li>
<li><code>sponsored_pocket_clicks</code>: the count of sponsored Pocket clicks</li>
<li><code>organic_pocket_clicks</code>: the count of organic Pocket clicks</li>
<li><code>pocket_saves</code>: count of times “save to pocket” was selected on all tiles</li>
<li><code>sponsored_pocket_saves</code>: count of times “save to pocket” was selected on sponsored tiles</li>
<li><code>organic_pocket_saves</code>: count of times “save to pocket” was selected on organic tiles</li>
<li><code>pocket_topic_click</code>: count of clicks on “Popular Topics”</li>
<li><code>pocket_story_position</code>: the tile position</li>
</ul>
<h4 id="topsites-specific"><a class="header" href="#topsites-specific">Topsites Specific:</a></h4>
<ul>
<li><code>topsites_impressions</code>: count of impressions on topsites tiles. Currently this is only instrumented for sponsored topsites and does not include non-sponsored topsites.</li>
<li><code>sponsored_topsite_impressions</code>: count of impressions on sponsored topsites tiles. Each tile displayed contributes to this count.</li>
<li><code>topsites_clicks</code>: count of clicks on topsites tiles. Currently this is only instrumented for sponsored topsites and does not include non-sponsored topsites.</li>
<li><code>sponsored_topsite_clicks</code>: count of clicks on sponsored topsites tiles</li>
</ul>
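<p>For example, a simple sketch using the fields above to compute a daily sponsored topsites click-through rate (a plain clicks-over-impressions ratio, not necessarily the official metric definition):</p>
<pre><code class="language-sql">SELECT
  submission_date,
  SUM(sponsored_topsite_clicks) AS sponsored_topsite_clicks,
  SUM(sponsored_topsite_impressions) AS sponsored_topsite_impressions,
  SAFE_DIVIDE(SUM(sponsored_topsite_clicks), SUM(sponsored_topsite_impressions)) AS sponsored_topsites_ctr
FROM mozdata.telemetry.newtab_interactions
GROUP BY
  submission_date
ORDER BY
  submission_date
</code></pre>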
<h2 id="scheduling-23"><a class="header" href="#scheduling-23">Scheduling</a></h2>
<p>This dataset is updated daily via the telemetry-airflow infrastructure.</p>
<h2 id="schema-15"><a class="header" href="#schema-15">Schema</a></h2>
<p>The data is partitioned by <code>submission_date</code>.</p>
<h2 id="code-reference-20"><a class="header" href="#code-reference-20">Code Reference</a></h2>
<p>This dataset is generated by <a href="https://github.com/mozilla/bigquery-etl">bigquery-etl</a>. Refer to that repository for information on how to run or augment this dataset.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/other/newtab_interactions/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="urlbar-events"><a class="header" href="#urlbar-events">Urlbar Events</a></h1>
<h2 id="table-of-contents-14"><a class="header" href="#table-of-contents-14">Table of Contents</a></h2>
<ul>
<li><a href="datasets/other/urlbar_events/reference.html#introduction">Introduction</a></li>
<li><a href="datasets/other/urlbar_events/reference.html#urlbar-search-sessions">Urlbar search sessions</a>
<ul>
<li><a href="datasets/other/urlbar_events/reference.html#measurement">Measurement</a></li>
<li><a href="datasets/other/urlbar_events/reference.html#summary">Summary</a></li>
</ul>
</li>
<li><a href="datasets/other/urlbar_events/reference.html#results-impressions-and-clicks">Results, impressions and clicks</a></li>
<li><a href="datasets/other/urlbar_events/reference.html#urlbar-events-table">Urlbar events table</a>
<ul>
<li><a href="datasets/other/urlbar_events/reference.html#gotchas">Gotchas</a></li>
<li><a href="datasets/other/urlbar_events/reference.html#example-queries">Example queries</a></li>
<li><a href="datasets/other/urlbar_events/reference.html#column-descriptions">Column descriptions</a></li>
<li><a href="datasets/other/urlbar_events/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/other/urlbar_events/reference.html#schema">Schema</a></li>
<li><a href="datasets/other/urlbar_events/reference.html#code-reference">Code reference</a></li>
</ul>
</li>
</ul>
<h2 id="introduction-23"><a class="header" href="#introduction-23">Introduction</a></h2>
<p>The <code>urlbar_events</code> table, derived from Glean <code>urlbar</code> events,
provides a data source for understanding user interactions with the urlbar and search.
Its structure and fields are designed around the notion of urlbar search sessions.
This data is Desktop-only.</p>
<h2 id="urlbar-search-sessions"><a class="header" href="#urlbar-search-sessions">Urlbar search sessions</a></h2>
<p>A <strong>urlbar search session</strong> is a sequence of interactions with the
<a href="https://firefox-source-docs.mozilla.org/browser/urlbar/nontechnical-overview.html">urlbar</a>,
starting from when the urlbar receives focus,
and ending when the user navigates to a new page or focuses outside of it,
causing the result panel to close.
In this context, <strong>search</strong> means <em>using the urlbar to search for a page</em>,
not specifically <em>using a search engine</em>.</p>
<p>The following diagram shows the user interaction flow through a search session.
The most common case (&quot;user types a query and clicks on a result&quot;) is shown with <strong>bold</strong> arrows.
Unusual cases are shown with dotted lines.</p>
<pre class="mermaid">flowchart LR
A[Urlbar receives focus]
A ==&gt; B[Initial results displayed]
B ==&gt; C[User types character&lt;br&gt;&lt;br&gt;Results update]
C ==&gt; D{User takes&lt;br&gt;&lt;b&gt;event action&lt;/b&gt;}
D ==&gt;|engaged| E([Selects result&lt;br&gt;&lt;code&gt;urlbar.engagement&lt;/code&gt;])
D --&gt;|abandoned| F([Focus outside of urlbar&lt;br&gt;&lt;code&gt;urlbar.abandonment&lt;/code&gt;])
D -.-&gt;|annoyance| G([Selects result option&lt;br&gt;&lt;code&gt;urlbar.engagement&lt;/code&gt;])
E ==&gt; H[Urlbar loses focus&lt;br&gt;&lt;br&gt;Result panel closes]
A -.-&gt;|paste &amp; go| E
B --&gt;|zero prefix| D
C ==&gt; C
F --&gt; H
G -.-&gt;|panel closes| H
G -.-&gt;|panel stays open| B
E -.-&gt;|search mode| B
</pre>
<p>A search session includes one or more <strong>event actions</strong> taken by the user,
usually in response to the results that are displayed.
There are 3 types of event action:</p>
<ul>
<li><strong>Engaged:</strong> the user selects a result.
This includes pressing Enter after typing, which has the effect of selecting the first result.</li>
<li><strong>Abandoned:</strong> the user focuses outside of the urlbar without selecting a result.</li>
<li><strong>Annoyance:</strong> the user selects an auxiliary option associated with a result,
e.g. <code>Dismiss</code> in the meatball menu.</li>
</ul>
<p>The search session ends if the event action causes the panel to close, e.g. by navigating to a new page.
Most search sessions see the user typing some characters and selecting a result,
ending the session after one event action.</p>
<p>However, in some instances the event action leaves the panel open for further interaction,
e.g. selecting the <code>Dismiss</code> annoyance signal.
In such cases, the search session will contain multiple event actions.
Also, in some search sessions, the user can take an event action without typing any characters
or without results being displayed, e.g. using the <code>Paste &amp; Go</code> context menu option.</p>
<p>An event action is called <strong>terminal</strong> if it causes the session to end.
Whether or not an event action is terminal is determined <em>a posteriori</em> from its characteristics.
For the complete logic on terminal event actions, see
<a href="https://github.com/mozilla/bigquery-etl/blob/main/sql_generators/urlbar_events/templates/desktop_query.sql#L30">this code</a>.</p>
<h3 id="measurement"><a class="header" href="#measurement">Measurement</a></h3>
<p>Measurement for search sessions is collected through Glean
<a href="https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/metrics/urlbar_engagement"><code>urlbar.engagement</code></a>
and
<a href="https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/metrics/urlbar_abandonment"><code>urlbar.abandonment</code></a>
events, which record one event for each event action.
(There is also a
<a href="https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/metrics/urlbar_impression"><code>urlbar.impression</code></a>
event, but it is not currently used for data analysis).</p>
<p>These contain a snapshot of the urlbar state at the moment when the event action was taken,
such as the types of results that were showing, and the result that was selected (if any).
This means that we have information about the final set of results from which the user made a selection,
but not the intermediate result sets shown on each keystroke.</p>
<p>A search session with multiple event actions will generate multiple Glean events.
There is currently no &quot;session ID&quot; linking these together,
as the majority of search sessions generate only 1 event,
and the analytical focus is on counting occurrences rather than event sequences.
Also, the events do not contain an indicator of whether they are terminal.
This determination is made at ETL time based on the event contents.</p>
<h3 id="summary"><a class="header" href="#summary">Summary</a></h3>
<p>This table summarizes key information about the 3 types of event action:</p>
<div class="table-wrapper"><table><thead><tr><th>Event action</th><th>Terminal?</th><th>Glean event</th><th>Event extra fields of interest</th></tr></thead><tbody>
<tr><td>Engaged</td><td><ul><li>Usually yes (e.g. clicking on a result)</li><li>Sometimes no (e.g. entering search mode)</li></ul></td><td><code>urlbar.engagement</code></td><td><ul><li>Ordered list of displayed results: <code>results</code></li><li>Selected result type: <code>selected_result</code></li><li>Selected result position (1-indexed): <code>selected_position</code></li></ul></td></tr>
<tr><td>Abandoned</td><td>Yes</td><td><code>urlbar.abandonment</code></td><td>Ordered list of displayed results: <code>results</code></td></tr>
<tr><td>Annoyance</td><td><ul><li>Sometimes yes (e.g. &quot;Learn More&quot;)</li><li>Sometimes no (e.g. &quot;Dismiss&quot;)</li></ul></td><td><code>urlbar.engagement</code></td><td><ul><li>Ordered list of displayed results: <code>results</code></li><li>Selected result type: <code>selected_result</code></li><li>Selected result position (1-indexed): <code>selected_position</code></li><li>Annoyance signal (the option selected for a result): <code>engagement_type</code></li></ul></td></tr>
</tbody></table>
</div>
<h2 id="results-impressions-and-clicks"><a class="header" href="#results-impressions-and-clicks">Results, impressions and clicks</a></h2>
<p>The primary use case for this data is calculating click and impression rates
for different types of urlbar search results in order to answer Product questions.
The <a href="https://mozilla.cloud.looker.com/explore/firefox_desktop/urlbar_events">Urlbar Events Looker explore</a>
is built on top of the <code>urlbar_events</code> table to serve this need.</p>
<p>The <code>results</code> and <code>selected_result</code> fields in the Glean event extras report &quot;raw&quot; result types,
which are sometimes more granular than Product needs.
Product has developed a
<a href="https://github.com/mozilla/bigquery-etl/blob/main/sql/mozfun/norm/result_type_to_product_name/udf.sql">mapping</a>
which translates these raw values into interpretable <strong>Product result types</strong> (e.g. &quot;search suggestion&quot;).
All columns in the <code>urlbar_events</code> table containing raw result types (e.g. <code>selected_result</code>)
have a corresponding Product version (e.g. <code>product_selected_result</code>) with the mapped values.
If a raw result type does not map to any Product result type, the mapping returns <code>other</code>.</p>
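<p>For illustration, the mapping can also be applied directly in a query. The sketch below assumes the UDF is callable as <code>mozfun.norm.result_type_to_product_name</code> and accepts a raw result type string:</p>
<pre><code class="language-sql">SELECT
  selected_result,
  mozfun.norm.result_type_to_product_name(selected_result) AS mapped_product_result,
  COUNT(*) AS n
FROM
  `mozdata.firefox_desktop.urlbar_events`
WHERE
  selected_result IS NOT NULL
GROUP BY
  1,
  2
ORDER BY
  n DESC
</code></pre>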
<p>An <strong>impression</strong> is defined as a result that is showing in the result panel at event action time.
This means:</p>
<ul>
<li>We only consider 1 set of impressions per event action. As the user types characters,
they will see intermediate result sets, as the result panel updates on each keystroke.
However, these are currently not taken into account.</li>
<li>At event action time, there are usually multiple results showing, i.e. multiple impressions.
Many impression sets have 10 impressions (the default number).
The number of impressions shown on an event action is given in the <code>num_total_results</code> column.</li>
<li>An impression set may have multiple impressions of the same type,
e.g. multiple search suggestions are usually surfaced for a typed query.</li>
</ul>
<p>The ordered list of result impressions for each event action is given in the array-valued column <code>results</code>.</p>
<p>A <strong>click</strong> occurs when the user selects a result, i.e. taking an engaged event action.</p>
<ul>
<li>We use this as standard terminology, even though the user may not have physically clicked a mouse.</li>
<li>The majority of clicks are terminal: they cause a page to be loaded and the search session to end.
In a few rare cases, a click is not terminal.</li>
</ul>
<p>The type of result selected is given in the <code>selected_result</code>/<code>product_selected_result</code> columns.</p>
<p><strong>CTR</strong> can be computed in 2 ways for a given result type:</p>
<ol>
<li>num clicks / total num impressions</li>
<li>num clicks / num search sessions with at least 1 such impression</li>
</ol>
<p>We generally use (2.) for Product-focused analyses and experiments, including the
<a href="https://mozilla.cloud.looker.com/explore/firefox_desktop/urlbar_events">Looker explore</a>.
For result types that have at most 1 impression per result set (e.g. navigational),
these will be the same.
For types that tend to have multiple impressions per result set (e.g. search suggestions),
(1.) could be much lower than (2.).</p>
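<p>For comparison with the session-based CTR queries in the example queries section below, here is a sketch of definition (1.) for history results, with individual impressions in the denominator:</p>
<pre><code class="language-sql">SELECT
  COUNT(DISTINCT IF(event_action = 'engaged' AND product_selected_result = 'history', event_id, NULL))
  / COUNTIF(r.product_result_type = 'history')
FROM
  `mozdata.firefox_desktop.urlbar_events`,
  UNNEST(results) AS r
WHERE
  is_terminal
</code></pre>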
<p>An <strong>annoyance</strong> occurs when the user selects an option associated with a result, e.g. &quot;Dismiss&quot;,
without selecting the result itself.
These are usually found in the meatball menu next to the displayed result.
The <code>annoyance_signal_type</code> column gives the type of annoyance option that was selected,
and <code>selected_result</code>/<code>product_selected_result</code> give the result type with which the annoyance is associated.</p>
<p>For more examples of previously used metrics, see the
<a href="https://github.com/mozilla/metric-hub/blob/main/jetstream/outcomes/firefox_desktop/firefox_suggest.toml">Firefox Suggest Jetstream outcome</a>.</p>
<h2 id="urlbar-events-table"><a class="header" href="#urlbar-events-table">Urlbar events table</a></h2>
<p>The <a href="https://github.com/mozilla/bigquery-etl/tree/main/sql_generators/urlbar_events/templates"><code>mozdata.firefox_desktop.urlbar_events</code></a>
table contains 1 row for each Glean event (i.e. 1 row per event action) reported across all Desktop users.</p>
<p>As discussed above, most search sessions only have 1 associated row, but some have multiple.
There is no session identifier linking rows associated with the same session
(although it may be possible to infer such linkage from event sequencing).
However, the <code>is_terminal</code> column indicates whether the event action was terminal.
The event action type is listed in the <code>event_action</code> column.</p>
<p>Most of the Glean event extras fields are included in separate columns.
Additionally:</p>
<ul>
<li>The array-valued <code>results</code> column lists the ordered results showing at event action time.
Each array element is a struct with <code>result_type</code>, <code>product_result_type</code>, <code>position</code>, and <code>result_group</code>.</li>
<li><code>selected_result</code>, <code>product_selected_result</code>, <code>selected_position</code> give the selected result
associated with an engagement or annoyance.</li>
<li><code>annoyance_signal_type</code> gives the annoyance option selected, if any.</li>
<li><code>event_id</code> is a row identifier UUID. This is mainly useful when unnesting the <code>results</code> column.</li>
<li><code>glean_client_id</code>, <code>seq</code> (from the event's <code>ping_info</code>), <code>event_timestamp</code> can be used
to build event sequences and interlace with SERP events.</li>
</ul>
<p>This table summarizes the main column values associated with each event action:</p>
<div class="table-wrapper"><table><thead><tr><th>Event action</th><th><code>event_action</code></th><th><code>is_terminal</code></th><th><code>selected_result</code></th><th><code>annoyance_signal_type</code></th></tr></thead><tbody>
<tr><td>Engaged</td><td><code>engaged</code></td><td><code>true</code> or <code>false</code></td><td><code>selected_result</code> from Glean engagement event</td><td><code>null</code></td></tr>
<tr><td>Abandoned</td><td><code>abandoned</code></td><td><code>true</code></td><td><code>null</code></td><td><code>null</code></td></tr>
<tr><td>Annoyance</td><td><code>annoyance</code></td><td><code>true</code> or <code>false</code></td><td><code>selected_result</code> from Glean engagement event</td><td><code>engagement_type</code> from Glean engagement event</td></tr>
</tbody></table>
</div>
<h3 id="gotchas"><a class="header" href="#gotchas">Gotchas</a></h3>
<ul>
<li>Each search session may have multiple associated rows.
To count <strong>unique search sessions</strong> (the most common use case), condition on <code>is_terminal = true</code>.</li>
<li>To work with impressions, <code>UNNEST</code> the <code>results</code> column.</li>
</ul>
<h3 id="example-queries-16"><a class="header" href="#example-queries-16">Example queries</a></h3>
<p>To count number of search sessions:</p>
<pre><code class="language-sql">SELECT
COUNT(*)
FROM
`mozdata.firefox_desktop.urlbar_events`
WHERE
is_terminal
</code></pre>
<p>Engagement rate (proportion of search sessions ending with an engaged action):</p>
<pre><code class="language-sql">SELECT
COUNTIF(event_action = 'engaged') / COUNT(*)
FROM
`mozdata.firefox_desktop.urlbar_events`
WHERE
is_terminal
</code></pre>
<p>Impression rate for history (proportion of search sessions that ended with a history impression showing):</p>
<pre><code class="language-sql">SELECT
COUNT(DISTINCT IF(r.product_result_type = 'history', event_id, NULL))
/ COUNT(DISTINCT event_id)
FROM
`mozdata.firefox_desktop.urlbar_events`,
UNNEST(results) AS r
WHERE
is_terminal
</code></pre>
<p>Number of clicks on a history result:</p>
<pre><code class="language-sql">SELECT
COUNT(*)
FROM
`mozdata.firefox_desktop.urlbar_events`
WHERE
is_terminal
AND event_action = 'engaged'
AND product_selected_result = 'history'
</code></pre>
<p>CTR for history (denominator is search sessions that had an impression,
not number of impressions):</p>
<pre><code class="language-sql">SELECT
COUNT(DISTINCT IF(event_action = 'engaged' AND product_selected_result = 'history', event_id, NULL))
/ COUNT(DISTINCT IF(r.product_result_type = 'history', event_id, NULL))
FROM
`mozdata.firefox_desktop.urlbar_events`,
UNNEST(results) AS r
WHERE
is_terminal
</code></pre>
<p>Number of result dismissals (annoyance):</p>
<pre><code class="language-sql">SELECT
COUNT(*)
FROM
`mozdata.firefox_desktop.urlbar_events`
WHERE
event_action = 'annoyance'
AND annoyance_signal_type = 'dismiss'
</code></pre>
<h3 id="column-descriptions"><a class="header" href="#column-descriptions">Column descriptions</a></h3>
<p>Descriptions for relevant columns in the table are provided here.</p>
<p>(This will be moved to <code>bigquery-etl</code> in the future,
but tables generated dynamically using <code>sql_generators</code> don't currently support <code>schema.yaml</code>.)</p>
<div class="table-wrapper"><table><thead><tr><th>Column</th><th>Description</th></tr></thead><tbody>
<tr><td><code>event_name</code></td><td>Name of the <code>urlbar</code> Glean event represented by this row: <code>engagement</code> or <code>abandonment</code></td></tr>
<tr><td><code>event_timestamp</code></td><td>Glean <a href="https://mozilla.github.io/glean/book/user/pings/events.html#contents">event timestamp</a></td></tr>
<tr><td><code>event_id</code></td><td>Row identifier UUID. When unnesting the <code>results</code> column, use <code>COUNT(DISTINCT event_id)</code> to count events.</td></tr>
<tr><td><code>seq</code></td><td><a href="https://mozilla.github.io/glean/book/user/pings/index.html#seq"><code>ping_info.seq</code></a> from the events ping. Use together with <code>event_timestamp</code> for event sequencing.</td></tr>
<tr><td><code>normalized_engine</code></td><td>Normalized default search engine</td></tr>
<tr><td><code>pref_fx_suggestions</code></td><td>Is Firefox Suggest enabled (nonsponsored suggestions)?</td></tr>
<tr><td><code>pref_sponsored_suggestions</code></td><td>Are Firefox Suggest sponsored suggestions enabled?</td></tr>
<tr><td><code>pref_data_collection</code></td><td>Has the user opted into Firefox Suggest data collection, aka Suggest Online?</td></tr>
<tr><td><code>engagement_type</code></td><td>How the user selected the result [e.g. <code>click</code>, <code>enter</code>]</td></tr>
<tr><td><code>interaction</code></td><td>How the user started the search action [e.g. <code>typed</code>, <code>pasted</code>]</td></tr>
<tr><td><code>num_chars_typed</code></td><td>Length of the query string typed by the user</td></tr>
<tr><td><code>num_total_results</code></td><td>Number of results displayed</td></tr>
<tr><td><code>selected_position</code></td><td>Rank of the selected result, starting from 1, if any</td></tr>
<tr><td><code>selected_result</code></td><td>Raw type identifier for the selected result, if any [e.g. <code>search_suggest</code>, <code>bookmark</code>]</td></tr>
<tr><td><code>product_selected_result</code></td><td><a href="datasets/other/urlbar_events/reference.html#results-impressions-and-clicks">Product type identifier</a> for the selected result, if any [e.g. <code>wikipedia_enhanced</code>, <code>default_partner_search_suggestion</code>]</td></tr>
<tr><td><code>results</code></td><td>Array listing info about each result displayed</td></tr>
<tr><td><code>results.position</code></td><td>Display rank of this result, starting from 1</td></tr>
<tr><td><code>results.result_type</code></td><td>Raw type identifier for this result</td></tr>
<tr><td><code>results.product_result_type</code></td><td>Product type identifier for this result</td></tr>
<tr><td><code>results.result_group</code></td><td>Result group this result belongs to [e.g. <code>heuristic</code>, <code>suggest</code>]</td></tr>
<tr><td><code>event_action</code></td><td>Event action: <code>engaged</code>, <code>abandoned</code>, or <code>annoyance</code></td></tr>
<tr><td><code>is_terminal</code></td><td>Did the event action cause the search session to end? Filter on <code>is_terminal = TRUE</code> to count unique search sessions.</td></tr>
<tr><td><code>engaged_result_type</code></td><td>Raw type identifier for the selected result, if any</td></tr>
<tr><td><code>product_engaged_result_type</code></td><td>Product type identifier for the selected result, if any</td></tr>
<tr><td><code>annoyance_signal_type</code></td><td>Annoyance option selected, if any. This uses the value of <code>engagement_type</code> when <code>event_action</code> is annoyance. [e.g. <code>dismiss</code>, <code>help</code>]</td></tr>
</tbody></table>
</div>
<h3 id="scheduling-24"><a class="header" href="#scheduling-24">Scheduling</a></h3>
<p>This dataset is scheduled on Airflow and updated daily.</p>
<h3 id="schema-16"><a class="header" href="#schema-16">Schema</a></h3>
<p>The data is partitioned by <code>submission_date</code>.</p>
<h3 id="code-reference-21"><a class="header" href="#code-reference-21">Code reference</a></h3>
<p>This table is
<a href="https://github.com/mozilla/bigquery-etl/blob/main/sql_generators/urlbar_events/__init__.py">generated</a>
from a templated query defined under
<a href="https://github.com/mozilla/bigquery-etl/blob/main/sql_generators/urlbar_events/templates/desktop_query.sql"><code>bigquery_etl/sql_generators</code></a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/other/urlbar_events/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="urlbar-events-daily"><a class="header" href="#urlbar-events-daily">Urlbar Events Daily</a></h1>
<h2 id="table-of-contents-15"><a class="header" href="#table-of-contents-15">Table of Contents</a></h2>
<ul>
<li><a href="datasets/other/urlbar_events_daily/reference.html#introduction">Introduction</a></li>
<li><a href="datasets/other/urlbar_events_daily/reference.html#use-case">Use Case</a></li>
<li><a href="datasets/other/urlbar_events_daily/reference.html#urlbar-events-daily-table">Urlbar events daily table</a>
<ul>
<li><a href="datasets/other/urlbar_events_daily/reference.html#dimensions">Dimensions</a></li>
<li><a href="datasets/other/urlbar_events_daily/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/other/urlbar_events_daily/reference.html#schema">Schema</a></li>
<li><a href="datasets/other/urlbar_events_daily/reference.html#code-reference">Code reference</a></li>
</ul>
</li>
</ul>
<h2 id="introduction-24"><a class="header" href="#introduction-24">Introduction</a></h2>
<p>The <code>urlbar_events_daily</code> table, derived from <code>urlbar_events</code> (which in turn is derived from Glean <code>urlbar</code> events), provides daily aggregate counts of the different user interactions with the urlbar. This data is Desktop-only.</p>
<p>Details on the <code>urlbar_events</code> table can be found <a href="datasets/other/urlbar_events_daily/../urlbar_events/reference.html">here</a>.</p>
<p>More details about the <code>urlbar</code> can be found <a href="https://firefox-source-docs.mozilla.org/browser/urlbar/nontechnical-overview.html">here</a>.</p>
<h2 id="use-case"><a class="header" href="#use-case">Use Case</a></h2>
<p>The aim of this table is to provide accurate, up-to-date, and easily accessible data to our Business Development partners and Product Management stakeholders.</p>
<p>This table also powers related dashboards in Looker and speeds up processing and display time for those dependent dashboards.</p>
<h2 id="urlbar-events-daily-table"><a class="header" href="#urlbar-events-daily-table">Urlbar events daily table</a></h2>
<h3 id="dimensions"><a class="header" href="#dimensions">Dimensions</a></h3>
<p>This table is grouped by the following dimensions:</p>
<ul>
<li><code>submission_date</code></li>
<li><code>normalized_country_code</code></li>
<li><code>normalized_channel</code></li>
<li><code>firefox_suggest_enabled</code></li>
<li><code>sponsored_suggestions_enabled</code></li>
<li><code>product_result_type</code></li>
</ul>
<p>These are the aggregate counts available in this table:</p>
<ul>
<li><code>urlbar_impressions</code></li>
<li><code>urlbar_clicks</code></li>
<li><code>urlbar_annoyances</code></li>
<li><code>urlbar_sessions</code></li>
</ul>
<p>For more information about the exact definitions of these metrics, see the <a href="datasets/other/urlbar_events_daily/../urlbar_events/reference.html#measurement">urlbar_events measurement documentation</a>.</p>
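<p>As an example, assuming the table is available as <code>mozdata.firefox_desktop.urlbar_events_daily</code> (an assumption; confirm the dataset location before relying on it), the following sketch aggregates these counts by Product result type for a single example day:</p>
<pre><code class="language-sql">SELECT
  product_result_type,
  SUM(urlbar_impressions) AS impressions,
  SUM(urlbar_clicks) AS clicks,
  SAFE_DIVIDE(SUM(urlbar_clicks), SUM(urlbar_impressions)) AS ctr
FROM
  `mozdata.firefox_desktop.urlbar_events_daily`
WHERE
  submission_date = '2023-10-01'
GROUP BY
  product_result_type
ORDER BY
  impressions DESC
</code></pre>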
<h3 id="scheduling-25"><a class="header" href="#scheduling-25">Scheduling</a></h3>
<p>This dataset is scheduled on Airflow and updated daily.</p>
<h3 id="schema-17"><a class="header" href="#schema-17">Schema</a></h3>
<p>The data is partitioned by <code>submission_date</code>.</p>
<h3 id="code-reference-22"><a class="header" href="#code-reference-22">Code reference</a></h3>
<p>This table is created from the following
<a href="https://github.com/mozilla/bigquery-etl/blob/main/sql/moz-fx-data-shared-prod/firefox_desktop_derived/urlbar_events_daily_v1/query.sql">query</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/other/urlbar_events_daily/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="serp-events"><a class="header" href="#serp-events">SERP Events</a></h1>
<h2 id="table-of-contents-16"><a class="header" href="#table-of-contents-16">Table of Contents</a></h2>
<ul>
<li><a href="datasets/other/serp_events/reference.html#introduction">Introduction</a></li>
<li><a href="datasets/other/serp_events/reference.html#serp-impression-model">SERP impression model</a>
<ul>
<li><a href="datasets/other/serp_events/reference.html#ad-detection-and-visibility">Ad detection and visibility</a></li>
</ul>
</li>
<li><a href="datasets/other/serp_events/reference.html#measurement">Measurement</a>
<ul>
<li><a href="datasets/other/serp_events/reference.html#limitations-of-ad-impression-detection">Limitations of ad impression detection</a></li>
</ul>
</li>
<li><a href="datasets/other/serp_events/reference.html#ad-impressions-and-clicks">Ad impressions and clicks</a></li>
<li><a href="datasets/other/serp_events/reference.html#serp-events-table">SERP events table</a>
<ul>
<li><a href="datasets/other/serp_events/reference.html#ad-component-tagging">Ad component tagging</a></li>
<li><a href="datasets/other/serp_events/reference.html#assumptions-on-event-sequences">Assumptions on event sequences</a></li>
<li><a href="datasets/other/serp_events/reference.html#gotchas">Gotchas</a></li>
<li><a href="datasets/other/serp_events/reference.html#example-queries">Example queries</a></li>
<li><a href="datasets/other/serp_events/reference.html#column-descriptions">Column descriptions</a></li>
<li><a href="datasets/other/serp_events/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/other/serp_events/reference.html#schema">Schema</a></li>
<li><a href="datasets/other/serp_events/reference.html#code-reference">Code reference</a></li>
</ul>
</li>
</ul>
<h2 id="introduction-25"><a class="header" href="#introduction-25">Introduction</a></h2>
<p>The <code>serp_events</code> table, derived from Glean <code>serp</code> events, provides a source
for understanding user interactions with search engine result pages (SERPs).
It is structured around SERP page loads and engagements
with different UI components on the SERP.
This data is Desktop-only.</p>
<h2 id="serp-impression-model"><a class="header" href="#serp-impression-model">SERP impression model</a></h2>
<p>A <strong>SERP impression</strong> consists of a SERP page load,
together with any user engagements with the links and UI features displayed on the page.
It starts from when the SERP is loaded.
Soon after the page loads, the browser runs a scan to detect sponsored search results (i.e. ads).
The user may engage with the page by clicking on a link or UI element.
The SERP impression ends when the user navigates away or closes the page.</p>
<p>The following diagram outlines the flow through a SERP impression,
along with the Glean events which are sent at different points.</p>
<pre class="mermaid">flowchart LR
A[User loads SERP] --&gt; AA([&lt;code&gt;serp.impression&lt;/code&gt;])
AA --&gt; B{Ads loaded?}
B --&gt;|Yes| C([&lt;code&gt;serp.ad_impression&lt;/code&gt;])
C --&gt;|Each component| C
B --&gt;|No| D{User engages?}
C --&gt; D
D --&gt;|No| E([&lt;code&gt;serp.abandonment&lt;/code&gt;])
D --&gt;|Yes| F([&lt;code&gt;serp.engagement&lt;/code&gt;])
F --&gt;|User engages again?| F
F --&gt; G[User leaves SERP]
E --&gt; G
AA -.-&gt; H{{User engages\nbefore ad\ncategorization\ncomplete}}
H -.-&gt; E
classDef edgecase fill:lightgrey,stroke:grey,font-size:10pt
class H edgecase
linkStyle 11,12 stroke-width:1px
</pre>
<p>A SERP impression is considered <strong>engaged</strong> if the user clicks at least once
on a search result link or UI component (out of the ones we track).
If the user leaves the SERP without engaging, the impression is considered <strong>abandoned</strong>.</p>
<h3 id="ad-detection-and-visibility"><a class="header" href="#ad-detection-and-visibility">Ad detection and visibility</a></h3>
<p>Depending on the search provider, ads may be displayed across different
<a href="https://docs.google.com/document/d/1OxixC4r7hytWtwsHY0tDkq3rlY9vocmf5o0ija07A9o/edit#bookmark=id.nzlxxwj74kro">display components</a>
(areas of the SERP with specific UI treatments), such as inline sponsored results,
a carousel showing result tiles streaming horizontally across the top,
or a sidebar with result tiles laid out in a grid.
The ad detection procedure scans each of these components to check whether ad links are present.</p>
<p>Ad detection checks for ad links that are <strong>loaded</strong>, i.e. present in the DOM.
Loaded ad links may or may not be visible to the user.</p>
<ul>
<li>Ad links are considered <strong>visible</strong> if the user has an opportunity to see them,
i.e. display properties of the DOM element containing the ad link make them visible,
and they are in the area of the page showing on the user's screen.</li>
<li>They are considered <strong>blocked</strong> if the display properties of the DOM element
containing the ad link appear to have been altered by an ad blocker so as to make it invisible.</li>
<li>Ad links that are neither visible nor explicitly blocked are considered <strong>not showing</strong>.
These may be hidden by the SERP or outside of the area of the page the user has seen,
e.g. &quot;below the fold&quot;, or additional results in the carousel the user needs to scroll to.</li>
</ul>
<p>Usually, if an ad blocker is in use, all loaded ads will be blocked.
Accordingly, when all loaded ads are blocked, we infer an <strong>ad blocker to be in use</strong>.</p>
<h2 id="measurement-1"><a class="header" href="#measurement-1">Measurement</a></h2>
<p>Measurement for SERP impressions is collected through Glean <code>serp</code> category events.
All events include the <code>impression_id</code> field in their event extras,
which is used to link together events associated with the same SERP impression.
SERP events are only implemented for specific search engines.</p>
<p>When a user loads a SERP, a
<a href="https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/metrics/serp_impression"><code>serp.impression</code></a>
event is recorded with a newly-generated <code>impression_id</code>,
containing some top-level information about the impression and the search that led to it.
Here, a &quot;SERP impression&quot; means a single page load,
not a sequence of pages associated with the same search term
(which might be called a &quot;search session&quot;).
If the user loads Page 2 of the search results, or opens the Shopping results page,
that is considered a separate SERP impression and generates a new <code>impression_id</code>.</p>
<p>When ad detection runs, a
<a href="https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/metrics/serp_ad_impression"><code>serp.ad_impression</code></a>
event is generated for each display component containing at least 1 loaded element.
It records counts of:</p>
<ul>
<li>loaded elements: <code>ads_loaded</code></li>
<li>elements which are visible to the user: <code>ads_visible</code></li>
<li>elements which appear to have been blocked: <code>ads_hidden</code>.</li>
</ul>
<p>These counts have the following properties:</p>
<ul>
<li><code>ads_loaded &gt; 0</code></li>
<li><code>0 &lt;= ads_visible + ads_hidden &lt;= ads_loaded</code></li>
<li>Usually <code>ads_hidden = 0</code> or <code>= ads_loaded</code>.</li>
</ul>
<p>Despite the <code>ads_</code> prefix in the naming,
<code>ad_impression</code> events are also reported
for certain other page components which do not contain ad links and are not monetizable,
such as the shopping tab and Google's refined search buttons.
For these, <code>ads_loaded</code> tracks whether the feature was on the page, and is either 0 or 1.
For components containing ads, the counts refer to ad links.</p>
<p>A separate
<a href="https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/metrics/serp_engagement"><code>serp.engagement</code></a>
event is recorded each time the user clicks on one of the instrumented UI components.
These include the ad components, organic links on the page,
as well as certain other non-ad page components.
Along with clicks on links, some components report additional engagement types,
such as the Expand (right arrow) button for the carousel,
or submitting a new search from the search box.</p>
<p>The following table summarizes impressions and engagements instrumented for the main components.
Other components or engagement actions may be instrumented in the future;
for the most up-to-date list, refer to the
<a href="https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/metrics/serp_ad_impression"><code>serp.ad_impression</code></a>
and
<a href="https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/metrics/serp_engagement"><code>serp.engagement</code></a>
event documentation.</p>
<div class="table-wrapper">
<table>
<thead>
<tr>
<th></th>
<th>Component</th>
<th>Impression reported</th>
<th>Possible engagement actions</th>
<th>Search engines supported</th>
</tr>
</thead>
<tbody>
<tr>
<th rowspan="5">Ad components</th>
<td><code>ad_carousel</code></td>
<td><code>serp.ad_impression</code></td>
<td><code>clicked</code>, <code>expanded</code></td>
<td>all</td>
</tr>
<tr>
<td><code>ad_image_row</code></td>
<td><code>serp.ad_impression</code></td>
<td><code>clicked</code></td>
<td>all</td>
</tr>
<tr>
<td><code>ad_link</code></td>
<td><code>serp.ad_impression</code></td>
<td><code>clicked</code></td>
<td>all</td>
</tr>
<tr>
<td><code>ad_sidebar</code></td>
<td><code>serp.ad_impression</code></td>
<td><code>clicked</code></td>
<td>all except Ecosia</td>
</tr>
<tr>
<td><code>ad_sitelink</code></td>
<td><code>serp.ad_impression</code></td>
<td><code>clicked</code></td>
<td>all</td>
</tr>
<tr>
<th rowspan="5">Other page components</th>
<td><code>refined_search_buttons</code></td>
<td><code>serp.ad_impression</code></td>
<td><code>clicked</code>, <code>expanded</code></td>
<td>Google only</td>
</tr>
<tr>
<td><code>shopping_tab</code></td>
<td><code>serp.ad_impression</code></td>
<td><code>clicked</code></td>
<td>all</td>
</tr>
<tr>
<td><code>cookie_banner</code></td>
<td><code>serp.ad_impression</code></td>
<td><code>clicked_accept</code>, <code>clicked_reject</code>, <code>clicked_more_options</code></td>
<td>all</td>
</tr>
<tr>
<td><code>incontent_searchbox</code></td>
<td>None (impression assumed)</td>
<td><code>clicked</code>, <code>submitted</code></td>
<td>all</td>
</tr>
<tr>
<td><code>non_ads_link</code></td>
<td>None (impression assumed)</td>
<td><code>clicked</code></td>
<td>all</td>
</tr>
</tbody>
</table>
</div>
<p>If the user leaves the page without making an engagement,
a
<a href="https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/metrics/serp_abandonment"><code>serp.abandonment</code></a>
event is generated, indicating the <code>reason</code> for abandonment:
navigating to a different page, or closing the tab or window.</p>
<p>One subtlety here is that there is no explicit signal for when an engaged SERP impression ends.
The user may leave the SERP open a long time and keep clicking on different links or components.
To count engagements, we need to aggregate all <code>engagement</code> events for that <code>impression_id</code>,
and these may come in over some undefined period of time.
We handle this by imposing a 2-day maximum SERP impression length at ETL time,
as described <a href="datasets/other/serp_events/reference.html#assumptions-on-event-sequences">below</a>.</p>
<h3 id="limitations-of-ad-impression-detection"><a class="header" href="#limitations-of-ad-impression-detection">Limitations of ad impression detection</a></h3>
<p>As the ad detection procedure runs at most once for a SERP impression
against a snapshot of the page,
ad impression and click reporting will be subject to some small systematic bias.
This is important to be aware of, although no explicit correction is used at present.</p>
<p>As a user continues to interact with the SERP,
it is possible for additional ads to become visible and for the user to engage with those.
For example, if the user keeps scrolling further down the page,
they may begin to see ads which were considered &quot;not showing&quot; by ad detection.
Engagements with these ads will be recorded, but the impressions may not,
meaning that the number of visible ad links may be undercounted.</p>
<p>There is also an edge case in which the user may click on a result
before ad detection has time to complete;
such impressions are reported as abandoned, and ad impressions and clicks are ignored.</p>
<p>However, the Legacy Telemetry ad click measurement will count all of these cases as ad clicks,
since it checks links for ads at click time rather than taking a snapshot.
This means that the <code>serp</code> events will undercount ad clicks somewhat relative to Legacy Telemetry.</p>
<p>The different cases are described in the following table:</p>
<div class="table-wrapper"><table><thead><tr><th>Engagement target</th><th>Click reporting</th><th>Link impression reporting</th></tr></thead><tbody>
<tr><td>Non-ad component</td><td><code>serp.engagement</code> for target</td><td>No explicit reporting, impression assumed</td></tr>
<tr><td>Ad detected as visible</td><td><code>serp.engagement</code> for ad component</td><td>Included in <code>ads_visible</code> count of <code>serp.ad_impression</code> for component</td></tr>
<tr><td>Ad detected but not visible<p>E.g. user scrolls to reveal an ad that was on the page but not in the visible area when ad detection was run</td><td><code>serp.engagement</code> for ad component</td><td>Included in <code>ads_loaded</code> count but not <code>ads_visible</code> count of <code>serp.ad_impression</code> for component</td></tr>
<tr><td>Ad not previously detected<p>E.g. user scrolls down and more results are loaded automatically. Includes ads that were not on the page when ad detection was run</td><td><code>serp.engagement</code> for <code>non_ads_link</code></td><td>Not included in <code>serp.ad_impression</code></td></tr>
<tr><td>Engagement before ad detection completed</td><td>None. A <code>serp.abandonment</code> is sent instead.</td><td>None</td></tr>
</tbody></table>
</div>
<h2 id="ad-impressions-and-clicks"><a class="header" href="#ad-impressions-and-clicks">Ad impressions and clicks</a></h2>
<p>One of the main applications of this data is to compute ad impressions and clicks per SERP impression.
As discussed above, a SERP impression may include several ad links across different display components,
as well as organic links, and see multiple engagements with any of these.
Depending on the use case, ad impressions and clicks may be viewed either per-component or per-SERP impression.</p>
<p>To count impressions and clicks, we can either count individual ads and clicks,
or the number of page loads with at least one ad or click.
The latter is usually preferred, since the former could give CTR values larger than 1
and is more susceptible to issues described <a href="datasets/other/serp_events/reference.html#limitations-of-ad-impression-detection">above</a>.
The rate of individual ad impressions per component may be of interest as well.</p>
<p>A display component in a SERP impression is said to have:</p>
<ul>
<li>an <strong>ad impression</strong> if it had at least 1 visible ad</li>
<li>a <strong>click</strong> if it had at least 1 visible ad and at least 1 click engagement.</li>
</ul>
<p>This means that, for a given SERP impression and display component,
<em>ad impression</em> and <em>click</em> are both binary 0/1 variables.
<strong>CTR</strong> for ads is then defined as <code>clicks / ad impressions</code>, and will be between 0 and 1.
We can also compute CTR for components that don't have explicit impression reporting,
such as organic results, by assuming 1 impression per SERP.</p>
<p>Impressions, clicks, and CTR can be computed per SERP impression instead
by considering an impression or click to have occurred if at least 1 display component had one.</p>
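<p>For example, using the impression-level count fields documented in the column descriptions below,
a minimal sketch of SERP-level ad impression counts, click counts, and CTR looks like:</p>
<pre><code class="language-sql">SELECT
  -- a SERP impression counts as an ad impression if at least 1 ad was visible
  COUNTIF(num_ads_visible &gt; 0) AS serp_with_ad_impression,
  -- ...and as an ad click if it also had at least 1 ad click
  COUNTIF(num_ads_visible &gt; 0 AND num_ad_clicks &gt; 0) AS serp_with_ad_click,
  SAFE_DIVIDE(
    COUNTIF(num_ads_visible &gt; 0 AND num_ad_clicks &gt; 0),
    COUNTIF(num_ads_visible &gt; 0)
  ) AS serp_ad_ctr
FROM
  `mozdata.firefox_desktop.serp_events`
</code></pre>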
<p>If a component has ads loaded, and they are all hidden,
an <strong>ad blocker is inferred</strong> to be in use on the component.
At the SERP level, an ad blocker is inferred to be in use if it is inferred on at least 1 ad component.</p>
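<p>As an illustration only (the table described below already exposes <code>blocker_inferred</code> and
<code>ad_blocker_inferred</code> fields), the component-level inference can be expressed over the
per-component counts like this:</p>
<pre><code class="language-sql">SELECT
  impression_id,
  component,
  -- if all loaded ads are blocked, infer an ad blocker is in use for this component
  (num_loaded &gt; 0 AND num_blocked = num_loaded) AS blocker_inferred_recomputed
FROM
  `mozdata.firefox_desktop.serp_events`, UNNEST(ad_components)
WHERE
  num_loaded &gt; 0
</code></pre>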
<h2 id="serp-events-table"><a class="header" href="#serp-events-table">SERP events table</a></h2>
<p>The
<a href="https://github.com/mozilla/bigquery-etl/blob/main/sql_generators/serp_events_v2/templates/view.sql"><code>mozdata.firefox_desktop.serp_events</code></a>
table has <strong>1 row per SERP impression</strong> (indexed by <code>impression_id</code>),
combining information from all <code>serp</code> event types
recorded for that <code>impression_id</code>.
The <code>submission_date</code> column gives the submission date of the <code>serp.impression</code> event.</p>
<p>Alongside impression-level fields and summaries,
the table has 3 array-valued fields:</p>
<ul>
<li><code>ad_components</code>, with 1 entry per <a href="datasets/other/serp_events/reference.html#measurement">ad component</a>,
listing counts of impressions and engagements</li>
<li><code>non_ad_impressions</code>, with 1 entry per non-ad page component,
listing impression counts</li>
<li><code>non_ad_engagements</code>, with 1 entry per non-ad page component and engagement type,
listing engagement counts.</li>
</ul>
<p>These arrays only include components with at least one non-zero count.
For SERP impressions with no impressions or engagements,
the corresponding arrays will be empty.</p>
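<p>For example, engagement counts by non-ad component and action can be obtained by unnesting
<code>non_ad_engagements</code> (a minimal sketch):</p>
<pre><code class="language-sql">SELECT
  component,
  action,
  SUM(num_engagements) AS total_engagements
FROM
  `mozdata.firefox_desktop.serp_events`, UNNEST(non_ad_engagements)
GROUP BY
  1,
  2
ORDER BY
  3 DESC
</code></pre>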
<p>SERP impressions with no engagements are considered abandoned and have a non-null <code>abandon_reason</code>.</p>
<h3 id="ad-component-tagging"><a class="header" href="#ad-component-tagging">Ad component tagging</a></h3>
<p>Tagging components as containing ads,
and computing related fields such as <code>ad_components</code> and <code>num_ad_clicks</code>,
is implemented at the
<a href="https://github.com/mozilla/bigquery-etl/blob/main/sql_generators/serp_events_v2/templates/view.sql">view layer</a>
using the
<a href="https://github.com/mozilla/bigquery-etl/blob/main/sql/mozfun/serp_events/is_ad_component/udf.sql"><code>is_ad_component</code> UDF</a>.</p>
<p>The underlying
<a href="https://github.com/mozilla/bigquery-etl/blob/main/sql_generators/serp_events_v2/templates/desktop_query.sql">derived table</a>
instead has 2 array-valued fields:</p>
<ul>
<li><code>component_impressions</code>, with 1 entry per component reported in a <code>serp.ad_impression</code> event</li>
<li><code>engagements</code>, with 1 entry per component and engagement action reported in a <code>serp.engagement</code> event.</li>
</ul>
<h3 id="assumptions-on-event-sequences"><a class="header" href="#assumptions-on-event-sequences">Assumptions on event sequences</a></h3>
<p>We expect to see the following events reported for a given <code>impression_id</code>:</p>
<ul>
<li>1 <code>serp.impression</code> event</li>
<li>Either 1 <code>serp.abandonment</code> event or else 1 or more <code>serp.engagement</code> events</li>
<li>0 or more <code>serp.ad_impression</code> events, with at most 1 per component</li>
</ul>
<p>Impression IDs for events which don't meet these requirements
are excluded from the table (this only affects a handful of impressions).</p>
<p>As discussed above,
a user could potentially keep a SERP open in their browser for multiple days
and keep recording clicks over that time,
as impression IDs don't have an inherent expiration.
In filling the table,
we only allow events for an impression ID to span at most <strong>2 consecutive submission dates</strong>:
<code>serp</code> events with a submission date 2 or more days after
the first submission date observed for that impression ID are ignored<sup class="footnote-reference"><a href="#1">1</a></sup>.
As a result, the <code>serp_events</code> table has a <strong>2-day lag</strong> in its data
rather than the 1-day lag present for most other datasets.</p>
<p>On day <code>D</code>, the ETL logic looks like the following:</p>
<ol>
<li>Pull all <code>serp</code> events with submission dates <code>D-2</code> or <code>D-1</code></li>
<li>Retain event sequences (sharing a common <code>impression_id</code>)
meeting the above requirements whose <code>serp.impression</code> event has submission date <code>D-2</code></li>
<li>Compute 1 row for each event sequence and insert into the table
with submission date <code>D-2</code>.</li>
</ol>
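<p>The date-window filtering in steps 1 and 2 can be sketched as follows.
This is <em>not</em> the actual ETL query (see the code reference below);
<code>raw_serp_events</code> is a hypothetical stand-in for the Glean events source,
populated with a few literal rows so the snippet is self-contained:</p>
<pre><code class="language-sql">DECLARE run_date DATE DEFAULT DATE '2024-01-03';  -- day D

WITH raw_serp_events AS (
  SELECT 'imp-1' AS impression_id, DATE '2024-01-01' AS submission_date, 'serp.impression' AS event_name
  UNION ALL
  SELECT 'imp-1', DATE '2024-01-02', 'serp.engagement'
  UNION ALL
  -- third consecutive date for this impression: ignored by the 2-day window
  SELECT 'imp-1', DATE '2024-01-03', 'serp.engagement'
),
window_events AS (
  -- step 1: pull all serp events with submission dates D-2 or D-1
  SELECT
    *
  FROM
    raw_serp_events
  WHERE
    submission_date BETWEEN DATE_SUB(run_date, INTERVAL 2 DAY)
    AND DATE_SUB(run_date, INTERVAL 1 DAY)
)
-- step 2: retain sequences whose serp.impression event has submission date D-2
SELECT
  impression_id,
  COUNT(*) AS num_events_in_window
FROM
  window_events
WHERE
  impression_id IN (
    SELECT impression_id
    FROM window_events
    WHERE event_name = 'serp.impression'
      AND submission_date = DATE_SUB(run_date, INTERVAL 2 DAY)
  )
GROUP BY
  1
</code></pre>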
<h3 id="gotchas-1"><a class="header" href="#gotchas-1">Gotchas</a></h3>
<ul>
<li>The table fills at a 2-day lag: the most recent submission date in the table is 2 days ago, not yesterday.</li>
<li>Use <code>num_ads_visible</code> or <code>ad_components.num_visible</code> to count ad impressions,
and <code>num_ad_clicks</code> or <code>ad_components.num_clicks</code> to count ad clicks.
The table does not explicitly require <code>num_visible &gt; 0</code> when <code>num_clicks &gt; 0</code>.</li>
<li><code>ad_component.blocker_inferred</code> applies individually to each ad component,
and a single <code>impression_id</code> may have different values of <code>blocker_inferred</code> for different components.
The impression-level field <code>ad_blocker_inferred</code> is <code>true</code> if
any ad component has <code>blocker_inferred = true</code>.</li>
<li>Ad blocker use can only be inferred when ads are loaded
(which is a minority of all SERP impressions).
If ads are not loaded, <code>ad_blocker_inferred</code> will report <code>false</code>,
even though there is not enough information to make a determination.</li>
<li>The array-valued fields will contain empty arrays rather than <code>NULL</code>s
when there are no corresponding entries.
For example, if a SERP impression has neither impressions nor engagements for ad components,
<code>ad_components</code> will be <code>[]</code>.</li>
</ul>
<h3 id="example-queries-17"><a class="header" href="#example-queries-17">Example queries</a></h3>
<p>Number of engaged and abandoned SERP impressions:</p>
<pre><code class="language-sql">SELECT
IF(abandon_reason IS NULL, 'engaged', 'abandoned') AS session_type,
COUNT(*) AS num_serp
FROM
`mozdata.firefox_desktop.serp_events`
GROUP BY
1
</code></pre>
<p>Number of SERP impressions with ads loaded:</p>
<pre><code class="language-sql">SELECT
num_ads_loaded &gt; 0 AS has_ads_loaded,
COUNT(*) AS num_serp
FROM
`mozdata.firefox_desktop.serp_events`
GROUP BY
1
</code></pre>
<p>Number of SERP impression-level ad impressions and clicks:</p>
<pre><code class="language-sql">SELECT
COUNT(*) as num_with_ad_impression,
COUNTIF(num_ad_clicks &gt; 0) as num_with_ad_click,
FROM
`mozdata.firefox_desktop.serp_events`
WHERE
num_ads_visible &gt; 0
</code></pre>
<p>Proportion of loaded ads that are visible, by search engine &amp; component:</p>
<pre><code class="language-sql">SELECT
search_engine,
component,
SAFE_DIVIDE(SUM(num_visible), SUM(num_loaded)) as prop_visible
FROM
`mozdata.firefox_desktop.serp_events`, UNNEST(ad_components)
GROUP BY
1,
2
ORDER BY
1,
2
</code></pre>
<p>Number of SERP impressions with ads loaded and an ad blocker in use:</p>
<pre><code class="language-sql">SELECT
ad_blocker_inferred,
COUNT(*) as num_serp
FROM
`mozdata.firefox_desktop.serp_events`
WHERE
num_ads_loaded &gt; 0
GROUP BY
1
</code></pre>
<p>Per-component ad impression and click-through rates, among sessions with ads showing:</p>
<pre><code class="language-sql">SELECT
component,
SAFE_DIVIDE(SUM(num_visible), COUNT(DISTINCT impression_id)) AS ad_imp_rate,
-- only count clicks when ads are visible
SAFE_DIVIDE(
COUNT(DISTINCT IF(num_clicks &gt; 0, impression_id, NULL)),
COUNT(DISTINCT impression_id)
) AS ad_ctr
FROM
`mozdata.firefox_desktop.serp_events`, UNNEST(ad_components)
WHERE
num_visible &gt; 0
GROUP BY
1
</code></pre>
<p>Abandonment reason distribution:</p>
<pre><code class="language-sql">SELECT
abandon_reason,
COUNT(*) AS num_serp
FROM
`mozdata.firefox_desktop.serp_events`
WHERE
abandon_reason IS NOT NULL
GROUP BY
1
</code></pre>
<p>Number of SERP impressions with a shopping tab visible:</p>
<pre><code class="language-sql">SELECT
EXISTS(
SELECT * FROM UNNEST(non_ad_impressions) AS x
WHERE x.component = 'shopping_tab' AND x.num_elements_loaded &gt; 0
) AS has_shopping_tab,
COUNT(*) AS num_serp
FROM
`mozdata.firefox_desktop.serp_events`
GROUP BY
1
</code></pre>
<h3 id="column-descriptions-1"><a class="header" href="#column-descriptions-1">Column descriptions</a></h3>
<p>The <code>v2</code> table has 1 row per SERP impression, each representing a single page load of a SERP.
Most columns contain impression-level properties.
There are also 3 array-valued columns listing impressions and engagements by component.</p>
<div class="table-wrapper"><table><thead><tr><th>Column</th><th>Description</th></tr></thead><tbody>
<tr><td><code>impression_id</code></td><td>UUID identifying SERP page loads. Use <code>COUNT(DISTINCT impression_id)</code> to count unique SERP impressions when <code>CROSS JOIN UNNEST</code>ing the array-valued columns.</td></tr>
<tr><td><code>ping_seq</code></td><td><code>ping_info.seq</code> from the events ping. Use together with <code>event_timestamp</code> for event sequencing.</td></tr>
<tr><td><code>event_timestamp</code></td><td>Glean event timestamp for the <code>serp.impression</code> event corresponding to the SERP page load.</td></tr>
<tr><td><code>is_shopping_page</code></td><td><code>true</code> when the SERP is a shopping page, resulting from clicking on the &quot;Shopping&quot; tab; <code>false</code> otherwise.</td></tr>
<tr><td><code>is_private</code></td><td><code>true</code> when the SERP was loaded while in Private Browsing Mode; <code>false</code> otherwise.</td></tr>
<tr><td><code>is_signed_in</code></td><td><code>true</code> when the SERP was loaded while signed into a search provider account; <code>false</code> otherwise.</td></tr>
<tr><td><code>search_engine</code></td><td><code>google</code>, <code>bing</code>, <code>duckduckgo</code>, <code>ecosia</code> (only these support SERP events currently).</td></tr>
<tr><td><code>sap_source</code></td><td>How the user arrived at the SERP [e.g. <code>urlbar</code>, <code>follow_on_from_refine_on_SERP</code>]. There are a number of cases where this will be <code>unknown</code>, e.g. clicking on a link that opens a new SERP, or clicking on a history result containing a SERP URL.</td></tr>
<tr><td><code>is_tagged</code></td><td>Whether the search is tagged (<code>true</code>) or organic (<code>false</code>).</td></tr>
<tr><td><code>abandon_reason</code></td><td>Why the SERP is deemed abandoned: <code>tab_close</code>, <code>window_close</code>, <code>navigation</code>, or <code>null</code> if not abandoned.</td></tr>
<tr><td><code>ad_components</code></td><td>Array with 1 entry for each ad component which had either an impression or engagement, <code>[]</code> if none.</td></tr>
<tr><td><code>ad_components.component</code></td><td>SERP display component containing ad links [e.g. <code>ad_link</code>, <code>ad_carousel</code>].</td></tr>
<tr><td><code>ad_components.num_loaded</code></td><td>Number of ads loaded in the component. They may or may not be visible on the page, depending on ad blocking and the display properties of the component.</td></tr>
<tr><td><code>ad_components.num_visible</code></td><td>Number of ads visible to the user in the component.</td></tr>
<tr><td><code>ad_components.num_blocked</code></td><td>Number of ads blocked by an ad blocker in the component.</td></tr>
<tr><td><code>ad_components.num_notshowing</code></td><td>Number of ads in the component which are loaded but not visible, and not blocked by an ad blocker. For example, ads in the carousel that will be shown on clicking the &quot;Expand&quot; button.</td></tr>
<tr><td><code>ad_components.num_clicks</code></td><td>Number of clicks on ad links in the component.</td></tr>
<tr><td><code>ad_components.num_other_engagements</code></td><td>Number of engagements in the component which are not ad clicks. E.g. clicking &quot;Expand&quot; for the carousel.</td></tr>
<tr><td><code>ad_components.blocker_inferred</code></td><td><code>true</code> if all loaded ads are blocked, in which case we infer an ad blocker is in use in the component; <code>false</code> otherwise. Note that the same SERP impression can have <code>blocker_inferred = true</code> for some ad components and <code>false</code> for others.</td></tr>
<tr><td><code>non_ad_engagements</code></td><td>Array with 1 entry for each non-ad component (which had an engagement) and engagement action, <code>[]</code> if none.</td></tr>
<tr><td><code>non_ad_engagements.component</code></td><td>SERP display component not containing ad links [e.g. <code>non_ads_link</code>, <code>shopping_tab</code>].</td></tr>
<tr><td><code>non_ad_engagements.action</code></td><td>Engagement action taken in the component.</td></tr>
<tr><td><code>non_ad_engagements.num_engagements</code></td><td>Number of engagements of that action type taken in the component.</td></tr>
<tr><td><code>non_ad_impressions</code></td><td>Array with 1 entry for each non-ad component which had an impression, <code>[]</code> if none.</td></tr>
<tr><td><code>non_ad_impressions.component</code></td><td>SERP display component not containing ad links [e.g. <code>shopping_tab</code>, <code>refined_search_buttons</code>].</td></tr>
<tr><td><code>non_ad_impressions.num_elements_loaded</code></td><td>Number of instrumented elements loaded in the component. They may or may not be visible on the page, depending on ad blocking and the display properties of the component. For many non-ad components this will be either 0 or 1.</td></tr>
<tr><td><code>non_ad_impressions.num_elements_visible</code></td><td>Number of instrumented elements visible to the user in the component.</td></tr>
<tr><td><code>non_ad_impressions.num_elements_blocked</code></td><td>Number of instrumented elements blocked by an ad blocker in the component.</td></tr>
<tr><td><code>non_ad_impressions.num_elements_notshowing</code></td><td>Number of instrumented elements in the component which are loaded but not visible, and not blocked by an ad blocker.</td></tr>
<tr><td><code>num_ad_clicks</code></td><td>Total number of clicks on links in ad components for the SERP page load.</td></tr>
<tr><td><code>num_non_ad_link_clicks</code></td><td>Total number of clicks on organic result links (<code>non_ads_link</code> target) for the SERP page load.</td></tr>
<tr><td><code>num_other_engagements</code></td><td>Total number of engagements for the SERP page load which are neither ad clicks nor organic link clicks. These include <code>expanded</code> actions on the <code>ad_carousel</code> component, as well as clicks or other engagement actions on non-ad components.</td></tr>
<tr><td><code>num_ads_loaded</code></td><td>Total number of ads loaded in ad components for the SERP page load. They may or may not be visible on the page, depending on ad blocking and the display properties of the page.</td></tr>
<tr><td><code>num_ads_visible</code></td><td>Total number of ads visible to the user in ad components for the SERP page load.</td></tr>
<tr><td><code>num_ads_blocked</code></td><td>Total number of ads blocked by an ad blocker in ad components for the SERP page load.</td></tr>
<tr><td><code>num_ads_notshowing</code></td><td>Total number of ads which are loaded but not visible, and not blocked by an ad blocker, for the SERP page load. For example, ads in the carousel that will be shown on clicking the &quot;Expand&quot; button. Use this to count &quot;ads that are available but not visible&quot;.</td></tr>
<tr><td><code>ad_blocker_inferred</code></td><td><code>true</code> if all loaded ads are blocked in at least one ad component, in which case we infer an ad blocker is in use on the SERP; <code>false</code> otherwise.</td></tr>
</tbody></table>
</div>
<h3 id="scheduling-26"><a class="header" href="#scheduling-26">Scheduling</a></h3>
<p>This dataset is scheduled on Airflow and updated daily.</p>
<h3 id="schema-18"><a class="header" href="#schema-18">Schema</a></h3>
<p>The data is partitioned by <code>submission_date</code>.</p>
<h3 id="code-reference-23"><a class="header" href="#code-reference-23">Code reference</a></h3>
<p>The derived table is
<a href="https://github.com/mozilla/bigquery-etl/blob/main/sql_generators/serp_events_v2/__init__.py">generated</a>
from a templated query defined under
<a href="https://github.com/mozilla/bigquery-etl/blob/main/sql_generators/serp_events_v2/templates/desktop_query.sql"><code>bigquery_etl/sql_generators</code></a>
and accessible via its
<a href="https://github.com/mozilla/bigquery-etl/blob/main/sql_generators/serp_events_v2/templates/view.sql">view</a>.</p>
<!-- prettier-ignore -->
<div class="footnote-definition" id="1"><sup class="footnote-definition-label">1</sup>
<p>This limit of 2 days was chosen as a trade-off between data completeness and lag time.
A previous analysis showed that, even if we allow events for an impression ID to span up to 7 days,
99.5% of impression IDs only have events spanning 1 or 2 consecutive days.</p>
</div>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/other/serp_events/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="mozilla-accounts-data"><a class="header" href="#mozilla-accounts-data">Mozilla Accounts Data</a></h1>
<blockquote>
<p>⚠️ Formerly Firefox Accounts, this service was renamed publicly in 2023, but is still often referred to by its previous abbreviation <code>FxA</code> internally, including in this documentation.</p>
</blockquote>
<h2 id="table-of-contents-17"><a class="header" href="#table-of-contents-17">Table of Contents</a></h2>
<ul>
<li><a href="datasets/fxa.html#introduction">Introduction</a></li>
<li><a href="datasets/fxa.html#what-is-mozilla-accounts">What is Mozilla Accounts?</a></li>
<li><a href="datasets/fxa.html#metrics-background">Metrics Background</a></li>
<li><a href="datasets/fxa.html#metrics-taxonomies">Metrics Taxonomies</a></li>
<li><a href="datasets/fxa.html#service-databases">Service databases</a></li>
</ul>
<h2 id="introduction-26"><a class="header" href="#introduction-26">Introduction</a></h2>
<p>This article provides an overview of Mozilla accounts metrics: what is measured and how. See the other articles in this chapter for more details about the specific measurements that are available for analysis.</p>
<p>The <a href="https://mozilla.github.io/ecosystem-platform/relying-parties/reference/metrics-for-relying-parties">Mozilla accounts documentation</a> maintains additional detail, as well as the source code, for the metrics described here.</p>
<h2 id="what-is-mozilla-accounts"><a class="header" href="#what-is-mozilla-accounts">What is Mozilla Accounts?</a></h2>
<p><a href="https://www.mozilla.org/en-US/firefox/accounts/">Mozilla accounts</a> is Mozilla's authentication solution for account-based end-user services and features. At the time of writing, sync is by far the most popular account-relying service. Below is a partial list of current FxA-relying services:</p>
<ul>
<li><a href="https://support.mozilla.org/en-US/kb/how-do-i-set-sync-my-computer">Sync</a>
<ul>
<li>Requires FxA.</li>
</ul>
</li>
<li><a href="https://addons.mozilla.org/en-US/firefox/">AMO</a>
<ul>
<li>For developer accounts; not required by end-users to use or download addons.</li>
</ul>
</li>
<li><a href="https://getpocket.com/login/?ep=1">Pocket</a>
<ul>
<li>FxA is an optional authentication method among others.</li>
</ul>
</li>
<li><a href="https://monitor.firefox.com">Monitor</a>
<ul>
<li>Required to receive email alerts. Not required for email scans.</li>
</ul>
</li>
<li><a href="https://relay.firefox.com/">Relay</a>
<ul>
<li>Required to use the service</li>
</ul>
</li>
<li><a href="https://wiki.mozilla.org/IAM/Frequently_asked_questions">Mozilla IAM</a>
<ul>
<li>Optional authentication method among others.</li>
</ul>
</li>
</ul>
<p>A single account can be used to authenticate with all of the services listed above (though see the note below about Chinese users).</p>
<p>Note that in addition to being the most commonly used relier of FxA, sync is also unique in its integration with FxA - unlike the other reliers in the list above, sync is currently <strong>not</strong> an FxA oauth client. When someone signs into an oauth client using Firefox, nothing in the browser changes - more specifically, client-side telemetry probes such as <a href="https://probes.telemetry.mozilla.org/?view=detail&amp;probeId=histogram%2FFXA_CONFIGURED"><code>FXA_CONFIGURED</code></a> do not change state. Thus at the present time the only way to measure usage of FxA oauth reliers is to use the FxA server-side measures described below.</p>
<blockquote>
<p>⚠️ China runs its own stack for sync, but Chinese sign-ups for oauth reliers still go through the &quot;one and only&quot; oauth server. This means that Chinese users who want to use both sync and an oauth service (e.g. Monitor) will have to register for two accounts. It also means that only metrics for Chinese oauth users will show up in the datasets described below; any sync-related measures will not. At present, you must contact those responsible for maintaining the FxA stack in China for metrics on Chinese sync users.</p>
</blockquote>
<h2 id="metrics-background"><a class="header" href="#metrics-background">Metrics Background</a></h2>
<p>FxA metrics are logged both server-side and client-side. There are many <a href="https://github.com/mozilla/fxa/tree/main/packages">FxA &quot;servers&quot;</a> that handle different aspects of account authentication and management. The metrics of most interest to data analysts are logged by the FxA auth server, content server, and oauth server. Each server writes its metrics into its log stream, and some post-processing scripts combine the metrics events from all three servers into datasets that are available in BigQuery. In 2023 a new logging implementation was integrated leveraging the <a href="datasets/../concepts/glean/glean.html">Glean</a> libraries and pipelines, which means both server-side and client-side metrics use the Glean system. All new metrics are being implemented in Glean, and the legacy metrics will likely be removed in 2024.</p>
<p>In general, metrics logged by the <a href="https://github.com/mozilla/fxa/tree/main/packages/fxa-auth-server">FxA auth server</a> reflect authentication events such as account creation, logins to existing accounts, etc.
Metrics logged by the <a href="https://github.com/mozilla/fxa/tree/main/packages/fxa-content-server">FxA content server</a> reflect user interaction and progression through the FxA web UI - form views, form engagement, form submission, etc.
The <a href="https://github.com/mozilla/fxa/pull/3176">FxA oauth server</a> logs metrics events when oauth clients (Monitor, Lockwise, etc) create and check authentication tokens.</p>
<h2 id="metrics-taxonomies"><a class="header" href="#metrics-taxonomies">Metrics Taxonomies</a></h2>
<p>In 2023 we integrated Glean with Mozilla Accounts and Event Metrics are now available for the <a href="https://dictionary.telemetry.mozilla.org/apps/accounts_backend">server-side</a> and <a href="https://dictionary.telemetry.mozilla.org/apps/accounts_frontend">client-side</a>.</p>
<p>There are two additional legacy event types described below:</p>
<p><a href="https://github.com/mozilla/fxa-auth-server/blob/master/docs/metrics-events.md"><strong>Flow Metrics</strong></a>: these are an older set of metrics events that can be queried through the <code>firefox_accounts</code> dataset in the <code>mozdata</code> project in BigQuery. See <a href="https://github.com/mozilla/fxa-auth-server/blob/master/docs/metrics-events.md">this documentation</a> for detailed description of the types of flow events that are logged and the tables that contain them (note this documentation does not contain an exhaustive list of all flow metrics but is generally still accurate about the ones that are described). These will likely evolve significantly in 2024.</p>
<p><strong>Amplitude Events</strong>: FxA started to send metrics events to Amplitude circa October 2017 and ended around June 2020. While we stopped using Amplitude, the term Amplitude Events lives on to reference this set of events. Amplitude events can be queried through the <code>moz-fx-data-shared-prod.firefox_accounts</code> dataset in BigQuery. <a href="https://github.com/mozilla/bigquery-etl/blob/main/sql/moz-fx-data-shared-prod/firefox_accounts/fxa_content_auth_events/view.sql"><code>moz-fx-data-shared-prod.firefox_accounts.fxa_content_auth_events</code></a> is probably the easiest BigQuery view to use, though it does not contain email bounce events. These are being completely replaced by the Glean Event Metrics and will be removed in 2024. FxA's Amplitude metrics were originally just re-configured and re-named versions of the flow metrics. However, things have since diverged a bit and there are now metrics events that only have an Amplitude version but no corresponding flow event, and vice-versa. If you are wondering whether a certain event is logged, it's likely you will have to check both data sources.</p>
<p>Note that the BigQuery <a href="https://github.com/mozilla/bigquery-etl/tree/main/sql">ETL jobs</a> run daily.</p>
<h2 id="service-databases"><a class="header" href="#service-databases">Service databases</a></h2>
<p>Transactional databases used by Mozilla Accounts services are replicated to BigQuery. You can find table schemas in <a href="https://github.com/mozilla/ecosystem-platform/blob/master/docs/reference/database-structure.md">Ecosystem Platform documentation</a>.</p>
<p>There are two datasets, containing data from the production and stage databases:</p>
<ul>
<li><code>moz-fx-data-shared-prod.accounts_db_external</code></li>
<li><code>moz-fx-data-shared-prod.accounts_db_nonprod_external</code></li>
</ul>
<p>These datasets are restricted to a specific workgroup. Some user-facing views are available in <code>mozdata.accounts_db</code>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/fxa.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="attribution-of-mozilla-accounts"><a class="header" href="#attribution-of-mozilla-accounts">Attribution of Mozilla Accounts</a></h1>
<h2 id="table-of-contents-18"><a class="header" href="#table-of-contents-18">Table of Contents</a></h2>
<ul>
<li><a href="datasets/fxa_metrics/attribution.html#introduction">Introduction</a></li>
<li><a href="datasets/fxa_metrics/attribution.html#types-of-attribution">Types of Attribution</a>
<ul>
<li><a href="datasets/fxa_metrics/attribution.html#service-attribution">Service Attribution</a></li>
<li><a href="datasets/fxa_metrics/attribution.html#funnel-attribution-entrypoint-and-utm-parameters">Funnel Attribution (entrypoint and utm parameters)</a></li>
</ul>
</li>
</ul>
<h2 id="introduction-27"><a class="header" href="#introduction-27">Introduction</a></h2>
<p>Users can create or login to an account through an increasingly large number of relying services and entrypoints. This article describes how we attribute authentications to their point of origin, and documents some of the most frequently trafficked entrypoints.</p>
<h2 id="types-of-attribution"><a class="header" href="#types-of-attribution">Types of Attribution</a></h2>
<p>We can attribute accounts to the <strong>service</strong> that they sign up for, as well as the <strong>entrypoint</strong> that they use to begin the authentication flow. Each service typically has many entrypoints; sync, for example, has web-based entrypoints and browser-based entrypoints (see below).</p>
<h3 id="service-attribution"><a class="header" href="#service-attribution">Service Attribution</a></h3>
<p>There is a variable called <code>service</code> that we use to (1) attribute users to the relying services of FxA that they have authenticated with and (2) attribute individual events to the services they are associated with. <strong>Except in the case of sync</strong>, <code>service</code> is a mapping from the oauth <code>client_id</code> of the relying service/product to a human-readable string. Note that this mapping is currently maintained by hand, and is done after the events have been logged by the server. Currently, mapping to the human-readable <code>service</code> variable is only done for amplitude metrics, where it is treated as a user property. There is also a <code>service</code> variable in the <code>activity_events</code> and <code>flow_metadata</code> STMO tables (FxA Activity Metrics data source); however, it only contains the opaque oauth <code>client_id</code>, not the human-readable string. A table of some of the most common oauth <code>client_id</code>s along with their corresponding <code>service</code> mapping is shown below. This is not a complete list.</p>
<div class="table-wrapper"><table><thead><tr><th><code>service</code></th><th>oauth <code>client_id</code></th><th>Description</th></tr></thead><tbody>
<tr><td><code>fenix</code></td><td><code>a2270f727f45f648</code></td><td>Sync implementation for Fenix</td></tr>
<tr><td><code>fx-monitor</code></td><td><code>802d56ef2a9af9fa</code></td><td>Firefox Monitor (<a href="https://monitor.firefox.com">website</a>)</td></tr>
<tr><td><code>pocket-mobile</code></td><td><code>7377719276ad44ee</code></td><td>Pocket Mobile App</td></tr>
<tr><td><code>pocket-web</code></td><td><code>749818d3f2e7857f</code></td><td>Pocket Website</td></tr>
<tr><td><code>amo-web</code></td><td><code>a4907de5fa9d78fc</code></td><td><code>addons.mozilla.org</code></td></tr>
<tr><td><code>fxa-content</code></td><td><code>ea3ca969f8c6bb0d</code></td><td>Oauth ID used when a user is signing in with cached credentials (i.e. does not have to re-enter username/password) and when the user is logging into the FxA settings page.</td></tr>
<tr><td><code>mozilla-email-preferences</code></td><td><code>c40f32fd2938f0b6</code></td><td>Oauth ID used when a user is signing in to modify their marketing email preferences (e.g., to opt-out.)</td></tr>
</tbody></table>
</div>
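<p>When working with data that only carries the opaque oauth <code>client_id</code>
(such as the <code>activity_events</code> table mentioned above), the mapping can be applied ad hoc.
Below is a minimal sketch using only the ids listed in the table above;
<code>fxa_events</code> is a hypothetical stand-in for whichever table you are querying:</p>
<pre><code class="language-sql">WITH fxa_events AS (
  -- hypothetical stand-in: substitute the table holding the opaque client_id
  SELECT 'a2270f727f45f648' AS service
  UNION ALL
  SELECT '802d56ef2a9af9fa'
)
SELECT
  CASE service
    WHEN 'a2270f727f45f648' THEN 'fenix'
    WHEN '802d56ef2a9af9fa' THEN 'fx-monitor'
    WHEN '7377719276ad44ee' THEN 'pocket-mobile'
    WHEN '749818d3f2e7857f' THEN 'pocket-web'
    WHEN 'a4907de5fa9d78fc' THEN 'amo-web'
    WHEN 'ea3ca969f8c6bb0d' THEN 'fxa-content'
    WHEN 'c40f32fd2938f0b6' THEN 'mozilla-email-preferences'
    ELSE service  -- unmapped client_ids pass through unchanged
  END AS service_name,
  COUNT(*) AS num_events
FROM
  fxa_events
GROUP BY
  1
</code></pre>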
<h3 id="funnel-attribution-entrypoint-and-utm-parameters"><a class="header" href="#funnel-attribution-entrypoint-and-utm-parameters">Funnel Attribution (entrypoint and utm parameters)</a></h3>
<p>We can also attribute users to where they began the authentication process, be it from a website or an application. Attribution is done through query parameters appended to links that point at <code>accounts.firefox.com</code> (which hosts the actual authentication process). These parameters are logged along with any metrics events that the user generates during the authentication flow. The table below lists the query parameters that are currently in use, along with the values associated with some of the most common funnels. Note that only <code>entrypoint</code> is typically logged for flows beginning within the browser. Web-based entrypoints are listed first, followed by entrypoints that are found within the browser chrome itself.</p>
<p>See the <a href="https://mozilla.github.io/ecosystem-platform/relying-parties/reference/metrics-for-relying-parties">Metrics for Relying Parties</a> documentation for more implementational detail on utm/entrypoint parameters.</p>
<div class="table-wrapper"><table><thead><tr><th><code>entrypoint</code></th><th>utm parameters</th><th>Description &amp; Notes</th></tr></thead><tbody>
<tr><td><code>activity-stream-firstrun</code></td><td><strong><code>utm_source</code></strong> = <code>activity-stream</code>, <strong><code>utm_campaign</code></strong> = <code>firstrun</code>, <strong><code>utm_medium</code></strong> = <code>referral</code> or <code>email</code></td><td>The <a href="about:welcome">about:welcome</a> page that is shown to new profiles on browser <code>firstrun</code>. <code>utm_term</code> is sometimes used to track variations for experiments.</td></tr>
<tr><td><code>firstrun</code> (not supported for current versions)</td><td><strong><code>utm_source</code></strong> = <code>firstrun</code></td><td>This is the old version of the <code>firstrun</code> page that was hosted on the web as part of mozilla.org (<a href="https://www.mozilla.org/en-US/firefox/62.0/firstrun/">example</a>). Starting with Firefox version 62, it was replaced by an in-browser version (see row above). Although it is not used for newer versions, it is still hosted for the sake of e.g. profiles coming through the dark funnel on older versions.</td></tr>
<tr><td><code>mozilla.org-whatsnewXX</code></td><td><strong><code>utm_source</code></strong> = <code>whatsnewXX</code>, <strong><code>utm_campaign</code></strong> = <code>fxa-embedded-form</code>, <strong><code>utm_content</code></strong> = <code>whatsnew</code>, <strong><code>utm_medium</code></strong> = <code>referral</code> or <code>email</code></td><td>Where <code>XX</code> = the browser version, e.g. 67 (<a href="https://www.mozilla.org/en-US/firefox/67.0.1/whatsnew/">example</a>). The &quot;what's new&quot; page that is shown to users after they upgrade browser versions. Important notes: <strong>(1)</strong> Users who are signed into a Firefox account have a different experience than those that are signed out. Signed-in users typically see a promotion of FxA-relying services, while signed-out users see a Call to Action to create an account. <strong>(2)</strong> The attribution parameters for this page were standardized starting on version 66. <strong>Previous values for entrypoint</strong> include <code>whatsnew</code> and <code>mozilla.org-wnp64</code> - these values should be used when doing historical analysis of versions prior to 66.</td></tr>
<tr><td><code>new-install-page</code> (current), <code>firefox-new</code> (previously)</td><td>varies (can contain values passed through by referrals)</td><td><a href="https://www.mozilla.org/en-US/firefox/new/">example</a>. The &quot;install Firefox&quot; page. This page doesn't always promote FxA and it will often only promote it to a certain % of traffic or to certain segments.</td></tr>
<tr><td><code>fxa-discoverability-native</code></td><td>NA</td><td>The in-browser toolbar icon. This was introduced with version 67.0</td></tr>
<tr><td><code>menupanel</code></td><td>NA</td><td>The in-browser account item in the &quot;hamburger&quot; menu on desktop (three-line menu in the upper right corner) as well as the sync/FxA menu item on android and iOS.</td></tr>
<tr><td><code>preferences</code></td><td>NA</td><td>The &quot;sign into sync&quot; button found in the sync section in desktop preferences.</td></tr>
<tr><td><code>synced-tabs</code></td><td>NA</td><td>The &quot;sign into sync&quot; button found in synced-tabs section under the library menu.</td></tr>
<tr><td><code>sendtab</code></td><td>NA</td><td>The &quot;sign into sync&quot; button found in the &quot;send tab to device&quot; menu accessible by right-clicking on a tab.</td></tr>
</tbody></table>
</div><footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/fxa_metrics/attribution.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="mozilla-account-funnels"><a class="header" href="#mozilla-account-funnels">Mozilla Account Funnels</a></h1>
<h2 id="table-of-contents-19"><a class="header" href="#table-of-contents-19">Table of Contents</a></h2>
<ul>
<li><a href="datasets/fxa_metrics/funnels.html#introduction">Introduction</a></li>
<li><a href="datasets/fxa_metrics/funnels.html#registration-funnel">Registration Funnel</a></li>
<li><a href="datasets/fxa_metrics/funnels.html#login-funnel">Login Funnel</a></li>
<li><a href="datasets/fxa_metrics/funnels.html#branches-off-the-login-funnel-password-reset-account-recovery-2fa">Branches off the Login Funnel: Password Reset, Account Recovery, 2FA.</a>
<ul>
<li><a href="datasets/fxa_metrics/funnels.html#password-reset-and-recovery-codes">Password Reset and Recovery Codes</a>
<ul>
<li><a href="datasets/fxa_metrics/funnels.html#password-reset-funnel-without-recovery-key">Password Reset Funnel Without Recovery Key</a></li>
<li><a href="datasets/fxa_metrics/funnels.html#password-reset-funnel-with-recovery-key">Password Reset Funnel With Recovery Key</a></li>
</ul>
</li>
<li><a href="datasets/fxa_metrics/funnels.html#login-with-2fa-totp">Login with 2FA (TOTP)</a>
<ul>
<li><a href="datasets/fxa_metrics/funnels.html#login-with-2fatotp-funnel-no-recovery-code">Login with 2FA/TOTP Funnel (No Recovery Code)</a></li>
<li><a href="datasets/fxa_metrics/funnels.html#login-with-2fatotp-funnel-w-recovery-code">Login with 2FA/TOTP Funnel w/ Recovery Code</a></li>
</ul>
</li>
</ul>
</li>
<li><a href="datasets/fxa_metrics/funnels.html#settings">Settings</a></li>
</ul>
<h2 id="introduction-28"><a class="header" href="#introduction-28">Introduction</a></h2>
<p>There are two primary &quot;funnels&quot; that users step through when authenticating with FxA. The <strong>registration</strong> funnel reflects the steps required for a <strong>new</strong> FxA user (or more precisely, email address) to create an account. The <strong>login</strong> funnel reflects the steps necessary for an <strong>existing</strong> FxA user to sign into their account.</p>
<p>We are also in the process of developing funnels for paying subscribers. We will add documentation on that once the work is closer to completion.</p>
<h2 id="registration-funnel"><a class="header" href="#registration-funnel">Registration Funnel</a></h2>
<p>While there are some variations, the typical registration funnel is comprised of the steps described in the chart below. Except where noted, these events are emitted by the FxA content server.</p>
<div class="table-wrapper"><table><thead><tr><th>Step</th><th>Amplitude Event</th><th>Flow Event</th><th>Description</th></tr></thead><tbody>
<tr><td>1</td><td><code>fxa_email_first - view</code></td><td><code>flow.enter-email.view</code></td><td>View (impression) of the form that the user enters their email address into to start the process. Note that this form can be hosted by FxA, or hosted by the relying party. In the latter case, the relier is responsible for handing the user's email address off to the FxA funnel. See &quot;counting top of funnel events&quot; below.</td></tr>
<tr><td>2</td><td><code>fxa_reg - view</code></td><td><code>flow.signup.view</code></td><td>View of the registration form. If the user got to this step via step 1, FxA has detected that their email address is not present in the DB, and thus a new account can be created. The user creates their password and enters their age.</td></tr>
<tr><td>3</td><td><code>fxa_reg - engage</code></td><td><code>flow.signup.engage</code></td><td>A user focuses/clicks on one of the registration form fields.</td></tr>
<tr><td>4</td><td><code>fxa_reg - submit</code></td><td><code>flow.signup.submit</code></td><td>A user submits the registration form (could be unsuccessfully).</td></tr>
<tr><td>5</td><td><code>fxa_reg - created</code></td><td><code>account.created</code></td><td>This event is emitted by the auth server. It indicates that the user has entered a valid email address and password, and that their account has been created and added to the DB. However, the account is still &quot;unverified&quot; at this point and therefore not accessible by the user.</td></tr>
<tr><td>6</td><td><code>fxa_email - sent</code> (<code>email_type</code> = <code>registration</code>)</td><td><code>email.verification.sent</code></td><td>An email is sent to the user to verify their new account. Depending on the service, it either contains a verification link or a verification code that the user enters into the registration form to verify their email address.</td></tr>
<tr><td>7</td><td><code>fxa_reg - cwts_view</code></td><td><code>flow.signup.choose-what-to-sync.view</code></td><td>User views the &quot;choose what to sync&quot; screen which allows the users to select what types of browser data they want to synchronize. <strong>Note that the user is not required to submit this page</strong> - if they do not take any action then all the data types will be synced by default. Thus you may not want to include this (and the following two events) in your funnel analysis if you do not care about the user's actions here.</td></tr>
<tr><td>8</td><td><code>fxa_reg - cwts_engage</code></td><td><code>flow.signup.choose-what-to-sync.engage</code></td><td>User clicks on the &quot;choose what to sync&quot; screen.</td></tr>
<tr><td>9</td><td><code>fxa_reg - cwts_submit</code></td><td><code>flow.signup.choose-what-to-sync.submit</code></td><td>User submits the &quot;choose what to sync&quot; screen. See also the amplitude user property <code>sync_engines</code> which stores which data types the user selected.</td></tr>
<tr><td>10</td><td><code>fxa_email - click</code></td><td><code>email.verify_code.clicked</code></td><td>A user has clicked on the verification link contained in the email sent in step 6. Note this only applies to cases where a clickable link is sent; for reliers that use activation codes, this event will not be emitted (so be aware of this when constructing your funnels).</td></tr>
<tr><td>11</td><td><code>fxa_reg - email_confirmed</code></td><td><code>account.verified</code></td><td>This event is emitted by the auth server. A user has successfully verified their account. They should now be able to use it.</td></tr>
<tr><td>12</td><td><code>fxa_reg - complete</code></td><td><code>flow.complete</code></td><td>The account registration process is complete. Note there are NO actions required of the user to advance from step 11 to step 12; there should be virtually no drop-off there. The flow event is identical for registration and login.</td></tr>
</tbody></table>
</div>
<p>The chart above provides the most detailed version of the registration funnel that can currently be constructed. However, it should not be considered the &quot;canonical&quot; version of the funnel - depending on the question it may make sense to omit some of the steps. For example, at the time of writing some browser entrypoints (e.g. <code>menupanel</code>) link directly to step 2 and skip the initial email form. Having both steps 7 and 8 may also be redundant in some cases, etc. Also, as noted above, you may want to omit the &quot;choose what to sync&quot; steps if you do not care about the users' actions there.</p>
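<p>To turn this funnel into counts, one common pattern is to count distinct flows reaching each step.
The sketch below runs against the Amplitude events view mentioned in the Metrics Taxonomies section above;
note that the <code>flow_id</code>, <code>event_type</code>, and <code>submission_timestamp</code> column names,
as well as the exact event string values, are assumptions here and should be verified against the view's schema before use:</p>
<pre><code class="language-sql">SELECT
  -- each expression counts distinct flows that reached the named funnel step
  COUNT(DISTINCT IF(event_type = 'fxa_email_first - view', flow_id, NULL)) AS email_first_view,
  COUNT(DISTINCT IF(event_type = 'fxa_reg - view', flow_id, NULL)) AS reg_view,
  COUNT(DISTINCT IF(event_type = 'fxa_reg - created', flow_id, NULL)) AS reg_created,
  COUNT(DISTINCT IF(event_type = 'fxa_reg - complete', flow_id, NULL)) AS reg_complete
FROM
  `moz-fx-data-shared-prod.firefox_accounts.fxa_content_auth_events`
WHERE
  DATE(submission_timestamp) = '2024-01-01'  -- pick a date range of interest
</code></pre>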
<h2 id="login-funnel"><a class="header" href="#login-funnel">Login Funnel</a></h2>
<p>The login funnel describes the steps required for an existing FxA user to login to their account. With some exceptions, most of the steps here are parallel to the registration funnel (but named differently).</p>
<p>Users must confirm their login via email in the following cases:</p>
<ol>
<li>A user is logging into sync with an account that is more than 4 hours old.</li>
<li>A user is logging into an oauth relier that uses encryption keys, if the user had not logged into their account in the previous 72? (check this) hours.</li>
</ol>
<div class="table-wrapper"><table><thead><tr><th>Step</th><th>Amplitude Event</th><th>Flow Event</th><th>Description</th></tr></thead><tbody>
<tr><td>1</td><td><code>fxa_email_first - view</code></td><td><code>flow.enter-email.view</code></td><td>Similar to the registration funnel, a view (impression) of the form that the user enters their email address into to start the process. Note that this form can be hosted by FxA, or hosted by the relying party. In the latter case, the relier is responsible for handing the user's email address off to the FxA funnel. See &quot;counting top of funnel events&quot; below.</td></tr>
<tr><td>2</td><td><code>fxa_login - view</code></td><td><code>flow.signin.view</code></td><td>View of the login form. If the user got to this step via step 1, FxA has detected that their email address IS present in the DB, and thus an existing account can be logged into. The user enters their password on this form.</td></tr>
<tr><td>3</td><td><code>fxa_login - engage</code></td><td><code>flow.signin.engage</code></td><td>A user focuses/clicks on the login form field.</td></tr>
<tr><td>4</td><td><code>fxa_login - submit</code></td><td><code>flow.signin.submit</code></td><td>A user submits the login form (could be unsuccessfully).</td></tr>
<tr><td>5</td><td><code>fxa_login - success</code></td><td><code>account.login</code></td><td>This event is emitted by the auth server. It indicates that the user has submitted the correct password. However, in some cases the user may still have to confirm their login via email (see above).</td></tr>
<tr><td>6</td><td><code>fxa_email - sent</code> (<code>email_type</code> = <code>login</code>)</td><td><code>email.confirmation.sent</code></td><td>An email is sent to the user to confirm the login. Depending on the service, it either contains a confirmation link or a verification code that the user enters into the login form.</td></tr>
<tr><td>7</td><td><code>fxa_email - click</code></td><td><code>email.verify_code.clicked</code></td><td>A user has clicked on the confirmation link contained in the email sent in step 6. Note this only applies to cases where a clickable link is sent; for reliers that use confirmation codes, this event will not be emitted (so be aware of this when constructing your funnels). Note that this event is identical to its counterpart in the registration funnel.</td></tr>
<tr><td>8</td><td><code>fxa_login - email_confirmed</code></td><td><code>account.confirmed</code></td><td>This event is emitted by the auth server. A user has successfully confirmed the login via email.</td></tr>
<tr><td>9</td><td><code>fxa_login - complete</code></td><td><code>flow.complete</code></td><td>The login process is complete. Note there are NO actions required of the user to advance from step 8 to step 9; there should be virtually no drop-off there. The flow event is identical for registration and login.</td></tr>
</tbody></table>
</div>
<p>Note again that you may want to check whether the service you are analyzing requires email confirmation on login.</p>
<h2 id="branches-off-the-login-funnel-password-reset-account-recovery-2fa"><a class="header" href="#branches-off-the-login-funnel-password-reset-account-recovery-2fa">Branches off the Login Funnel: Password Reset, Account Recovery, 2FA.</a></h2>
<p>Some additional funnels are &quot;branches&quot; off the main login funnel above:</p>
<ol>
<li>The password reset funnel</li>
</ol>
<ul>
<li>Optionally - the user resets their password with a recovery key</li>
</ul>
<ol start="2">
<li>Login with 2FA (TOTP)</li>
</ol>
<ul>
<li>Optionally - user uses a 2FA recovery code to login to their 2FA-enabled account (e.g. if they misplace their second factor.)</li>
</ul>
<h3 id="password-reset-and-recovery-codes"><a class="header" href="#password-reset-and-recovery-codes">Password Reset and Recovery Codes</a></h3>
<p>Users can click &quot;Forgot Password?&quot; during sign-in to begin the password reset process. The funnel is described in the chart below.</p>
<p><strong>An important &quot;FYI&quot; here</strong>: passwords are used to encrypt accounts' sync data. This implies a <strong>bad scenario</strong> where a change of password can lead to loss of sync data, if there are no longer any devices that can connect to the account and re-upload/restore the data after the reset occurs. This would happen, for example, if you only had one device connected to sync, lost the device, then tried to login to a new device to access your synced data. If you do a password reset while logging into the second device, the remote copy of your sync data will be overwritten (with whatever happens to be on the second device).</p>
<p>Hence the account recovery key. If a user (1) sets up a recovery key via settings (and stores it somewhere accessible), (2) tries to reset their password, and (3) enters a valid recovery key during the password reset process, sync data can be restored without risking the &quot;bad scenario&quot; above.</p>
<h4 id="password-reset-funnel-without-recovery-key"><a class="header" href="#password-reset-funnel-without-recovery-key">Password Reset Funnel Without Recovery Key</a></h4>
<p><em>Note: There may be other places where a user can initiate the password reset process, but I think it is most common during login. In any case, the steps starting at step 2 should all be the same.</em></p>
<div class="table-wrapper"><table><thead><tr><th>Step</th><th>Amplitude Event</th><th>Flow Event</th><th>Description</th></tr></thead><tbody>
<tr><td>1</td><td><code>fxa_login - view</code></td><td><code>flow.signin.view</code></td><td>View of the login form, which contains the &quot;Forgot Password&quot; Link.</td></tr>
<tr><td>2</td><td><code>fxa_login - forgot_password</code></td><td><code>flow.signin.forgot-password</code></td><td>User clicks on the &quot;Forgot Password&quot; Link.</td></tr>
<tr><td>3</td><td>Not Implemented</td><td><code>flow.reset-password.view</code></td><td>View of the form asking the user to confirm that they want to reset.</td></tr>
<tr><td>4</td><td><code>fxa_login - forgot_submit</code></td><td><code>flow.reset-password.engage</code>, <code>flow.reset-password.submit</code></td><td>User clicks on the button confirming that they want to reset.</td></tr>
<tr><td>5</td><td><code>fxa_email - sent</code> (<code>email_template</code> = <code>recovery</code>)</td><td><code>email.recoveryEmail.sent</code></td><td>Delivery of the PW reset link to the user via email.</td></tr>
<tr><td>5-a</td><td>Not Implemented</td><td><code>flow.confirm-reset-password.view</code></td><td>View of the screen telling the user to confirm the reset via email.</td></tr>
<tr><td>6</td><td>Not Implemented</td><td><code>flow.complete-reset-password.view</code></td><td>User views the form to create a new password. (viewable after clicking the link in the email above)</td></tr>
<tr><td>7</td><td>Not Implemented</td><td><code>flow.complete-reset-password.engage</code></td><td>User clicks on the form to create a new password.</td></tr>
<tr><td>8</td><td>Not Implemented</td><td><code>flow.complete-reset-password.submit</code></td><td>User submits the form to create a new password.</td></tr>
<tr><td>9</td><td><code>fxa_login - forgot_complete</code></td><td><code>flow.complete</code> (the auth server also emits <code>account.reset</code>)</td><td>User has completed the password reset funnel.</td></tr>
</tbody></table>
</div>
<h4 id="password-reset-funnel-with-recovery-key"><a class="header" href="#password-reset-funnel-with-recovery-key">Password Reset Funnel With Recovery Key</a></h4>
<p><em>Note we still need to implement amplitude events for the recovery code part of this funnel. The funnel is identical to the one above up until step 6.</em></p>
<div class="table-wrapper"><table><thead><tr><th>Step</th><th>Amplitude Event</th><th>Flow Event</th><th>Description</th></tr></thead><tbody>
<tr><td>1</td><td><code>fxa_login - view</code></td><td><code>flow.signin.view</code></td><td>View of the login form, which contains the &quot;Forgot Password&quot; Link.</td></tr>
<tr><td>2</td><td><code>fxa_login - forgot_pwd</code></td><td><code>flow.signin.forgot-password</code></td><td>User clicks on the &quot;Forgot Password&quot; Link.</td></tr>
<tr><td>3</td><td>Not Implemented</td><td><code>flow.reset-password.view</code></td><td>View of the form asking the user to confirm that they want to reset.</td></tr>
<tr><td>4</td><td><code>fxa_login - forgot_submit</code></td><td><code>flow.reset-password.engage</code>, <code>flow.reset-password.submit</code></td><td>User clicks on the button confirming that they want to reset.</td></tr>
<tr><td>5</td><td><code>fxa_email - sent</code> (<code>email_template</code> = <code>recovery</code>)</td><td><code>email.recoveryEmail.sent</code></td><td>Delivery of the PW reset link to the user via email.</td></tr>
<tr><td>5-a</td><td>Not Implemented</td><td><code>flow.confirm-reset-password.view</code></td><td>View of the screen telling the user to confirm the reset via email.</td></tr>
<tr><td>6</td><td><code>fxa_login - forgot_password_confirm_recovery_key_view</code></td><td><code>flow.account-recovery-confirm-key.view</code></td><td>User views the form to enter their account recovery key. (viewable after clicking the link in the email above)</td></tr>
<tr><td>7</td><td><code>fxa_login - forgot_password_confirm_recovery_key_engage</code></td><td><code>flow.account-recovery-confirm-key.engage</code></td><td>User clicks on the form to enter their account recovery key.</td></tr>
<tr><td>8</td><td><code>fxa_login - forgot_password_confirm_recovery_key_submit</code></td><td><code>flow.account-recovery-confirm-key.submit</code></td><td>User submits the form to enter their account recovery key.</td></tr>
<tr><td>9</td><td>Not Implemented</td><td><code>flow.account-recovery-confirm-key.success</code> or <code>flow.account-recovery-confirm-key.invalidRecoveryKey</code></td><td>User submitted a valid (success) or invalid recovery key.</td></tr>
<tr><td>10</td><td><code>fxa_login - forgot_password_recovery_key_view</code></td><td><code>flow.account-recovery-reset-password.view</code></td><td>User views the form to change their password after submitting a valid recovery key.</td></tr>
<tr><td>11</td><td><code>fxa_login - forgot_password_recovery_key_engage</code></td><td><code>flow.account-recovery-reset-password.engage</code></td><td>User clicks on the form to change their password after submitting a valid recovery key.</td></tr>
<tr><td>12</td><td><code>fxa_login - forgot_password_recovery_key_submit</code></td><td><code>flow.account-recovery-reset-password.submit</code></td><td>User submits the form to change their password after submitting a valid recovery key.</td></tr>
<tr><td>13</td><td><code>fxa_login - forgot_password_recovery_key_success</code></td><td><code>flow.account-recovery-reset-password.recovery-key-consume.success</code></td><td>User's recovery key was successfully consumed (used) to complete the password reset.</td></tr>
<tr><td>14</td><td><code>fxa_login - forgot_complete</code></td><td><code>flow.complete</code> (the auth server also emits <code>account.reset</code>)</td><td>User has completed the password reset funnel.</td></tr>
</tbody></table>
</div>
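<p>To make the use of these events concrete, here is a minimal sketch of a funnel-style query that counts distinct flows reaching a few of the steps above. The table and column names (<code>fxa_flow_events</code>, <code>flow_id</code>, <code>event_type</code>, <code>submission_date</code>) are illustrative assumptions only, standing in for whichever FxA events table you actually query.</p>
<pre><code class="language-sql">-- Hedged sketch: count distinct flows reaching selected password-reset steps.
-- `fxa_flow_events`, `flow_id`, `event_type`, and `submission_date` are
-- illustrative names only, not the actual schema.
SELECT
  event_type,
  COUNT(DISTINCT flow_id) AS flows
FROM
  fxa_flow_events
WHERE
  submission_date &gt;= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
  AND event_type IN (
    'flow.signin.forgot-password',
    'flow.reset-password.submit',
    'flow.complete'
  )
GROUP BY
  event_type
</code></pre>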
<h3 id="login-with-2fa-totp"><a class="header" href="#login-with-2fa-totp">Login with 2FA (TOTP)</a></h3>
<p>Users can set up two-factor authentication (2FA) for their account. 2FA is implemented via time-based one-time passwords (TOTP). If a user has set up 2FA (via settings), they are required to enter a passcode generated by their second factor whenever they log in to their account.</p>
<p>Users are also provisioned a set of recovery codes as part of the 2FA setup process. These are one-time-use codes that can be used to log in to an account if a user loses access to their second factor. <strong>Note that these 2FA recovery codes are different from the account recovery keys described above</strong>.</p>
<h4 id="login-with-2fatotp-funnel-no-recovery-code"><a class="header" href="#login-with-2fatotp-funnel-no-recovery-code">Login with 2FA/TOTP Funnel (No Recovery Code)</a></h4>
<p><em>This funnel starts after the <code>fxa_login - success</code> / <code>account.login</code> step of the login funnel</em></p>
<div class="table-wrapper"><table><thead><tr><th>Step</th><th>Amplitude Event</th><th>Flow Event</th><th>Description</th></tr></thead><tbody>
<tr><td>1</td><td><code>fxa_login - totp_code_view</code></td><td><code>flow.signin-totp-code.view</code></td><td>View of the TOTP form.</td></tr>
<tr><td>2</td><td><code>fxa_login - totp_code_engage</code></td><td><code>flow.signin-totp-code.engage</code></td><td>Click on the TOTP form.</td></tr>
<tr><td>3</td><td><code>fxa_login - totp_code_submit</code></td><td><code>flow.signin-totp-code.submit</code></td><td>Submission of the TOTP form.</td></tr>
<tr><td>4</td><td><code>fxa_login - totp_code_success</code></td><td><code>flow.signin-totp-code.success</code></td><td>Successful submission of the TOTP form. Auth server also emits <code>totpToken.verified</code></td></tr>
</tbody></table>
</div>
<h4 id="login-with-2fatotp-funnel-w-recovery-code"><a class="header" href="#login-with-2fatotp-funnel-w-recovery-code">Login with 2FA/TOTP Funnel w/ Recovery Code</a></h4>
<p><em>This funnel starts after user clicks to use a recovery code during the TOTP funnel.</em></p>
<div class="table-wrapper"><table><thead><tr><th>Step</th><th>Amplitude Event</th><th>Flow Event</th><th>Description</th></tr></thead><tbody>
<tr><td>1</td><td><code>fxa_login - totp_code_view</code></td><td><code>flow.signin-totp-code.view</code></td><td>View of the TOTP form.</td></tr>
<tr><td>2</td><td>Not Implemented</td><td><code>flow.sign_in_recovery_code.view</code></td><td>View of the TOTP recovery code form.</td></tr>
<tr><td>3</td><td>Not Implemented</td><td><code>recoveryCode.verified</code> (auth server)</td><td>User submitted a valid recovery code.</td></tr>
</tbody></table>
</div>
<h2 id="settings"><a class="header" href="#settings">Settings</a></h2>
<p>A variety of metrics are logged that reflect user interaction with the settings page (https://accounts.firefox.com/settings). The table below outlines some of these events (it is not an exhaustive list).</p>
<div class="table-wrapper"><table><thead><tr><th>Amplitude Event</th><th>Flow Event</th><th>Description</th></tr></thead><tbody>
<tr><td><code>fxa_pref - view</code></td><td><code>flow.settings.view</code></td><td>User viewed the settings page.</td></tr>
<tr><td><code>fxa_pref - two_step_authentication_view</code></td><td><code>flow.settings.two-step-authentication.view</code></td><td>User viewed 2FA settings.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.two-step-authentication.recovery-codes.view</code></td><td>User viewed their 2FA recovery codes. These are viewable only once, after a user sets up 2FA or after they generate new codes.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.two-step-authentication.recovery-codes.print-option</code></td><td>User clicks to print their 2FA recovery codes.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.two-step-authentication.recovery-codes.download-option</code></td><td>User clicks to download their 2FA recovery codes.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.two-step-authentication.recovery-codes.copy-option</code></td><td>User clicks to copy their 2FA recovery codes to the clipboard (this is fired only when they click the copy button, not if they copy using e.g. a keyboard shortcut).</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.change-password.view</code></td><td>User viewed the form to change their password.</td></tr>
<tr><td><code>fxa_pref - password</code></td><td><code>settings.change-password.success</code></td><td>User changed their password via settings.</td></tr>
<tr><td><code>fxa_pref - logout</code></td><td><code>settings.signout.success</code></td><td>User logged out via settings.</td></tr>
<tr><td><code>fxa_pref - newsletter</code> (see also user property <code>newsletter_state</code>)</td><td><code>settings.communication-preferences.(optIn|optOut).success</code></td><td>User changed their newsletter email preferences.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.account_recovery.view</code></td><td>User viewed account recovery settings.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.account_recovery.engage</code></td><td>User clicked somewhere in account recovery settings.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.account-recovery.confirm-password.view</code></td><td>User viewed the password form prior to turning on account recovery. (user first has to verify their email address)</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.account-recovery.confirm-password.engage</code></td><td>User clicked the password form prior to turning on account recovery.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.account-recovery.confirm-password.submit</code></td><td>User submitted the password form prior to turning on account recovery.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.account-recovery.confirm-password.success</code></td><td>User successfully submitted the password form prior to turning on account recovery.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.account-recovery.recovery-key.view</code></td><td>User viewed their recovery key. This is viewable one time only, after a user sets up account recovery, or after they generate a new key.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.account-recovery.recovery-key.print-option</code></td><td>User clicks to print their recovery key.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.account-recovery.recovery-key.download-option</code></td><td>User clicks to download their recovery key.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.account-recovery.recovery-key.copy-option</code></td><td>User clicks to copy their recovery key to the clipboard (this is fired only when they click the copy button, not if they copy using e.g. a keyboard shortcut).</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.account-recovery.refresh</code></td><td>User generated a new recovery key.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.clients.view</code></td><td>User viewed the list of clients (&quot;Devices &amp; Apps&quot;) connected to their account. AKA the device manager.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.clients.engage</code></td><td>User clicked somewhere in the list of clients connected to their account.</td></tr>
<tr><td>Not Implemented</td><td><code>flow.settings.clients.disconnect.view</code></td><td>User viewed the dialog asking to confirm disconnection of a device.</td></tr>
</tbody></table>
</div><footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/fxa_metrics/funnels.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="mozilla-account-email-metrics"><a class="header" href="#mozilla-account-email-metrics">Mozilla Account Email Metrics</a></h1>
<h2 id="table-of-contents-20"><a class="header" href="#table-of-contents-20">Table of Contents</a></h2>
<ul>
<li><a href="datasets/fxa_metrics/emails.html#introduction">Introduction</a></li>
<li><a href="datasets/fxa_metrics/emails.html#email-templates-and-email-types">Email Templates and Email Types</a>
<ul>
<li><a href="datasets/fxa_metrics/emails.html#mozilla-accounts">Mozilla Accounts</a></li>
<li><a href="datasets/fxa_metrics/emails.html#subscription-platform">Subscription Platform</a></li>
</ul>
</li>
</ul>
<h2 id="introduction-29"><a class="header" href="#introduction-29">Introduction</a></h2>
<p>Users must provide an email address when they sign up for a Mozilla Account. Emails are sent to users to confirm authentication, alert them to new sign-ins, and complete password resets. Users can also opt in to marketing emails; however, metrics for those are not covered in this article.</p>
<p>Events that we track relating to email:</p>
<ol>
<li>When the email is sent.</li>
<li>If the email bounces.</li>
<li>If the email contains a verification/confirmation link, whether the user clicked on it.</li>
</ol>
<p>Metrics relating to emails also contain the following properties:</p>
<ol>
<li>The email service of the recipient</li>
<li>The <code>email_template</code> - the <a href="https://github.com/mozilla/fxa/tree/main/packages/fxa-auth-server/lib/senders/emails/templates">template</a> of the email that was sent (we currently only track this for sending events, not click events). This is more specific than the <code>email_type</code>.</li>
<li>The <code>email_type</code> - a broader grouping of many email templates into related categories; see the tables below.</li>
</ol>
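<p>As an illustration of how these properties might be used, below is a minimal sketch of a query counting email sends by template. The table and column names (<code>fxa_events</code>, <code>event_type</code>, <code>email_template</code>, <code>submission_date</code>) are assumptions for illustration only; consult the FxA datasets documentation for the actual table to query.</p>
<pre><code class="language-sql">-- Hedged sketch: count email-send events by template over the last week.
-- `fxa_events`, `event_type`, `email_template`, and `submission_date`
-- are illustrative names only.
SELECT
  email_template,
  COUNT(*) AS emails_sent
FROM
  fxa_events
WHERE
  submission_date &gt;= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
  AND event_type = 'fxa_email - sent'
GROUP BY
  email_template
ORDER BY
  emails_sent DESC
</code></pre>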
<h2 id="email-templates-and-email-types"><a class="header" href="#email-templates-and-email-types">Email Templates and Email Types</a></h2>
<p>Only emails sent by the FxA auth server are represented in the tables below. TBD on marketing emails.</p>
<h3 id="mozilla-accounts"><a class="header" href="#mozilla-accounts">Mozilla Accounts</a></h3>
<div class="table-wrapper"><table><thead><tr><th><code>email_template</code></th><th><code>email_type</code></th><th>Description &amp; Notes</th></tr></thead><tbody>
<tr><td><code>postAddTwoStepAuthenticationEmail</code></td><td><code>2fa</code></td><td>Sent to users after they successfully add 2 factor authentication to their account (TOTP)</td></tr>
<tr><td><code>postRemoveTwoStepAuthenticationEmail</code></td><td><code>2fa</code></td><td>Sent to users after they successfully REMOVE 2 factor authentication from their account (TOTP)</td></tr>
<tr><td><code>postConsumeRecoveryCodeEmail</code></td><td><code>2fa</code></td><td>Sent to users after they successfully use a recovery code to login to their account after not being able to use their second factor.</td></tr>
<tr><td><code>postNewRecoveryCodesEmail</code></td><td><code>2fa</code></td><td>Sent to users after they successfully generate a new set of 2FA recovery codes (replacing their old ones, if they existed).</td></tr>
<tr><td><code>lowRecoveryCodesEmail</code></td><td><code>2fa</code></td><td>Sent when a user has 2 or fewer recovery codes remaining.</td></tr>
<tr><td><code>passwordResetAccountRecoveryEmail</code></td><td><code>account_recovery</code></td><td>After a user resets their password using a recovery key, they receive this email telling them to generate a new recovery key.</td></tr>
<tr><td><code>postAddAccountRecoveryEmail</code></td><td><code>account_recovery</code></td><td>Sent to users after they successfully add account recovery capabilities to their account (i.e. after generating recovery codes).</td></tr>
<tr><td><code>postRemoveAccountRecoveryEmail</code></td><td><code>account_recovery</code></td><td>Sent to users after they successfully REMOVE account recovery capabilities from their account.</td></tr>
<tr><td><code>postChangePrimaryEmail</code></td><td><code>change_email</code></td><td>Sent to users after they successfully change their primary email address (is sent to their new email).</td></tr>
<tr><td><code>passwordChangedEmail</code></td><td><code>change_password</code></td><td>Sent to users after they change their password via FxA settings (NOT during password reset; they must be logged in to do this).</td></tr>
<tr><td><code>passwordChangeRequiredEmail</code></td><td><code>change_password</code></td><td>Sent when an account's devices are disconnected and a password change is required due to suspicious activity.</td></tr>
<tr><td><code>cadReminderFirstEmail</code></td><td><code>connect_another_device</code></td><td>Sent 8 hours after a user clicks &quot;send me a reminder&quot; on the connect another device page.</td></tr>
<tr><td><code>cadReminderSecondEmail</code></td><td><code>connect_another_device</code></td><td>Sent 72 hours after a user clicks &quot;send me a reminder&quot; on the connect another device page.</td></tr>
<tr><td><code>newDeviceLoginEmail</code></td><td><code>login</code></td><td>Sent to existing accounts after they have logged into a device that FxA has not previously recognized.</td></tr>
<tr><td><code>verifyLoginCodeEmail</code></td><td><code>login</code></td><td>Sent to existing accounts when they try to login to sync, containing a code (rather than a link) the user must enter into the login form. Note that currently the use of confirmation codes is limited to some login contexts only - they are never used for registration.</td></tr>
<tr><td><code>verifyLoginEmail</code></td><td><code>login</code></td><td>Sent to existing accounts when they try to login to sync. User must click the verification link before the logged-in device can begin syncing.</td></tr>
<tr><td><code>postAddLinkedAccountEmail</code></td><td><code>login</code></td><td>Sent after a Firefox account is linked to a 3rd party account (e.g. an Apple account)</td></tr>
<tr><td><code>postVerifyEmail</code></td><td><code>registration</code></td><td>Sent after users confirm their email. Contains instructions for how to connect another device to sync.</td></tr>
<tr><td><code>verifyEmail</code></td><td><code>registration</code></td><td>Sent to users setting up a new NON-sync account. Contains a verification link (user must click it for their account to become functional).</td></tr>
<tr><td><code>verificationReminderFirstEmail</code></td><td><code>registration</code></td><td>If a user does not verify their account within 24 hours, they receive this email with an additional verification link.</td></tr>
<tr><td><code>verificationReminderSecondEmail</code></td><td><code>registration</code></td><td>If a user does not verify their account within 5 days, they receive this email with an additional verification link.</td></tr>
<tr><td><code>verifyShortCodeEmail</code></td><td><code>registration</code></td><td>Sent to users to verify their account via code after signing up.</td></tr>
<tr><td><code>passwordResetEmail</code></td><td><code>reset_password</code></td><td>Sent to users after they reset their password (without using a recovery key).</td></tr>
<tr><td><code>recoveryEmail</code></td><td><code>reset_password</code></td><td>After a user opts to reset their password (during login, because they clicked &quot;forgot password&quot;), they receive this email with a link to reset their password (without using a recovery key).</td></tr>
<tr><td><code>postVerifySecondaryEmail</code></td><td><code>secondary_email</code></td><td>Sent to users after they successfully verified a secondary email address (sent to the secondary email address).</td></tr>
<tr><td><code>postRemoveSecondaryEmail</code></td><td><code>secondary_email</code></td><td>Sent to users after they successfully remove a secondary email address (sent to the secondary email address).</td></tr>
<tr><td><code>verifySecondaryCodeEmail</code></td><td><code>secondary_email</code></td><td>Sent to verify the addition of a secondary email via code.</td></tr>
<tr><td><code>unblockCodeEmail</code></td><td><code>unblock</code></td><td>Sent to an account that has reached the login attempt rate limit; contains a code used to verify or unblock the account.</td></tr>
<tr><td><code>verifyPrimaryEmail</code></td><td><code>verify</code></td><td>Sent to users with an unverified primary email, meaning an unverified account, when they attempt an action requiring a verified account.</td></tr>
</tbody></table>
</div>
<h3 id="subscription-platform"><a class="header" href="#subscription-platform">Subscription Platform</a></h3>
<p>The <code>email_type</code> is <a href="https://github.com/mozilla/fxa/issues/12098">still being determined</a> for Subscription Platform emails.</p>
<div class="table-wrapper"><table><thead><tr><th><code>email_template</code></th><th><code>email_type</code></th><th>Description &amp; Notes</th></tr></thead><tbody>
<tr><td><code>downloadSubscription</code></td><td></td><td>Sent to users after they successfully add a subscription</td></tr>
<tr><td><code>subscriptionAccountDeletion</code></td><td></td><td>Sent when a user with an active subscription deletes their Firefox account</td></tr>
<tr><td><code>subscriptionAccountFinishSetup</code></td><td></td><td>Sent to a user after they purchased the product through the password-less flow without an existing Firefox account</td></tr>
<tr><td><code>subscriptionAccountReminderFirst</code></td><td></td><td>Sent to a user to remind them to finish setting up a Firefox account after they signed up through the password-less flow without an existing account</td></tr>
<tr><td><code>subscriptionAccountReminderSecond</code></td><td></td><td>Sent as a final reminder to a user to finish setting up a Firefox account after they signed up through the password-less flow without an existing account</td></tr>
<tr><td><code>subscriptionCancellation</code></td><td></td><td>Sent when a user cancels their subscription</td></tr>
<tr><td><code>subscriptionDowngrade</code></td><td></td><td>Sent when a user downgrades their subscription</td></tr>
<tr><td><code>subscriptionFailedPaymentsCancellation</code></td><td></td><td>Sent when failed payments result in cancellation of user subscription</td></tr>
<tr><td><code>subscriptionFirstInvoice</code></td><td></td><td>Sent to inform a user that their first payment is currently being processed</td></tr>
<tr><td><code>subscriptionFirstInvoiceDiscount</code></td><td></td><td>Sent to inform a user that their first payment, with a discount coupon, is currently being processed</td></tr>
<tr><td><code>subscriptionPaymentExpired</code></td><td></td><td>Sent whenever a user has a single subscription and their card will expire at the end of the month, triggered by a Stripe webhook</td></tr>
<tr><td><code>subscriptionPaymentFailed</code></td><td></td><td>Sent when there is a problem with the latest payment</td></tr>
<tr><td><code>subscriptionPaymentProviderCancelled</code></td><td></td><td>Sent when a problem is detected with the payment method</td></tr>
<tr><td><code>subscriptionReactivation</code></td><td></td><td>Sent when a user reactivates their subscription</td></tr>
<tr><td><code>subscriptionRenewalReminder</code></td><td></td><td>Sent to remind a user of an upcoming automatic subscription renewal X days out from charge (X being what is set in the Stripe dashboard)</td></tr>
<tr><td><code>subscriptionSubsequentInvoice</code></td><td></td><td>Sent when the latest subscription payment is received</td></tr>
<tr><td><code>subscriptionSubsequentInvoiceDiscount</code></td><td></td><td>Sent when the latest subscription payment is received (coupon)</td></tr>
<tr><td><code>subscriptionUpgrade</code></td><td></td><td>Sent when a user upgrades their subscription</td></tr>
<tr><td><code>subscriptionsPaymentExpired</code></td><td></td><td>Sent whenever a user has multiple subscriptions and their card will expire at the end of the month</td></tr>
<tr><td><code>subscriptionsPaymentProviderCancelled</code></td><td></td><td>Sent when a user has multiple subscriptions and a problem has been detected with payment method</td></tr>
</tbody></table>
</div><footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/fxa_metrics/emails.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="static-datasets"><a class="header" href="#static-datasets">Static Datasets</a></h1>
<p>Tables containing static data exist in the <code>static</code> dataset in BigQuery.
These tables are generated from CSV files named <code>data.csv</code> in subdirectories of the <code>sql/&lt;project&gt;/static/</code>
directory in <a href="https://github.com/mozilla/bigquery-etl/tree/master/sql/moz-fx-data-shared-prod/static"><code>bigquery-etl</code></a>.</p>
<h2 id="creating-a-static-table"><a class="header" href="#creating-a-static-table">Creating a Static Table</a></h2>
<p>To create a new table, create a directory in <code>sql/&lt;project&gt;/static/</code>.
This directory should be named whatever you wish the table to be named.
Then, put a CSV file named <code>data.csv</code> in the directory.
It is expected that the first line of <code>data.csv</code> is a header row containing the column
names of the data.</p>
<p>e.g. In <code>sql/moz-fx-data-shared-prod/static/new_table/data.csv</code>:</p>
<pre><code>id,val
a,1
b,2
c,3
</code></pre>
<p>An optional <code>description.txt</code> and <code>schema.json</code> can be added. <code>description.txt</code> will fill the description
field in BigQuery. <code>schema.json</code> will set the schema of the table; if no schema is provided, it is assumed
that all fields are nullable strings.</p>
<p>See <a href="https://github.com/mozilla/bigquery-etl/tree/master/sql/moz-fx-data-shared-prod/static/country_names_v1"><code>country_names_v1</code></a> for an example.</p>
<p>To create the table in BigQuery, run <a href="https://github.com/mozilla/bigquery-etl/blob/master/script/publish_static"><code>script/publish_static</code></a>.</p>
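<p>Once published, the table can be queried like any other BigQuery table. For example, for the hypothetical <code>new_table</code> above (assuming it was published to the <code>moz-fx-data-shared-prod</code> project):</p>
<pre><code class="language-sql">-- Query the example static table created from data.csv above.
-- Without a schema.json, all columns are nullable strings.
SELECT
  id,
  val
FROM
  `moz-fx-data-shared-prod.static.new_table`
WHERE
  id = 'a'
</code></pre>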
<h3 id="notes"><a class="header" href="#notes">Notes</a></h3>
<p>Static tables can be created in any dataset in bigquery-etl. However, it is recommended for consistency and
organization to keep them in the <code>static</code> dataset.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/static.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="normalized-os"><a class="header" href="#normalized-os">Normalized OS</a></h1>
<p>The OS names and versions received in telemetry are not necessarily the accepted common names.
The <code>normalized_os_name</code> and <code>normalized_os_version</code> tables in the <code>static</code> dataset serve as
lookup tables for mapping the telemetry values to the common names.</p>
<h3 id="os-names"><a class="header" href="#os-names">OS Names</a></h3>
<p>For OS names, Mac clients send <code>Darwin</code>, Windows clients send <code>Windows_NT</code>, and Linux may send the
distribution name.
Pings should already have a <code>normalized_os</code> field that corrects this.
The <code>normalized_os_name</code> table exists as an alternative lookup table.</p>
<p>Example query:</p>
<pre><code class="language-sql">SELECT
client_id,
environment.system.os.name,
normalized_os_name
FROM
telemetry_stable.main_v4
LEFT JOIN
static.normalized_os_name
ON (environment.system.os.name = os_name)
</code></pre>
<h3 id="os-versions"><a class="header" href="#os-versions">OS Versions</a></h3>
<p>OS versions for some OSes are not properly normalized in telemetry. For example, the version
reported by Mac clients is the Darwin version instead of the macOS version, and the version
reported by Android Fennec clients is the Android SDK version instead of the Android version.
The <code>normalized_os_version</code> table can be used to map the sent version to the &quot;display version&quot;
of the OS.</p>
<p>The table uses a regular expression to look up the OS version so <code>REGEXP_CONTAINS</code> should be used.
An example query can be found in <a href="https://sql.telemetry.mozilla.org/queries/67040/source"><code>STMO#67040</code></a>.</p>
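<p>As a rough illustration of the <code>REGEXP_CONTAINS</code> join described above, here is a hedged sketch. The column names assumed for <code>normalized_os_version</code> (<code>os_name</code>, <code>os_version_pattern</code>, <code>normalized_os_version</code>) are illustrative only; refer to <a href="https://sql.telemetry.mozilla.org/queries/67040/source"><code>STMO#67040</code></a> for the canonical query.</p>
<pre><code class="language-sql">-- Hedged sketch of a regex-based lookup against static.normalized_os_version.
-- The lookup-table column names are assumptions; see STMO#67040 for the real query.
SELECT
  client_id,
  environment.system.os.name AS raw_os_name,
  environment.system.os.version AS raw_os_version,
  lookup.normalized_os_version
FROM
  telemetry_stable.main_v4
LEFT JOIN
  static.normalized_os_version AS lookup
ON
  environment.system.os.name = lookup.os_name
  AND REGEXP_CONTAINS(environment.system.os.version, lookup.os_version_pattern)
</code></pre>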
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/static/normalized_os.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="historical-reference-1"><a class="header" href="#historical-reference-1">Historical Reference</a></h1>
<p>This section contains some documentation of things that used to be part of the Mozilla Data Platform, but are no
longer. You can generally safely ignore this section; it is intended only to answer questions like &quot;what happened to X?&quot;.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/historical/index.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="an-overview-of-mozillas-data-pipeline-1"><a class="header" href="#an-overview-of-mozillas-data-pipeline-1">An overview of Mozillas Data Pipeline</a></h1>
<blockquote>
<p>Note: This article describes the AWS-based pipeline which
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1598815">has been retired</a>;
the client-side concepts here still apply, but see the
<a href="concepts/pipeline/gcp_data_pipeline.html">new GCP pipeline</a> overview for an updated description of the current architecture.</p>
</blockquote>
<p>This post describes the architecture of Mozilla's data pipeline, which is used to collect Telemetry data from our users and logs from various services. One of the cool perks of working at Mozilla is that most of what we do is out in the open, and because of that I can do more than just show you some diagram with arrows of our architecture; I can point you to the code, scripts &amp; configuration that underlie it!</p>
<p>To make the examples concrete, the following description is centered around the collection of Firefox Telemetry data. The same tool-chain is used to collect, store, and analyze data coming from disparate sources, though, such as service logs.</p>
<pre class="mermaid">graph TD
firefox((fa:fa-firefox Firefox))--&gt;|JSON| elb
elb[Load Balancer]--&gt;|JSON| nginx
nginx--&gt;|JSON| landfill(fa:fa-database S3 Landfill)
nginx--&gt;|protobuf| kafka[fa:fa-files-o Kafka]
kafka--&gt;|protobuf| cep(Hindsight CEP)
kafka--&gt;|protobuf| dwl(Hindsight DWL)
cep--&gt; hsui(Hindsight UI)
dwl--&gt;|protobuf| datalake(fa:fa-database S3 Data Lake)
dwl--&gt;|parquet| datalake
datalake--&gt;|parquet| prestodb
prestodb--&gt;redash[fa:fa-line-chart Redash]
datalake--&gt;spark
spark--&gt;datalake
airflow[fa:fa-clock-o Airflow]--&gt;|Scheduled tasks|spark{fa:fa-star Spark}
spark--&gt;|aggregations|rdbms(fa:fa-database PostgreSQL)
rdbms--&gt;tmo[fa:fa-bar-chart TMO]
rdbms--&gt;cerberus[fa:fa-search-plus Cerberus]
style firefox fill:#f61
style elb fill:#777
style nginx fill:green
style landfill fill:tomato
style datalake fill:tomato
style kafka fill:#aaa
style cep fill:palegoldenrod
style dwl fill:palegoldenrod
style hsui fill:palegoldenrod
style prestodb fill:cornflowerblue
style redash fill:salmon
style spark fill:darkorange
style airflow fill:lawngreen
style rdbms fill:cornflowerblue
style tmo fill:lightgrey
style cerberus fill:royalblue
</pre>
<h1 id="firefox-1"><a class="header" href="#firefox-1">Firefox</a></h1>
<p>There are different APIs and formats to <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/index.html">collect data</a> in Firefox, all suiting different use cases:</p>
<ul>
<li><a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/histograms.html">histograms</a> – for recording multiple data points;</li>
<li><a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/scalars.html">scalars</a> – for recording single values;</li>
<li><a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/measuring-time.html">timings</a> – for measuring how long operations take;</li>
<li><a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/events.html">events</a> – for recording time-stamped events.</li>
</ul>
<p>These are commonly referred to as <em><a href="concepts/pipeline/../../datasets/new_data.html">probes</a></em>. Each probe must declare the <a href="https://wiki.mozilla.org/Firefox/Data_Collection">collection policy</a> it conforms to: either <em>release</em> or <em>prerelease</em>. When adding a new measurement, data reviewers carefully inspect the probe and eventually approve the requested collection policy:</p>
<ul>
<li>Release data is collected from all Firefox users.</li>
<li>Prerelease data is collected from users on Firefox Nightly and Beta channels.</li>
</ul>
<p>Users may choose to turn the data collection off in preferences.</p>
<p>A <em>session</em> begins when Firefox starts up and ends when it shuts down. As a session could be long-running and last weeks, it gets sliced into smaller logical units called <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/concepts/sessions.html#subsessions">subsessions</a>. Each subsession generates a batch of data containing the current state of all probes collected so far, i.e. a <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/main-ping.html">main ping</a>, which is sent to our servers. The main ping is just one of the many <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/concepts/pings.html#ping-types">ping types</a> we support. Developers can <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/custom-pings.html">create their own ping types</a> if needed.</p>
<p><em>Pings</em> are submitted via an <a href="https://searchfox.org/mozilla-central/rev/501eb4718d73870892d28f31a99b46f4783efaa0/toolkit/components/telemetry/app/TelemetryController.jsm#231">API</a> that performs an HTTP POST request to our edge servers. If a ping fails to successfully <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/concepts/submission.html#submission">submit</a> (e.g. because of a missing internet connection), Firefox will store the ping on disk and retry sending it until the maximum ping age is exceeded.</p>
<h1 id="kafka"><a class="header" href="#kafka">Kafka</a></h1>
<p>HTTP submissions coming in from the wild hit a <a href="https://aws.amazon.com/elasticloadbalancing/">load balancer</a> and then an NGINX <a href="https://github.com/mozilla-services/nginx_moz_ingest">module</a>. The <a href="https://github.com/mozilla-services/nginx_moz_ingest">module</a> accepts data via an <a href="https://wiki.mozilla.org/CloudServices/DataPipeline/HTTPEdgeServerSpecification">HTTP request</a> which it wraps in a Hindsight protobuf message and forwards to two places: a Kafka cluster and a short-lived S3 bucket (landfill) which acts as a fail-safe in case there is a processing error and/or data loss within the rest of the pipeline. The deployment scripts and configuration files of NGINX and Kafka live in a <a href="https://github.com/mozilla-services/puppet-config/tree/02f716a3e0df1117fc2494b41e85a1416f8e2a64/pipeline">private repository</a>.</p>
<p>The data from Kafka is read by the Complex Event Processors (CEP) and the Data Warehouse Loader (DWL), both of which use Hindsight.</p>
<h1 id="hindsight"><a class="header" href="#hindsight">Hindsight</a></h1>
<p><a href="https://github.com/mozilla-services/hindsight">Hindsight</a>, an open source stream processing software system developed by Mozilla as <a href="https://github.com/mozilla-services/heka">Heka</a>s successor, is useful for a wide variety of different tasks, such as:</p>
<ul>
<li>converting data from one format to another;</li>
<li>shipping data from one location to another;</li>
<li>performing real time analysis, graphing, and anomaly detection.</li>
</ul>
<p>Hindsight's core is a lightweight data processing kernel written in C that controls a set of Lua <a href="https://github.com/mozilla-services/hindsight/blob/9593668e84a642aff9dd95ccc648b6585948abfe/docs/index.md">plugins</a> executed inside a sandbox.</p>
<p>The CEP is a set of custom plugins, created, configured, and deployed from a <a href="https://github.com/mozilla-services/hindsight_admin">UI</a>, that produce real-time plots such as the number of pings matching certain criteria. Mozilla employees can <a href="concepts/pipeline/BROKEN:https://pipeline-cep.prod.mozaws.net/">access the UI</a> and create/deploy their own custom plugins in real time without interfering with other running plugins.</p>
<p><img src="concepts/pipeline/../../assets/CEP_custom_plugin.jpeg" alt="CEP Custom Plugin" title="CEP – a custom plugin in action" /></p>
<p>The DWL is composed of a set of plugins that transform, convert &amp; finally shovel pings into S3 for long term storage. In the specific case of Telemetry data, an input plugin <a href="https://github.com/mozilla-services/lua_sandbox_extensions/blob/0895238e32d25241ef46f561e43039beb201c7cd/kafka/sandboxes/heka/input/kafka.lua">reads pings from Kafka</a>, <a href="https://github.com/mozilla-services/lua_sandbox_extensions/blob/5d8907ee9f1a20e3a02bfe5b57d4312b173487a3/moz_telemetry/io_modules/decoders/moz_telemetry/ping.lua">pre-processes</a> them and <a href="https://github.com/mozilla-services/lua_sandbox_extensions/blob/5d8907ee9f1a20e3a02bfe5b57d4312b173487a3/moz_telemetry/sandboxes/heka/output/moz_telemetry_s3.lua">sends batches to S3</a>, our data lake, for long term storage. The data is compressed and partitioned by a set of dimensions, like date and application.</p>
<p>The data has traditionally been serialized to <a href="https://hekad.readthedocs.io/en/latest/message/index.html#stream-framing">Protobuf</a> sequence files which contain some nasty “free-form” JSON fields. Hindsight recently gained the ability to <a href="https://github.com/mozilla-services/lua_sandbox_extensions/pull/48">dump data directly in Parquet form</a>, though.</p>
<p>The deployment scripts and configuration files of the CEP &amp; DWL live in a <a href="https://github.com/mozilla-services/puppet-config/tree/02f716a3e0df1117fc2494b41e85a1416f8e2a64/pipeline">private repository</a>.</p>
<h1 id="spark-2"><a class="header" href="#spark-2">Spark</a></h1>
<p>Once the data reaches our data lake on S3 it can be processed with Spark on Mozilla's Databricks instance. Databricks allows Mozilla employees to write custom analyses in notebooks, and also schedule Databricks jobs to run periodically.</p>
<p>As mentioned earlier, most of our data lake contains data serialized to Protobuf with free-form JSON fields. Needless to say, parsing JSON is terribly slow when ingesting terabytes of data per day. A set of <a href="https://github.com/mozilla/telemetry-batch-view">ETL jobs</a>, written in Scala by Data Engineers and scheduled with <a href="https://github.com/mozilla/telemetry-airflow/">Airflow</a>, create Parquet views of our raw data. The GitHub repository <a href="https://github.com/mozilla/telemetry-batch-view/">telemetry-batch-view</a> showcases this.</p>
<h1 id="aggregates-dataset"><a class="header" href="#aggregates-dataset">Aggregates Dataset</a></h1>
<pre class="mermaid">graph TD
%% Data Flow Diagram for mozaggregator/TMO-adjacent services
firefox((fa:fa-firefox Firefox)) --&gt; |main ping| pipeline
fennec((fa:fa-firefox Fennec)) --&gt; |saved-session ping| pipeline
pipeline((Telemetry Pipeline))
subgraph mozaggregator
service(service)
aggregator
rdbms(fa:fa-database PostgreSQL)
end
pipeline --&gt; aggregator
pipeline --&gt; spark{fa:fa-star Spark}
pipeline --&gt; redash[fa:fa-line-chart Redash]
subgraph telemetry.mozilla.org
telemetry.js(telemetry.js) --&gt; dist
telemetry.js --&gt; evo
orphan[Update Orphaning]
crashdash[tmo/crashes]
end
redash --&gt; crashdash
service --&gt; telemetry.js
spark --&gt; orphan
telemetry.js --&gt; telemetry-next-node(telemetry-next-node)
subgraph alerts.tmo
cerberus[fa:fa-search-plus Cerberus] --&gt;medusa
medusa --&gt; html
medusa --&gt; email
end
telemetry-next-node --&gt; cerberus
style redash fill:salmon
style spark fill:darkorange
style rdbms fill:cornflowerblue
style cerberus fill:royalblue
style firefox fill:#f61
style fennec fill:#f61
style telemetry.js fill:lightgrey
style dist fill:lightgrey
style evo fill:lightgrey
</pre>
<p>A dedicated Spark job feeds daily aggregates to a PostgreSQL database which powers an <a href="https://github.com/mozilla/python_mozaggregator/#api">HTTP service</a> to easily retrieve faceted roll-ups. The service is mainly used by <a href="https://telemetry.mozilla.org/">TMO</a>, a dashboard that visualizes distributions and time-series, and <a href="https://github.com/mozilla/cerberus/">Cerberus</a>, an anomaly detection tool that detects changes in the distributions and alerts developers. Originally the sole purpose of the Telemetry pipeline was to feed data into this dashboard, but over time its scope and flexibility grew to support more general use-cases.</p>
<p><img src="concepts/pipeline/../../assets/TMO_example.jpeg" alt="TMO" title="TMO – timeseries" /></p>
<h1 id="presto--stmo"><a class="header" href="#presto--stmo">Presto &amp; STMO</a></h1>
<p>We maintain a couple of <a href="https://github.com/mozilla/emr-bootstrap-presto">Presto clusters</a> and a centralized Hive metastore to query Parquet data with SQL. The Hive metastore provides a universal view of our Parquet dataset to both Spark and Presto clusters.</p>
<p>Presto, and other databases, are behind a <a href="https://redash.io/">Redash</a> service (<a href="https://sql.telemetry.mozilla.org/">STMO</a>) which provides a convenient &amp; powerful interface to query SQL engines and build dashboards that can be shared within the company. Mozilla maintains its own <a href="https://github.com/mozilla/redash">fork of Redash</a> to iterate quickly on new features, but as good open source citizens we push our changes upstream.</p>
<p><img src="concepts/pipeline/../../assets/STMO_example.jpeg" alt="STMO" title="STMO – who doesnt love SQL?" /></p>
<h1 id="is-that-it"><a class="header" href="#is-that-it">Is that it?</a></h1>
<p>No, not really. If you want to read more, check out <a href="concepts/pipeline/data_pipeline_detail.html">this article</a>. For example, the DWL pushes some of the Telemetry data to Redshift and other tools that satisfy more niche needs. The pipeline ingests logs from services as well, and there are many specialized dashboards out there I haven't mentioned.</p>
<p>There is a vast ecosystem of tools for processing data at scale, each with their pros &amp; cons. The pipeline grew organically, and we added new tools as new use-cases came up that we couldn't solve with our existing stack. There are still scars left from that growth, though, which require some effort to get rid of, like ingesting data in a schema-less format.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/pipeline/data_pipeline.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="a-detailed-look-at-the-data-platform"><a class="header" href="#a-detailed-look-at-the-data-platform">A Detailed Look at the Data Platform</a></h1>
<p>For a more gentle introduction to the data platform, please read the <a href="concepts/pipeline/data_pipeline.html">Pipeline Overview</a> article.</p>
<p>This article goes into more depth about the architecture and flow of data in the platform.</p>
<h2 id="the-entire-platform"><a class="header" href="#the-entire-platform">The Entire Platform</a></h2>
<p>The full detail of the platform can get quite complex, but at a high level the structure is fairly simple.</p>
<pre class="mermaid">graph LR
Producers[Data Producers] --&gt; Ingestion
Ingestion --&gt; Storage[Long-term Storage]
Ingestion --&gt; Stream[Stream Processing]
Stream --&gt; Storage
Batch[Batch Processing] --&gt; Storage
Storage --&gt; Batch
Self[Self Serve] -.- Stream
Self -.- Batch
Stream -.-&gt; Visualization
Batch -.-&gt; Visualization
Stream --&gt; Export
Batch --&gt; Export
</pre>
<p>Each of these high-level parts of the platform is described in more detail below.</p>
<h2 id="data-producers"><a class="header" href="#data-producers">Data Producers</a></h2>
<p>By far most data handled by the Data Platform is <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/main-ping.html">produced by Firefox</a>. There are other producers, though, and the eventual aim is to generalize data production using a client SDK or set of standard tools.</p>
<p>Most data is submitted via HTTP POST, but data is also produced in the form of service logs and <code>statsd</code> messages.</p>
<p>If you would like to locally test a new data producer, the <a href="https://github.com/mozilla/gzipServer"><code>gzipServer</code></a> project provides a simplified server that makes it easy to inspect submitted messages.</p>
<h2 id="ingestion-2"><a class="header" href="#ingestion-2">Ingestion</a></h2>
<pre class="mermaid">graph LR
subgraph HTTP
tee
lb[Load Balancer]
mozingest
end
subgraph Kafka
kafka_unvalidated[Kafka unvalidated]
kafka_validated[Kafka validated]
zookeeper[ZooKeeper] -.- kafka_unvalidated
zookeeper -.- kafka_validated
end
subgraph Storage
s3_heka[S3 Heka Protobuf Storage]
s3_parquet[S3 Parquet Storage]
end
subgraph Data Producers
Firefox --&gt; lb
more_producers[Other Producers] --&gt; lb
end
lb --&gt; tee
tee --&gt; mozingest
mozingest --&gt; kafka_unvalidated
mozingest --&gt; Landfill
kafka_unvalidated --&gt; dwl[Data Store Loader]
kafka_validated --&gt; cep[Hindsight CEP]
kafka_validated --&gt; sparkstreaming[Spark Streaming]
Schemas -.-&gt;|validation| dwl
dwl --&gt; kafka_validated
dwl --&gt; s3_heka
dwl --&gt; s3_parquet
sparkstreaming --&gt; s3_parquet
</pre>
<p>Data arrives as an HTTP POST of an optionally gzipped payload of JSON. See the common <a href="concepts/pipeline/http_edge_spec.html">Edge Server</a> specification for details.</p>
<p>Submissions hit a load balancer which handles the SSL connection, then forwards to a &quot;tee&quot; server, which may direct some or all submissions to alternate backends. In the past, the tee was used to manage the <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1302265">cutover between different versions of the backend</a> infrastructure. It is implemented as an <a href="http://openresty.org/en/"><code>OpenResty</code></a> plugin.</p>
<p>From there, the <a href="https://github.com/mozilla-services/nginx_moz_ingest"><code>mozingest</code></a> HTTP Server receives submissions from the tee and batches and stores data durably on Amazon S3 as a fail-safe (we call this &quot;Landfill&quot;). Data is then passed along via <a href="https://kafka.apache.org/">Kafka</a> for validation and further processing. If there is a problem with decoding, validation, or any of the code described in the rest of this section, data can be re-processed from this fail-safe store. The <code>mozingest</code> server is implemented as an <code>nginx</code> module.</p>
<p>Validation, at a minimum, ensures that a payload is valid JSON (possibly compressed). Many document types also have a <a href="https://github.com/mozilla-services/mozilla-pipeline-schemas">JSONSchema specification</a>, and are further validated against that.</p>
<p>Invalid messages are redirected to a separate &quot;errors&quot; stream for debugging and inspection.</p>
<p>Valid messages proceed for further decoding and processing. This involves things like doing GeoIP lookup and discarding the IP address, and attaching some HTTP header info as annotated metadata.</p>
<p>Validated and annotated messages become available for stream processing.</p>
<p>They are also batched and stored durably for later batch processing and ad-hoc querying.</p>
<p>See also the &quot;<a href="https://docs.google.com/document/d/1PqiF1rF2fCk_kQuGSwGwildDf4Crg9MJTY44E6N5DSk/edit">generic ingestion</a>&quot; proposal which aims to make ingestion, validation, storage, and querying available as self-serve for platform users.</p>
<h5 id="data-flow-for-valid-submissions"><a class="header" href="#data-flow-for-valid-submissions">Data flow for valid submissions</a></h5>
<pre class="mermaid">sequenceDiagram
participant Fx as Firefox
participant lb as Load Balancer
participant mi as mozingest
participant lf as Landfill
participant k as Kafka
participant dwl as Data Store Loader
participant dl as Data Lake
Fx-&gt;&gt;lb: HTTPS POST
lb-&gt;&gt;mi: forward
mi--&gt;&gt;lf: failsafe store
mi-&gt;&gt;k: enqueue
k-&gt;&gt;dwl: validate, decode
dwl-&gt;&gt;k: enqueue validated
dwl-&gt;&gt;dl: store durably
</pre>
<h5 id="other-ingestion-methods"><a class="header" href="#other-ingestion-methods">Other ingestion methods</a></h5>
<p>Hindsight is used for <a href="https://mozilla-services.github.io/lua_sandbox_extensions/moz_logging/">ingestion of logs</a> from applications and services; it supports parsing of log lines and appending metadata similar to the HTTP ingestion above (timestamp, source, and so on).</p>
<p><a href="https://github.com/etsy/statsd"><code>Statsd</code></a> messages are ingested in the usual way.</p>
<h2 id="storage"><a class="header" href="#storage">Storage</a></h2>
<pre class="mermaid">graph TD
subgraph RDBMS
PostgreSQL
Redshift
MySQL
BigQuery
end
subgraph NoSQL
DynamoDB
end
subgraph S3
landfill[Landfill]
s3_heka[Heka Data Lake]
s3_parquet[Parquet Data Lake]
s3_analysis[Analysis Outputs]
s3_public[Public Outputs]
end
Ingestion --&gt; s3_heka
Ingestion --&gt; s3_parquet
Ingestion --&gt; landfill
Ingestion -.-&gt; stream[Stream Processing]
stream --&gt; s3_parquet
batch[Batch Processing] --&gt; s3_parquet
batch --&gt; PostgreSQL
batch --&gt; DynamoDB
batch --&gt; s3_public
selfserve[Self Serve] --&gt; s3_analysis
s3_analysis --&gt; selfserve
Hive --&gt;|Presto| STMO[STMO]
PostgreSQL --&gt; STMO
Redshift --&gt; STMO
MySQL --&gt; STMO
BigQuery --&gt; STMO
s3_parquet -.- Hive
</pre>
<p><a href="https://aws.amazon.com/s3/">Amazon S3</a> forms the backbone of the platform storage layer. The primary format used in the Data Lake is <a href="https://parquet.apache.org/">parquet</a>, which is a strongly typed columnar storage format that can easily be read and written by <a href="https://spark.apache.org/docs/latest/index.html">Spark</a>, as well as being compatible with SQL interfaces such as <a href="https://cwiki.apache.org/confluence/display/Hive/Home">Hive</a> and <a href="http://prestosql.io/">Presto</a>. Some data is also stored in <a href="https://hekad.readthedocs.io/en/dev/message/index.html#stream-framing">Heka-framed protobuf</a> format. This custom format is usually reserved for data where we do not have a complete <a href="https://github.com/mozilla-services/mozilla-pipeline-schemas">JSONSchema specification</a>.</p>
<p>Using S3 for storage avoids the need for an always-on cluster, which means that data at rest is inexpensive. S3 also makes it very easy to automatically expire (delete) objects after a certain period of time, which is helpful for implementing data retention policies.</p>
<p>Once written to S3, the data is typically treated as immutable - data is not appended to existing files, nor is data normally updated in place. The exception here is when data is back-filled, in which case previous data may be overwritten.</p>
<p>There are a number of other types of storage used for more specialized applications, including relational databases (such as PostgreSQL for the <a href="https://github.com/mozilla/python_mozaggregator/#api">Telemetry Aggregates</a>) and NoSQL databases (DynamoDB is used for a backing store for the <a href="https://github.com/mozilla/python_mozetl/blob/master/mozetl/taar/taar_dynamo.py">TAAR project</a>). Reading data from a variety of RDBMS sources is also supported via STMO.</p>
<p>The data stored in Heka format is <a href="concepts/pipeline/../../tools/spark.html">readable from Spark</a> using libraries in <a href="https://github.com/mozilla/moztelemetry/blob/master/src/main/scala/com/mozilla/telemetry/heka/Dataset.scala">Scala</a> or <a href="https://mozilla.github.io/python_moztelemetry/api.html#dataset">Python</a>.</p>
<p>Parquet data can be read and written natively from Spark, and many datasets are indexed in a <a href="https://cwiki.apache.org/confluence/display/Hive/Home">Hive</a> Metastore, making them available through a SQL interface on STMO and in notebooks via Spark SQL. Many other SQL data sources are also made available via STMO, see <a href="concepts/pipeline/../../tools/stmo.html">this article</a> for more information on accessing data using SQL.</p>
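<p>For instance, any Parquet dataset registered in the Hive Metastore can be queried with plain SQL from STMO or a Spark SQL notebook. The sketch below uses a hypothetical dataset name (<code>example_dataset_v1</code>) and partition column (<code>submission_date</code>) purely for illustration; any Hive-registered dataset works the same way.</p>
<pre><code class="language-sql">-- Illustrative only: `example_dataset_v1` and `submission_date` stand in for
-- any Parquet dataset registered in the Hive Metastore.
SELECT
  submission_date,
  COUNT(*) AS row_count
FROM
  example_dataset_v1
WHERE
  submission_date &gt;= '20180101'
GROUP BY
  submission_date
ORDER BY
  submission_date
</code></pre>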
<p>There is a separate data store for self-serve <strong>Analysis Outputs</strong>, intended to keep ad-hoc, temporary data out of the Data Lake. This is implemented as a separate S3 location, with personal output locations prefixed with each person's user id, similar to the layout of the <code>/home</code> directory on a Unix system.</p>
<p>Analysis outputs can also be made public using the <strong>Public Outputs</strong> bucket. This is a web-accessible S3 location for powering public dashboards. This public data is available at <code>https://analysis-output.telemetry.mozilla.org/&lt;job name&gt;/data/&lt;files&gt;</code>.</p>
<h2 id="stream-processing"><a class="header" href="#stream-processing">Stream Processing</a></h2>
<p>Stream processing is done using <a href="https://github.com/mozilla-services/hindsight">Hindsight</a> and <a href="https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html">Spark Streaming</a>.</p>
<p>Hindsight allows you to run <a href="http://mozilla-services.github.io/lua_sandbox/">plugins written in Lua inside a sandbox</a>. This gives a safe, performant way to do self-serve streaming analysis. Hindsight plugins do the initial data validation and decoding, as well as writing out to long-term storage in both <a href="https://hekad.readthedocs.io/en/dev/message/index.html#stream-framing">Heka-framed protobuf</a> and <a href="https://mozilla-services.github.io/lua_sandbox_extensions/parquet/">parquet</a> forms.</p>
<p>Spark Streaming is used to read from Kafka and perform <a href="https://github.com/mozilla/telemetry-streaming">low-latency ETL and aggregation tasks</a>. These aggregates are currently used by <a href="https://mozilla.cloud.looker.com/dashboards/918">Mission Control</a> and are also available for querying via <a href="concepts/pipeline/../../tools/stmo.html">STMO</a>.</p>
<h2 id="batch-processing"><a class="header" href="#batch-processing">Batch Processing</a></h2>
<p>Batch processing is done using <a href="https://spark.apache.org/docs/latest/index.html">Spark</a>. Production ETL code is written in both <a href="https://github.com/mozilla/python_mozetl">Python</a> and <a href="https://github.com/mozilla/telemetry-batch-view">Scala</a>.</p>
<p>There are <a href="https://mozilla.github.io/python_moztelemetry/api.html#dataset">Python</a> and <a href="https://github.com/mozilla/moztelemetry/blob/master/src/main/scala/com/mozilla/telemetry/heka/Dataset.scala">Scala</a> libraries for reading data from the Data Lake in <a href="https://hekad.readthedocs.io/en/dev/message/index.html#stream-framing">Heka-framed protobuf</a> form, though it is much easier and more performant to make use of a derived dataset whenever possible.</p>
<p>Datasets in parquet format can be read natively by Spark, either using Spark SQL or by reading data directly from S3.</p>
<p>Data produced by production jobs goes into the Data Lake, while output from ad-hoc jobs goes into Analysis Outputs.</p>
<p>Job scheduling and dependency management is done using <a href="https://github.com/mozilla/telemetry-airflow">Airflow</a>. Most jobs run once a day, processing data from &quot;yesterday&quot; on each run. A typical job launches a cluster, which fetches the specified ETL code as part of its bootstrap on startup, runs the ETL code, then shuts down upon completion. If something goes wrong, a job may time out or fail, and in this case it is retried automatically.</p>
<h2 id="self-serve-data-analysis"><a class="header" href="#self-serve-data-analysis">Self Serve Data Analysis</a></h2>
<pre class="mermaid">graph TD
subgraph Storage
lake[Data Lake]
s3_output_public[Public Outputs]
s3_output_private[Analysis Outputs]
end
subgraph STMO
STMO[STMO] --&gt;|read| lake
end
subgraph TMO
evo[Evolution Dashboard]
histo[Histogram Dashboard]
agg[Telemetry Aggregates]
evo -.- agg
histo -.- agg
end
subgraph Databricks
db_notebook[Notebook]
db_notebook --&gt;|read + write| lake
end
</pre>
<p>Most of the data analysis tooling has been developed with the goal of being &quot;self-serve&quot;. This means that people should be able to access and analyze data on their own, without involving data engineers or operations. Thus, data access can scale beyond a small set of people with specialized knowledge of the entire pipeline.</p>
<p>The use of these self-serve tools is described in the tutorials section of this site. This section focuses on how these tools integrate with the platform infrastructure.</p>
<h5 id="stmo-sql-analysis"><a class="header" href="#stmo-sql-analysis">STMO: SQL Analysis</a></h5>
<p><a href="concepts/pipeline/../../tools/stmo.html">STMO</a> is a customized <a href="https://redash.io">Redash</a> installation that provides self-serve access to a a variety of different datasets. From here, you can query data in the Parquet Data Lake as well as various RDBMS data sources.</p>
<p>STMO interfaces with the data lake using both <a href="http://prestosql.io/">Presto</a> and Amazon <a href="https://aws.amazon.com/athena/">Athena</a>. Each has its own data source in STMO. Since Athena does not support user-defined functions, datasets with HyperLogLog columns, such as <a href="concepts/pipeline/../../datasets/obsolete/client_count_daily/reference.html"><code>client_count_daily</code></a>, are only available via Presto.</p>
<p>Different <strong>Data Sources</strong> in STMO connect to different backends, and each backend might use a slightly different flavor of SQL. You should find a link to the documentation for the expected SQL variant next to the Data Sources list.</p>
<p>Queries can be run just once, or scheduled to run periodically to keep data up-to-date.</p>
<p>There is a command-line interface to STMO called <a href="https://github.com/mozilla/stmocli">St. Mocli</a>, if you prefer writing SQL using your own editor and tools.</p>
<h5 id="databricks-managed-spark-analysis"><a class="header" href="#databricks-managed-spark-analysis">Databricks: Managed Spark Analysis</a></h5>
<p>Our Databricks instance (see <a href="https://docs.databricks.com/user-guide/notebooks/index.html">Databricks docs</a>) offers another notebook interface for doing analysis in Scala, SQL, Python and R.</p>
<p>Databricks provides an always-on shared server which is nice for quick data investigations.</p>
<h5 id="tmo-aggregate-graphs"><a class="header" href="#tmo-aggregate-graphs">TMO: Aggregate Graphs</a></h5>
<p><a href="https://telemetry.mozilla.org">TMO</a> provides easy visualizations of histogram and scalar measures over time. Time can be in terms of either builds or submission dates. This is the most convenient interface to the Telemetry data, as it does not require any custom code.</p>
<h2 id="visualization"><a class="header" href="#visualization">Visualization</a></h2>
<p>There are a number of visualization libraries and tools being used to display data.</p>
<h5 id="tmo-dashboards"><a class="header" href="#tmo-dashboards">TMO Dashboards</a></h5>
<p>The landing page at <a href="https://telemetry.mozilla.org"><code>telemetry.mozilla.org</code></a> is a good place to look for existing graphs, notably the <a href="https://telemetry.mozilla.org/new-pipeline/dist.html">measurement dashboard</a> which gives a lot of information about histogram and scalar measures collected on pre-release channels.</p>
<h5 id="notebooks"><a class="header" href="#notebooks">Notebooks</a></h5>
<p>Interactive notebooks have become standard in the industry, and Mozilla makes heavy use of this approach. Databricks makes it easy to run, share, and schedule notebooks.</p>
<h5 id="others"><a class="header" href="#others">Others</a></h5>
<p><a href="concepts/pipeline/../../tools/stmo.html">STMO</a> lets you query the data using SQL, but it also supports a number of useful visualizations.</p>
<p><a href="concepts/pipeline/BROKEN:http://pipeline-cep.prod.mozaws.net/">Hindsight's web interface</a> has the ability to visualize time-series data.</p>
<p><a href="https://mozilla.cloud.looker.com/dashboards/918">Mission Control</a> gives a low-latency view into release health.</p>
<p>Many bespoke visualizations are built using the <a href="http://metricsgraphicsjs.org/">Metrics Graphics</a> library as a display layer.</p>
<h2 id="monitoring-and-alerting"><a class="header" href="#monitoring-and-alerting">Monitoring and Alerting</a></h2>
<p>There are multiple layers of monitoring and alerting.</p>
<p>At a low level, the system is monitored to ensure that it is functioning as expected. This includes things like machine-level resources (network capacity, disk space, available RAM, CPU load) which are typically monitored using <a href="http://datadoghq.com/">DataDog</a>.</p>
<p>Next, we monitor the &quot;transport&quot; functionality of the system. This includes monitoring incoming submission rates, payload sizes, traffic patterns, schema validation failure rates, and alerting if anomalies are detected. This type of anomaly detection and alerting is handled by <a href="https://github.com/mozilla-services/hindsight">Hindsight</a>.</p>
<p>Once data has been safely ingested and stored, we run some automatic regression detection on all Telemetry <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/histograms.html">histogram measures</a> using <a href="https://github.com/mozilla/cerberus">Cerberus</a>. This code looks for changes in the distribution of a measure, and emails probe owners if a significant change is observed.</p>
<p>Production ETL jobs are run via <a href="https://github.com/mozilla/telemetry-airflow">Airflow</a>, which monitors batch job progress and alerts if there are failures in any job. Self-serve batch jobs running via Databricks also generate alerts upon failure.</p>
<p>Scheduled <a href="concepts/pipeline/../../tools/stmo.html">STMO</a> queries may also be configured to generate alerts, which are used to monitor the last-mile, user-facing status of derived datasets. STMO may also be used to monitor and alert on high-level characteristics of the data, or any other condition that can be expressed as a query.</p>
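<p>For example, a scheduled query along the following lines (a hypothetical sketch; the table and look-back window are only illustrative) could back an STMO alert that fires when a derived table stops receiving new data:</p>
<pre><code class="language-sql">-- Hypothetical freshness check for a derived table; an STMO alert could be
-- configured to fire when days_behind exceeds an acceptable threshold.
SELECT
  MAX(submission_date) AS latest_date,
  DATE_DIFF(CURRENT_DATE(), MAX(submission_date), DAY) AS days_behind
FROM
  `moz-fx-data-shared-prod.telemetry.clients_daily`
WHERE
  submission_date &gt;= DATE_SUB(CURRENT_DATE(), INTERVAL 28 DAY)
</code></pre>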
<h2 id="data-exports"><a class="header" href="#data-exports">Data Exports</a></h2>
<p>Data is exported from the pipeline to a few other tools and systems. Examples include integration with <a href="https://amplitude.com/">Amplitude</a> for mobile and product analytics and shipping data to other parts of the Mozilla organization.</p>
<p>There are also a few data sets which are made publicly available, such as the <a href="https://data.firefox.com/dashboard/hardware">Firefox Hardware Report</a>.</p>
<h2 id="bringing-it-all-together"><a class="header" href="#bringing-it-all-together">Bringing it all together</a></h2>
<p>Finally, here is a more detailed view of the entire platform. Some connections are omitted for clarity.</p>
<pre class="mermaid">graph LR
subgraph Data Producers
Firefox
more_producers[...]
end
subgraph Storage
Landfill
warehouse_heka[Heka Data Lake]
warehouse_parquet[Parquet Data Lake]
warehouse_analysis[Analysis Outputs]
PostgreSQL
Redshift
MySQL
hive[Hive] -.- warehouse_parquet
end
subgraph Stream Processing
cep[Hindsight Streaming]
dwl[Data Store Loader] --&gt; warehouse_heka
dwl --&gt; warehouse_parquet
sparkstreaming[Spark Streaming] --&gt; warehouse_parquet
end
subgraph Ingestion
Firefox --&gt; lb[Load Balancer]
more_producers --&gt; lb
lb --&gt; tee
tee --&gt; mozingest
mozingest --&gt; kafka
mozingest --&gt; Landfill
ZooKeeper -.- kafka[Kafka]
kafka --&gt; dwl
kafka --&gt; cep
kafka --&gt; sparkstreaming
end
subgraph Batch Processing
Airflow -.-&gt;|spark|tbv[telemetry-batch-view]
Airflow -.-&gt;|spark|python_mozetl
warehouse_heka --&gt; tbv
warehouse_parquet --&gt; tbv
warehouse_heka --&gt; python_mozetl
warehouse_parquet --&gt; python_mozetl
tmo_agg[Telemetry Aggregates]
end
subgraph Visualization
Hindsight
Jupyter
Zeppelin
TMO
redash_graphs[STMO]
MissionControl
bespoke_viz[Bespoke Viz]
end
subgraph Export
tbv --&gt; Amplitude
sparkstreaming --&gt; Amplitude
end
subgraph Self Serve
redash[STMO] -.-&gt; Presto
Presto --&gt; hive
redash -.-&gt; Athena
Athena --&gt; hive
warehouse_heka --&gt; spcluster
warehouse_parquet --&gt; spcluster
spcluster --&gt; warehouse_analysis
end
Schemas -.-&gt;|validation| dwl
</pre>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/pipeline/data_pipeline_detail.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="standard-metrics"><a class="header" href="#standard-metrics">Standard Metrics</a></h1>
<blockquote>
<p><strong>⚠</strong> Some of the information in this section is obsolete. For an up to date explanation of accepted metrics used at Mozilla, <a href="metrics/../concepts/getting_help.html">contact the Data team</a>.</p>
</blockquote>
<p>This section provides an overview of standard metrics used at Mozilla.
Here you'll find the definitions and descriptions for each.</p>
<p>For a deep dive into these metrics, see <a href="https://mozilla.github.io/gud/">the GUD documentation</a>.</p>
<p>The <a href="metrics/../concepts/index.html">Telemetry Behavior Reference</a> section also provides
information related to the definitions below.</p>
<p>We are now in the process of setting up the metrics top-level section here. This information will be moved into the appropriate subsection and this page will be replaced with an overview.</p>
<h2 id="activity"><a class="header" href="#activity">Activity</a></h2>
<h3 id="dau-1"><a class="header" href="#dau-1">DAU</a></h3>
<p>The number of unique profiles active on each day.</p>
<h3 id="wau-1"><a class="header" href="#wau-1">WAU</a></h3>
<p>The number of unique profiles active at least once during the 7-day window
ending on the specified day.</p>
<h3 id="mau-1"><a class="header" href="#mau-1">MAU</a></h3>
<p>The number of unique profiles active at least once during the 28-day window
ending on the specified day.</p>
<h3 id="intensity-1"><a class="header" href="#intensity-1">Intensity</a></h3>
<p>Intuitively, how many days per week do users use the product? Among profiles
active at least once in the week ending on the date specified, the number of
days on average they were active during that one-week window.</p>
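<p>As a rough sketch (not the authoritative implementation, which is maintained in <a href="https://mozilla.github.io/gud/">GUD</a> and metric-hub), these activity metrics can be approximated for a single day from the <code>clients_last_seen</code> table, assuming that <code>days_since_seen</code> is the number of days since the profile was last active and that <code>days_seen_bits</code> packs the last 28 days of activity into an integer whose least-significant bit represents the row's <code>submission_date</code>:</p>
<pre><code class="language-sql">-- Rough sketch of DAU, WAU, MAU, and intensity for a single day.
SELECT
  submission_date,
  COUNTIF(days_since_seen = 0) AS dau,
  COUNTIF(days_since_seen &lt; 7) AS wau,
  COUNTIF(days_since_seen &lt; 28) AS mau,
  -- intensity: average number of days active in the trailing week, among
  -- profiles active at least once in that week (last 7 bits of days_seen_bits)
  SAFE_DIVIDE(
    SUM(IF(days_since_seen &lt; 7, BIT_COUNT(days_seen_bits &amp; 0x7F), 0)),
    COUNTIF(days_since_seen &lt; 7)
  ) AS intensity
FROM
  `moz-fx-data-shared-prod.telemetry.clients_last_seen`
WHERE
  submission_date = '2020-06-01'
GROUP BY
  submission_date
</code></pre>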
<h2 id="retention-1"><a class="header" href="#retention-1">Retention</a></h2>
<h3 id="1-week-new-profile-retention"><a class="header" href="#1-week-new-profile-retention">1-Week New Profile Retention</a></h3>
<p>Among new profiles created on the day specified, what proportion (out of 1) are
active during the week beginning one week after the day specified.</p>
<h3 id="1-week-retention"><a class="header" href="#1-week-retention">1-Week Retention</a></h3>
<p>Among profiles that were active at least once in the week starting on the
specified day, what proportion (out of 1) are active during the following week.</p>
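<p>A minimal sketch of 1-Week Retention for one example week, computed with a self-join on <code>clients_daily</code> (an illustration only; the canonical definitions are maintained in GUD and metric-hub):</p>
<pre><code class="language-sql">-- 1-Week Retention for the week starting 2020-06-01: of profiles active in
-- that week, the share that is also active in the following week.
WITH week_0 AS (
  SELECT DISTINCT client_id
  FROM `moz-fx-data-shared-prod.telemetry.clients_daily`
  WHERE submission_date BETWEEN '2020-06-01' AND '2020-06-07'
),
week_1 AS (
  SELECT DISTINCT client_id
  FROM `moz-fx-data-shared-prod.telemetry.clients_daily`
  WHERE submission_date BETWEEN '2020-06-08' AND '2020-06-14'
)
SELECT
  COUNT(w1.client_id) / COUNT(*) AS one_week_retention
FROM
  week_0 AS w0
LEFT JOIN
  week_1 AS w1
ON
  w0.client_id = w1.client_id
</code></pre>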
<h2 id="frequently-asked-questions"><a class="header" href="#frequently-asked-questions">Frequently Asked Questions</a></h2>
<ul>
<li>Why isn't &quot;New Users&quot; a metric?
<ul>
<li>&quot;New Users&quot; is considered a <a href="https://mozilla.github.io/gud#data-model">usage criterion</a>, which means it may be used
to filter other metrics, rather than itself being a metric. For example,
you can compute &quot;New User DAU&quot;, which would be the subset of DAU that match
the &quot;New User&quot; criterion. The exception here is 1-Week New Profile
Retention, which is so common that it makes sense to include with all
usage criteria.</li>
</ul>
</li>
<li>What dates are used to determine activity for things like MAU and DAU?
<ul>
<li><a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1422892">Submission dates</a> are used for determining when activity happened (<em>not</em>
client-side activity dates).</li>
</ul>
</li>
<li>What <a href="metrics/../datasets/pings.html">pings</a> are used as a signal of activity?
<ul>
<li>For Firefox Desktop, we use the <code>main</code> ping to determine activity.</li>
<li>For products instrumented using Glean, we use the <code>baseline</code> ping.</li>
</ul>
</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/metrics/index.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="metric-definitions"><a class="header" href="#metric-definitions">Metric Definitions</a></h1>
<p>These sections provide the lists of our standardized metrics, usage criteria, and slicing dimensions. All three are important to provide a holistic view of product usage and performance.</p>
<ul>
<li>
<p>A <a href="metrics/./usage.html"><code>usage criterion</code></a> tells us what has to happen for us to consider a product to have been used in a certain way and is needed to define metrics like MAU, which count users that use our products. For example, “Any Fenix Activity” measures activity for all Fenix profiles that have sent a <code>baseline</code> ping. Similarly, “Opened DevTools” means that we measure activity for all desktop Firefox profiles that have sent a telemetry ping indicating that DevTools was opened. We are working towards developing a standard usage criteria for each product, which operationalizes the idea that the product was &quot;used&quot;.</p>
</li>
<li>
<p>A <a href="metrics/./dimensions.html"><code>dimension</code></a> allows slicing to a subset of profiles according to characteristics of those profiles. Some dimensions include: country, channel, OS, etc. A <code>slice</code> is a set of particular values within dimensions, for example, “country is US” is one <code>slice</code> and “country is either US or DE and channel is release” is another.</p>
</li>
<li>
<p>A <a href="metrics/./metrics.html"><code>metric</code></a> is any quantity that we can calculate using our data and that, to some degree, measures some quantity of interest.</p>
</li>
</ul>
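<p>Putting these three concepts together, here is a hedged sketch of a metric (DAU) computed under a desktop usage criterion (the profile appears in <code>clients_daily</code>, i.e. it sent a <code>main</code> ping) for the slice &quot;country is US and channel is release&quot;. The column names are assumptions about the <code>clients_daily</code> schema:</p>
<pre><code class="language-sql">-- Illustrative only: DAU under the default desktop usage criterion,
-- restricted to the slice where country is US and channel is release.
SELECT
  submission_date,
  COUNT(DISTINCT client_id) AS dau
FROM
  `moz-fx-data-shared-prod.telemetry.clients_daily`
WHERE
  submission_date = '2020-06-01'
  AND country = 'US'                  -- slice: country
  AND normalized_channel = 'release'  -- slice: channel (column name assumed)
GROUP BY
  submission_date
</code></pre>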
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/metrics/definitions.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="metrics-1"><a class="header" href="#metrics-1">Metrics</a></h1>
<p>This section contains definitions and information about standard metrics used at Mozilla. You may wish to refer to the <a href="metrics/../concepts/terminology.html">terminology section</a> while reading this document, in case a particular concept is not clear.</p>
<h2 id="table-of-contents-21"><a class="header" href="#table-of-contents-21">Table of Contents</a></h2>
<ul>
<li><a href="metrics/metrics.html#daily-active-users-dau">Daily Active Users (DAU)</a>
<ul>
<li><a href="metrics/metrics.html#overview">Overview</a></li>
<li><a href="metrics/metrics.html#details">Details</a></li>
<li><a href="metrics/metrics.html#caveats">Caveats</a></li>
</ul>
</li>
<li><a href="metrics/metrics.html#day-2-7-activation">Day 2-7 Activation</a>
<ul>
<li><a href="metrics/metrics.html#overview-1">Overview</a></li>
<li><a href="metrics/metrics.html#details-1">Details</a></li>
<li><a href="metrics/metrics.html#dashboards">Dashboards</a></li>
<li><a href="metrics/metrics.html#tables">Tables</a></li>
</ul>
</li>
</ul>
<hr />
<h2 id="daily-active-users-dau"><a class="header" href="#daily-active-users-dau">Daily Active Users (DAU)</a></h2>
<h3 id="overview-5"><a class="header" href="#overview-5">Overview</a></h3>
<p>Daily Active Users or DAU counts the number of unique profiles active in the product on each day. This is intended to approximate the number of people using the product each day.</p>
<p>The most accurate and up-to-date metric information can be found on the
<a href="https://mozilla-hub.atlassian.net/wiki/spaces/DATA/pages/620494911/Metrics+Inventory">Metrics Inventory</a> page in Confluence (Mozilla LDAP required).</p>
<p>The official source code definitions for the metrics can be found in
<a href="https://github.com/mozilla/metric-hub/tree/main/definitions">metric-hub</a> on GitHub.</p>
<h3 id="details"><a class="header" href="#details">Details</a></h3>
<p>DAU counts unique profiles. Keep in mind that a <a href="metrics/../concepts/analysis_gotchas.html#profiles-vs-users">profile is not necessarily a user</a>.</p>
<p>The standard concept of active varies by product, but generally, active users are defined as unique profiles that have sent a <code>main</code> ping (on desktop) or a <code>baseline</code> ping (on mobile). The precise criteria are defined in the <a href="metrics/./usage.html"><code>usage criterion</code></a> section of this documentation.</p>
<p>We can also restrict the metric to alternative usage criteria. It's <strong>critical to clearly state any non-standard usage criteria on the metric</strong>. The metrics team suggests the following format: <code>DAU(usage criterion)</code>. For example, we might be interested in the count of profiles that have viewed more than 5 URIs in a day. We'd denote that metric as <code>DAU(URI &gt; 5)</code>.</p>
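<p>As an illustration, here is a rough sketch of <code>DAU(URI &gt; 5)</code> over <code>clients_daily</code>, assuming the per-day URI count is exposed in the <code>scalar_parent_browser_engagement_total_uri_count_sum</code> column:</p>
<pre><code class="language-sql">-- DAU restricted to the usage criterion: viewed more than 5 URIs in the day.
SELECT
  submission_date,
  COUNT(DISTINCT client_id) AS dau_uri_gt_5
FROM
  `moz-fx-data-shared-prod.telemetry.clients_daily`
WHERE
  submission_date = '2020-06-01'
  AND scalar_parent_browser_engagement_total_uri_count_sum &gt; 5
GROUP BY
  submission_date
</code></pre>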
<p>Some common alternative usage criteria are documented in the <a href="metrics/./usage.html"><code>usage criterion</code> section</a>.</p>
<h3 id="caveats-2"><a class="header" href="#caveats-2">Caveats</a></h3>
<p>If the number of users stays constant, but the average number of active profiles per user increases, this metric will tend to increase. For more details on the relationship between users and profiles, see <a href="metrics/../concepts/analysis_gotchas.html#profiles-vs-users">the profiles vs users section in analysis gotchas</a>.</p>
<hr />
<h2 id="day-2-7-activation"><a class="header" href="#day-2-7-activation">Day 2-7 Activation</a></h2>
<h3 id="overview-6"><a class="header" href="#overview-6">Overview</a></h3>
<p>This measure attempts to tell us whether, after creating a profile, the user returned at any point in the next 6 days. This is meant to measure whether a product is successful in engaging users early on: at present, many users churn after the first run and we want to measure whether efforts to make them stick around are succeeding.</p>
<p>This metric is used for all mobile applications, excluding Fenix and Firefox Preview, and is a top-level OKR inside Mozilla for 2020. It can apply both at the level of a specific product (e.g. Firefox iOS, Lockwise for Android) and as an aggregate measure across all mobile products and devices. This metric is <em>not</em> used for Firefox desktop.</p>
<h3 id="details-1"><a class="header" href="#details-1">Details</a></h3>
<p>The day 2-7 activation metric is calculated as:</p>
<p>\[ \frac{\text{Activated New Profiles (day 2-7)}}{\text{New Profiles (day 1-7)}} \]</p>
<p>Activated New Profiles (day 2-7): Unique count of client ids who use the product at any point starting the day after they created a profile up to 6 days after.</p>
<p>New Profiles: Unique count of client ids with a given profile creation date. As not all initial pings are received exactly on the day of profile creation, we wait for 7 days after the profile creation date before establishing the New Profile cohort to ensure the data is complete.</p>
<p>For example, for a cohort with a profile creation date of Mar 1, a profile is considered activated in day 2-7 if it shows up in DAU at any time between Mar 2 and Mar 7.</p>
<h3 id="dashboards-2"><a class="header" href="#dashboards-2">Dashboards</a></h3>
<p>The <a href="https://datastudio.google.com/u/0/reporting/1L7dsFyqjT8XZHrYprYS-HCP5_k_gZGIb/page/0iERB">non-desktop day 2-7 dashboard</a> tracks this measure.</p>
<h3 id="tables-2"><a class="header" href="#tables-2">Tables</a></h3>
<p>You can calculate this measure via the <code>firefox_nondesktop_day_2_7_activation</code> table in BigQuery. Here is a sample query:</p>
<pre><code class="language-sql">SELECT
cohort_date,
product,
SUM(day_2_7_activated) as day_2_7_activated,
SUM(new_profiles) as new_profiles,
SAFE_DIVIDE(SUM(day_2_7_activated), SUM(new_profiles)) as day_2_7_activation
FROM
`moz-fx-data-shared-prod.telemetry.firefox_nondesktop_day_2_7_activation`
WHERE
cohort_date = &quot;2020-03-01&quot;
GROUP BY 1,2
ORDER BY 1
</code></pre>
<p><a href="https://sql.telemetry.mozilla.org/queries/72054/source"><code>STMO#72054</code></a></p>
<hr />
<p>Reference below.</p>
<p>{{Metric name}}</p>
<p>TL;DR: two sentence max.</p>
<p>E.g: MAU counts the number of distinct users we see over a 28-day period. Desktop and Mobile MAU are both corporate KPIs for 2020.</p>
<ul>
<li>Overview:</li>
<li>What the metric measures</li>
<li>Calculation:</li>
<li>Definitions for both Mobile and Desktop, if applicable.</li>
<li>What is the easiest way to calculate this metric? E.g. MAU over <code>clients_last_seen</code>.</li>
<li>At least one working definition</li>
<li>Link to a scheduled Redash query (link with <code>stmocli</code>?)</li>
<li>Possibly an API-linked graph from STMO</li>
<li>If it's non-obvious, examples for how to stratify. E.g. calculating MAU from <code>clients_daily</code></li>
<li>Common Issues: Failures and Gotchas</li>
<li>Resources</li>
<li>Link to the STMO query from Definition section</li>
<li>Notable dashboards for the metric</li>
<li>Similar metrics</li>
</ul>
<hr />
<p>Next metric</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/metrics/metrics.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="usage-criteria"><a class="header" href="#usage-criteria">Usage Criteria</a></h1>
<p>Here we provide definitions and information about our standard definitions of usage of our products.</p>
<p>Table of contents with links to all products and criteria.</p>
<hr />
<h2 id="product-name"><a class="header" href="#product-name">{{Product Name}}</a></h2>
<h3 id="usage-criteria-name"><a class="header" href="#usage-criteria-name">{{Usage Criteria Name}}</a></h3>
<p>TL;DR: two sentence max.</p>
<ul>
<li>Overview:</li>
<li>What type of usage we want to capture</li>
<li>Calculation:</li>
<li>Definitions of what user has to do to qualify</li>
<li>Definitions of what telemetry must be sent to qualify.</li>
<li>At least one working definition</li>
<li>Link to a scheduled STMO query (link with <code>stmocli</code>?)</li>
<li>Possibly an API-linked graph from STMO</li>
<li>Common Issues: Failures and Gotchas</li>
<li>Resources</li>
<li>Similar criteria</li>
</ul>
<h3 id="next-usage-criteria-name"><a class="header" href="#next-usage-criteria-name">{{Next Usage Criteria Name}}</a></h3>
<hr />
<h2 id="next-product-name"><a class="header" href="#next-product-name">{{Next Product Name}}</a></h2>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/metrics/usage.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="slicing-dimensions"><a class="header" href="#slicing-dimensions">Slicing Dimensions</a></h1>
<p>Here we provide definitions and information about our standard slicing dimensions.</p>
<p>Table of contents with links to dimensions.</p>
<hr />
<h2 id="dimension-name"><a class="header" href="#dimension-name">{{Dimension Name}}</a></h2>
<p>TL;DR: two sentence max.</p>
<ul>
<li>Overview:</li>
<li>What segmentation do we want to capture</li>
<li>Calculation:</li>
<li>Definitions of what user has to do to qualify</li>
<li>Definitions of what telemetry must be sent to qualify.</li>
<li>At least one working definition</li>
<li>Link to a scheduled STMO query (link with <code>stmocli</code>?)</li>
<li>Possibly an API-linked graph from STMO</li>
<li>Common Issues: Failures and Gotchas</li>
<li>Resources</li>
<li>Similar dimensions</li>
</ul>
<hr />
<h2 id="next-dimension-name"><a class="header" href="#next-dimension-name">{{Next Dimension Name}}</a></h2>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/metrics/dimensions.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="metric-policies"><a class="header" href="#metric-policies">Metric Policies</a></h1>
<p>Metric standards are maintained by the Metrics Standardization Group (MSG) (link to contact information).</p>
<h2 id="how-to-get-more-information"><a class="header" href="#how-to-get-more-information">How to get more information</a></h2>
<h2 id="how-to-propose-a-new-standard-metric-criteria-or-dimensions"><a class="header" href="#how-to-propose-a-new-standard-metric-criteria-or-dimensions">How to propose a new standard metric, criteria, or dimensions</a></h2>
<h2 id="how-to-propose-a-change-to-a-standards"><a class="header" href="#how-to-propose-a-change-to-a-standards">How to propose a change to a standards</a></h2>
<h2 id="recommendations-around-citing-standards-for-presentations-to-senior-audiences"><a class="header" href="#recommendations-around-citing-standards-for-presentations-to-senior-audiences">Recommendations around citing standards for presentations to senior audiences</a></h2>
<p>Something about how all metrics presented should have references to definitions here or explanations as to why a nonstandard metric was used.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/metrics/policy.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h2 id="legacy-census-metrics"><a class="header" href="#legacy-census-metrics">Legacy Census Metrics</a></h2>
<blockquote>
<p><strong>⚠</strong> The information in this document is obsolete. This content was originally included in the <a href="https://mozilla-private-report.protosaur.dev/smoot-existing-metrics/book/05_overview.html">Project Smoot existing metrics report</a> (Mozilla internal link).</p>
</blockquote>
<p>ADI and DAU are oft-discussed censuses. This chapter discusses their history and definition.</p>
<h3 id="adi--active-daily-installs-blocklist-fetches"><a class="header" href="#adi--active-daily-installs-blocklist-fetches">ADI / Active Daily Installs (blocklist fetches)</a></h3>
<blockquote>
<p><strong>⚠</strong> The Blocklist mechanism described below is no longer used and has been replaced with <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1257565">remote settings</a>. The content is left verbatim for historical reference.</p>
</blockquote>
<p>ADI, one of Firefox's oldest client censuses, is computed as the number
of conforming requests to the Firefox
<a href="https://wiki.mozilla.org/Blocklisting">blocklist</a> endpoint. ADI data is
available since July 13, 2008.</p>
<p>It is not possible to opt out of the blocklist using the Firefox UI, but
users can disable the update mechanism by changing preference values.</p>
<p>A blocklist is shipped in each release and updated when Firefox notices
that more than 24 hours have elapsed since the last update.</p>
<p>The blocklist request does not contain the telemetry <code>client_id</code> or any
other persistent identifiers. Some data about the install are provided
as URI parameters:</p>
<ul>
<li>App ID</li>
<li>App version</li>
<li>Product name</li>
<li>Build ID</li>
<li>Build target</li>
<li>Locale</li>
<li>Update channel</li>
<li>OS version</li>
<li>Distribution</li>
<li>Distribution version</li>
<li>Number of pings sent by this client for this version of Firefox
(stored in the pref <code>extensions.blocklist.pingCountVersion</code>)</li>
<li>Total ping count (stored in the pref
<code>extensions.blocklist.pingCountTotal</code>)</li>
<li>Number of full days since last ping</li>
</ul>
<p>so subsets of ADI may be queried along these dimensions.</p>
<p>The blocklist is kept up-to-date locally using the <code>UpdateTimerManager</code>
facility; the update is scheduled in a <a href="https://searchfox.org/mozilla-central/rev/b36e97fc776635655e84f2048ff59f38fa8a4626/toolkit/mozapps/extensions/extensions.manifest#1">manifest</a> and performed by
<a href="https://searchfox.org/mozilla-central/rev/b36e97fc776635655e84f2048ff59f38fa8a4626/toolkit/mozapps/extensions/Blocklist.jsm#569"><code>Blocklist#notify</code></a>.</p>
<p>Upon browser startup, after a delay (30 seconds by default),
<code>UpdateTimerManager</code> checks whether any of its scheduled tasks are
ready. At each wakeup, the single most-overdue task is triggered, if one
exists. <code>UpdateTimerManager</code> then sleeps at least two minutes or until
the next task is scheduled.</p>
<p>Failures are ignored.</p>
<p>The raw data is available in BigQuery (see <a href="https://sql.telemetry.mozilla.org/queries/66481"><code>STMO#66481</code></a>).</p>
<p>Telemetry only reports whether blocklist checking is enabled or disabled
on the client; there is no data in telemetry about blocklist fetches,
age, or update failures.</p>
<h3 id="dau--daily-active-users"><a class="header" href="#dau--daily-active-users">DAU / Daily Active Users</a></h3>
<blockquote>
<p><strong>⚠</strong> This description of DAU is not authoritative; please see the <a href="concepts/../metrics/metrics.html#daily-active-users-dau">DAU definition in metrics</a> for the canonical definition.</p>
</blockquote>
<p>Firefox DAU is currently computed as the number of unique <code>client_id</code>s
observed in <code>main</code> pings received on a calendar day. The DAU count
excludes users who have <a href="https://support.mozilla.org/en-US/kb/share-data-mozilla-help-improve-firefox">opted out of telemetry</a>.</p>
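<p>For illustration only (the canonical definition referenced above is authoritative), this census corresponds roughly to counting distinct <code>client_id</code>s in <code>main</code> pings per submission day. In practice a derived table such as <code>clients_daily</code> is used rather than scanning raw pings:</p>
<pre><code class="language-sql">-- Rough sketch of the legacy census over raw main pings (expensive; shown
-- only to make the definition concrete).
SELECT
  DATE(submission_timestamp) AS submission_date,
  COUNT(DISTINCT client_id) AS dau
FROM
  `moz-fx-data-shared-prod.telemetry.main`
WHERE
  DATE(submission_timestamp) = '2020-06-01'
GROUP BY
  submission_date
</code></pre>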
<p>Each <code>main</code> ping describes a single subsession of browser activity.</p>
<p>When and how a ping is sent depends on the reason the subsession ends:</p>
<div id="tbl:pingreasons">
<table style="width:99%;">
<caption>Table 1: When <code>main</code> pings are sent, and why.</caption>
<colgroup>
<col style="width: 9%" />
<col style="width: 7%" />
<col style="width: 7%" />
<col style="width: 75%" />
</colgroup>
<thead>
<tr class="header">
<th style="text-align: left;">Reason</th>
<th style="text-align: left;">Trigger</th>
<th style="text-align: left;">Percent of subsessions [1]</th>
<th style="text-align: left;">Mechanism</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td style="text-align: left;"><code>shutdown</code></td>
<td style="text-align: left;">Browser is closed</td>
<td style="text-align: left;">77%</td>
<td style="text-align: left;">For Firefox 55 or later, sent by <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/internals/pingsender.html"><code>Pingsender</code></a> on browser close unless the OS is shutting down. Otherwise, sent by <a href="https://searchfox.org/mozilla-central/rev/532e4b94b9e807d157ba8e55034aef05c1196dc9/toolkit/components/telemetry/app/TelemetrySend.jsm#677">`TelemetrySendImpl.setup`</a> on the following browser launch.</td>
</tr>
<tr class="even">
<td style="text-align: left;"><code>environment-change</code></td>
<td style="text-align: left;">The telemetry environment changed</td>
<td style="text-align: left;">13%</td>
<td style="text-align: left;">Sent when change is detected by <a href="https://searchfox.org/mozilla-central/rev/532e4b94b9e807d157ba8e55034aef05c1196dc9/toolkit/components/telemetry/pings/TelemetrySession.jsm#1510">`TelemetrySession._onEnvironmentChange`</a></td>
</tr>
<tr class="odd">
<td style="text-align: left;"><code>daily</code></td>
<td style="text-align: left;">more than 24 hours have elapsed since the last ping was sent and the time is local midnight</td>
<td style="text-align: left;">8%</td>
<td style="text-align: left;">Sent at local midnight after a random 0-60 min delay</td>
</tr>
<tr class="even">
<td style="text-align: left;"><code>aborted-session</code></td>
<td style="text-align: left;">A session terminates uncleanly (e.g. crash or lost power)</td>
<td style="text-align: left;">3%</td>
<td style="text-align: left;">Sent by the browser on the next launch; the payload to send is <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/concepts/crashes.html">written to disk every 5 minutes</a> during an active session and removed by a clean shutdown</td>
</tr>
</tbody>
</table>
</div>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/concepts/censuses.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="obsolete-datasets"><a class="header" href="#obsolete-datasets">Obsolete Datasets</a></h1>
<p>These datasets are no longer updated or maintained. Please reach out to the Data Platform team
if you think your needs are best met by an obsolete dataset.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="activity-stream-datasets"><a class="header" href="#activity-stream-datasets">Activity Stream Datasets</a></h1>
<p>This article describes the various BigQuery tables Mozilla uses to store Activity Stream data, along with some examples of how to access them.</p>
<h2 id="table-of-contents-22"><a class="header" href="#table-of-contents-22">Table of Contents</a></h2>
<ul>
<li><a href="datasets/obsolete/activity-stream/reference.html#what-is-activity-stream">What is Activity Stream?</a></li>
<li><a href="datasets/obsolete/activity-stream/reference.html#activity-stream-pings">Activity Stream Pings</a></li>
<li><a href="datasets/obsolete/activity-stream/reference.html#accessing-activity-stream-data">Accessing Activity Stream Data</a>
<ul>
<li><a href="datasets/obsolete/activity-stream/reference.html#activity_stream"><code>activity_stream</code></a></li>
<li><a href="datasets/obsolete/activity-stream/reference.html#messaging_system"><code>messaging_system</code></a></li>
</ul>
</li>
<li><a href="datasets/obsolete/activity-stream/reference.html#gotchas-and-caveats">Gotchas and Caveats</a></li>
<li><a href="datasets/obsolete/activity-stream/reference.html#examples">Examples</a>
<ul>
<li><a href="datasets/obsolete/activity-stream/reference.html#sessions-per-client_id">Sessions per <code>client_id</code></a></li>
<li><a href="datasets/obsolete/activity-stream/reference.html#topsite-clicks-and-highlights-clicks">Topsite clicks and Highlights clicks</a></li>
<li><a href="datasets/obsolete/activity-stream/reference.html#topsite-tile-dismissals-sponsored-and-non-sponsored"><code>Topsite</code> Tile Dismissals: Sponsored and Non-Sponsored</a></li>
<li><a href="datasets/obsolete/activity-stream/reference.html#snippet-impressions-blocks-clicks-and-dismissals">Snippet impressions, blocks, clicks, and dismissals</a></li>
</ul>
</li>
</ul>
<h2 id="what-is-activity-stream"><a class="header" href="#what-is-activity-stream">What is Activity Stream?</a></h2>
<p>Activity Stream is the Firefox module which manages the in product content pages for Firefox:</p>
<ul>
<li><code>about:home</code></li>
<li><code>about:newtab</code></li>
<li><code>about:welcome</code>
<ul>
<li><a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1448918">starting with Firefox 62</a></li>
</ul>
</li>
<li><code>Snippets</code></li>
<li><code>CFR</code></li>
<li><code>Onboarding</code></li>
<li><code>What's new panel</code></li>
<li><code>Moments pages</code></li>
</ul>
<p>The Activity Stream team has implemented data collection in and around these pages. This data has some overlap with the standard Firefox Telemetry system; however, it is a custom system, designed and maintained by that team.</p>
<p>For specific questions about this data, reach out to the <code>#fx-messaging-system</code> Slack channel directly.</p>
<h2 id="activity-stream-pings"><a class="header" href="#activity-stream-pings">Activity Stream Pings</a></h2>
<p>This data is measured in various custom pings that are sent via PingCentre (different from <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/internals/pingsender.html">Pingsender</a>).</p>
<ul>
<li><a href="https://firefox-source-docs.mozilla.org/browser/components/newtab/docs/v2-system-addon/data_events.html">Activity Stream Pings: <code>data_events.md</code></a></li>
</ul>
<h2 id="accessing-activity-stream-data"><a class="header" href="#accessing-activity-stream-data">Accessing Activity Stream Data</a></h2>
<p>Activity Stream pings are stored in BigQuery (like other Firefox Telemetry). There are two datasets: <code>activity_stream</code> and <code>messaging_system</code>.</p>
<h4 id="activity_stream"><a class="header" href="#activity_stream"><code>activity_stream</code></a></h4>
<p>The <code>activity_stream</code> dataset contains the following tables:</p>
<ul>
<li><code>events</code> stores user interactions with the <code>about:home</code> and <code>about:newtab</code> pages</li>
<li><code>sessions</code> stores sessions of <code>about:home</code> and <code>about:newtab</code> pages</li>
<li><code>impression_stats</code> stores impression/click/block events for the Pocket recommendations on the <code>about:home</code> and <code>about:newtab</code> pages</li>
<li><code>spoc_fills</code> stores &quot;Pocket Sponsored&quot; recommendation related pings</li>
</ul>
<h4 id="messaging_system"><a class="header" href="#messaging_system"><code>messaging_system</code></a></h4>
<p>The <code>messaging_system</code> dataset contains the following tables:</p>
<ul>
<li><code>cfr</code> stores metrics on user interactions with the CFR (Contextual Feature Recommendation) system</li>
<li><code>moments</code> stores &quot;Moments Pages&quot; related pings</li>
<li><code>onboarding</code> stores metrics on user interactions with onboarding features</li>
<li><code>snippets</code> stores impression/click/dismissal metrics for Firefox Snippets</li>
<li><code>whats_new_panel</code> stores &quot;What's New Panel&quot; related pings</li>
<li><code>undesired_events</code> stores system health related events</li>
</ul>
<h2 id="gotchas-and-caveats"><a class="header" href="#gotchas-and-caveats">Gotchas and Caveats</a></h2>
<p>Since this data collection isn't collected or maintained through our standard Telemetry API, there are a number of &quot;gotchas&quot; to keep in mind when working on this data:</p>
<ul>
<li>
<p><strong>Ping send conditions</strong>: Activity Stream pings have different send conditions, both from Telemetry pings as well as from each other. For example, <a href="https://firefox-source-docs.mozilla.org/browser/components/newtab/docs/v2-system-addon/data_events.html#session-end-pings">AS Session Pings</a> get sent by profiles that entered an Activity Stream session, at the end of that session, regardless of how long that session is. Compare this to <code>main</code> pings, which get sent by all Telemetry enabled profiles upon subsession end (browser shutdown, environment change, or local midnight cutoff).</p>
<p>Due to these inconsistencies, using data from different sources can be tricky. For example, if we wanted to know how much of DAU (from <code>main</code> pings) had a custom <code>about:home</code> page (available in AS Health Pings), joining on <code>client_id</code> and a date field would only provide information on profiles that started the session on that same day (active profiles on multi-day sessions would be excluded).</p>
</li>
<li>
<p><strong>Population covered</strong>: In addition to the usual considerations when looking at a measurement (in what version of Firefox did this measurement start getting collected? In what channels is it enabled in? etc.), when working with this data, there are additional Activity Stream specific conditions to consider when deciding &quot;who is eligible to send this ping?&quot;</p>
<p>For example, Pocket recommendations are only enabled in the US, CA, UK, and DE countries, for profiles that are on en-US, en-CA, en-GB, and de locales. Furthermore, users can set their <code>about:home</code> and <code>about:newtab</code> page to non-Activity Stream pages. This information can be important when deciding denominators for certain metrics.</p>
</li>
<li>
<p><strong>Different ping types in the same table</strong>: The tables in the <code>activity_stream</code> namespace can contain multiple types of pings. For example, the <code>events</code> table contains both <a href="https://firefox-source-docs.mozilla.org/browser/components/newtab/docs/v2-system-addon/data_events.html#page-takeover-ping">AS Page Takeover pings</a> as well as <a href="https://firefox-source-docs.mozilla.org/browser/components/newtab/docs/v2-system-addon/data_events.html#user-event-pings">AS User Event pings</a>.</p>
</li>
<li>
<p><strong>Null handling</strong>: Some fields in the Activity Stream data encode nulls with a <code>'N/A'</code> string or a <code>-1</code> value.</p>
</li>
<li>
<p><strong>Changes in ping behaviors</strong>: These pings continue to undergo development and the behavior as well as possible values for a given ping seem to change over time. For example, older versions of the event pings for clicking on a Topsite do not seem to report <code>card_types</code> and <code>icon_types</code>, while newer versions do. Caution is advised.</p>
</li>
<li>
<p><strong>Pocket data</strong>: Data related to Pocket interaction and usage in the <code>about:home</code> and <code>about:newtab</code> pages is sent to Pocket via this data collection and pipeline. However, for privacy reasons, the <code>client_id</code> is omitted from any ping that includes Pocket recommendation identifiers; such pings instead carry a separate unique per-user identifier, <code>impression_id</code>. Pocket user interactions such as clicks, dismissals, and saves to Pocket are still reported as regular events with the <code>client_id</code>, as long as they don't contain the Pocket recommendation identifiers.</p>
</li>
</ul>
<h2 id="examples-1"><a class="header" href="#examples-1">Examples</a></h2>
<h3 id="sessions-per-client_id"><a class="header" href="#sessions-per-client_id">Sessions per <code>client_id</code></a></h3>
<p>Note: only includes <code>client_ids</code> that completed an Activity Stream session that day.</p>
<pre><code class="language-sql">SELECT
client_id,
DATE(submission_timestamp) AS date,
count(DISTINCT session_id) as num_sessions
FROM
`moz-fx-data-shared-prod.activity_stream.sessions`
WHERE
DATE(submission_timestamp) = '20200601'
GROUP BY
1
</code></pre>
<h3 id="topsite-clicks-and-highlights-clicks"><a class="header" href="#topsite-clicks-and-highlights-clicks">Topsite clicks and Highlights clicks</a></h3>
<pre><code class="language-sql">SELECT
client_id,
DATE(submission_timestamp) AS date,
session_id,
page,
source,
action_position,
experiments
FROM
`moz-fx-data-shared-prod.activity_stream.events`
WHERE
source in ('TOP_SITES', 'HIGHLIGHTS')
AND event = 'CLICK'
DATE(submission_timestamp) = '20200601'
</code></pre>
<h3 id="topsite-tile-dismissals-sponsored-and-non-sponsored"><a class="header" href="#topsite-tile-dismissals-sponsored-and-non-sponsored"><code>Topsite</code> Tile Dismissals: Sponsored and Non-Sponsored</a></h3>
<p>The <code>Topsite</code> Tile <code>Dismiss</code> action corresponds to the <code>BLOCK</code> event, which can be taken on a Sponsored or Non-Sponsored Tile (see the <a href="https://firefox-source-docs.mozilla.org/browser/components/newtab/docs/v2-system-addon/data_events.html#blocking-a-site">reference</a>). When applied to a Non-Sponsored Tile, the <code>BLOCK</code> event prevents the Tile from appearing in <code>TopSites</code> but leaves the browsing history as is. The <code>DELETE</code> event is fired when the user selects <code>Delete from History</code> and is only applicable to Non-Sponsored Tiles. This action deletes the URL from the client's complete browser history and prevents the Tile from appearing in their <code>Topsites</code>. <code>DELETE</code> doesn't apply to Sponsored Tiles, as these are not generated by the user's browsing history.</p>
<pre><code class="language-sql">SELECT
DATE(submission_timestamp) AS date,
count(*)
FROM
`moz-fx-data-shared-prod.activity_stream.events`
WHERE
source = 'TOP_SITES'
AND event = 'BLOCK'
AND DATE(submission_timestamp) = '20220101'
AND value LIKE '%&quot;card_type&quot;:&quot;spoc&quot;%'
GROUP BY 1
ORDER BY 1
</code></pre>
<h3 id="snippet-impressions-blocks-clicks-and-dismissals"><a class="header" href="#snippet-impressions-blocks-clicks-and-dismissals">Snippet impressions, blocks, clicks, and dismissals</a></h3>
<p>Note: the snippet message that a record corresponds to can be identified by its <code>message_id</code> (check with Marketing for the published snippet recipes).</p>
<pre><code class="language-sql">SELECT
client_id,
DATE(submission_timestamp) AS date,
event,
message_id,
event_context,
experiments
FROM
`moz-fx-data-shared-prod.messaging_system.snippets`
WHERE
DATE(submission_timestamp) = '20200601'
</code></pre>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete/activity-stream/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="attitudes-daily"><a class="header" href="#attitudes-daily">Attitudes Daily</a></h1>
<ul>
<li><a href="datasets/obsolete/attitudes_daily/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/obsolete/attitudes_daily/reference.html#contents">Contents</a></li>
<li><a href="datasets/obsolete/attitudes_daily/reference.html#accessing-the-data">Accessing the Data</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/attitudes_daily/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/obsolete/attitudes_daily/reference.html#example-queries">Example Queries</a>
<ul>
<li><a href="datasets/obsolete/attitudes_daily/reference.html#distribution-of-user-responses-to-the-internet-is-open-and-accessible-to-all-over-time">Distribution of user responses to &quot;The internet is open and accessible to all&quot; over time</a></li>
<li><a href="datasets/obsolete/attitudes_daily/reference.html#distribution-of-responses-to-i-was-able-to-get-what-i-wanted-from-using-the-internet-today-by-default-search-engine">Distribution of responses to &quot;I was able to get what I wanted from using the Internet today&quot; by default search engine.</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/attitudes_daily/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/obsolete/attitudes_daily/reference.html#schema">Schema</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/attitudes_daily/reference.html#code-reference">Code Reference</a></li>
</ul>
<h1 id="introduction-30"><a class="header" href="#introduction-30">Introduction</a></h1>
<p>The <code>attitudes_daily</code> table is a Telemetry instantiation of user responses to the <a href="https://qsurvey.mozilla.com/collab/daily-attitude-survey">Daily Attitudes Survey (DAS)</a> over time.
It is joined to <code>clients_daily</code> using <code>client_id</code> and <code>submission_date</code>.</p>
<h4 id="contents-11"><a class="header" href="#contents-11">Contents</a></h4>
<p>Most Firefox surveys are point-in-time without longitudinal insights.
The DAS is completed by ~300 Firefox users every day, allowing us to measure long-term attitudinal trends combined with users' corresponding Telemetry attributes.</p>
<h4 id="accessing-the-data-11"><a class="header" href="#accessing-the-data-11">Accessing the Data</a></h4>
<p>The <code>attitudes_daily</code> table is accessible through STMO using the
<code>Telemetry (BigQuery)</code> data source.
The full table name is <code>moz-fx-data-shared-prod.telemetry.attitudes_daily</code>.</p>
<p>Here's an <a href="https://sql.telemetry.mozilla.org/queries/63937/source#163424">example query (<code>STMO#163424</code>)</a>.</p>
<h1 id="data-reference-21"><a class="header" href="#data-reference-21">Data Reference</a></h1>
<p>The DAS shows the user the following four statements, all having the same possible responses of &quot;Agree&quot;, &quot;Disagree&quot; or &quot;Neutral or not sure&quot;.
Each statement has an identifier in the <code>attitudes_daily</code> table, stored in the <code>question_key</code> field (shown in parentheses after each statement below):</p>
<ul>
<li>The internet is open and accessible to all (<code>internet_accessible</code>)</li>
<li>I trust Firefox to help me with my online privacy (<code>trust_firefox</code>)</li>
<li>All the sites I've visited recently have worked; none of them seem broken (<code>sites_work</code>)</li>
<li>Using the internet helped me meet my goals today (<code>met_goals</code>)</li>
</ul>
<p>User responses are processed and mapped to their <code>question_key</code>s in <a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/telemetry_derived/surveygizmo_daily_attitudes/import_responses.py">this script</a>, creating the table <code>moz-fx-data-shared-prod.external.survey_gizmo_daily_attitudes</code>, which is subsequently joined to <code>clients_daily</code>.</p>
<h2 id="example-queries-18"><a class="header" href="#example-queries-18">Example Queries</a></h2>
<h4 id="distribution-of-user-responses-to-the-internet-is-open-and-accessible-to-all-over-time"><a class="header" href="#distribution-of-user-responses-to-the-internet-is-open-and-accessible-to-all-over-time">Distribution of user responses to &quot;The internet is open and accessible to all&quot; over time</a></h4>
<pre><code class="language-sql">SELECT
submission_date,
value,
count(DISTINCT client_id) as n
FROM `moz-fx-data-shared-prod.telemetry.attitudes_daily`
WHERE
question_key = 'internet_accessible'
AND value IS NOT NULL
GROUP BY 1, 2
</code></pre>
<iframe src="https://sql.telemetry.mozilla.org/embed/query/65079/visualization/165757?api_key=oSjO27fGmpCsnBXBEhaysRVrLZpX1SKMCpYcxA5h&" width="100%" height="530" frameborder="0" scrolling="no"></iframe>
<h4 id="distribution-of-responses-to-i-was-able-to-get-what-i-wanted-from-using-the-internet-today-by-default-search-engine"><a class="header" href="#distribution-of-responses-to-i-was-able-to-get-what-i-wanted-from-using-the-internet-today-by-default-search-engine">Distribution of responses to &quot;I was able to get what I wanted from using the Internet today&quot; by default search engine.</a></h4>
<pre><code class="language-sql">SELECT
value,
CASE
WHEN STARTS_WITH(default_search_engine, 'google') THEN 'Google'
WHEN STARTS_WITH(default_search_engine, 'ddg') THEN 'DuckDuckGo'
WHEN STARTS_WITH(default_search_engine, 'bing') THEN 'Bing'
ELSE 'Other'
END AS search_engine,
count(*) AS n
FROM
`moz-fx-data-shared-prod.telemetry.attitudes_daily`
WHERE
question_key = 'met_goals'
AND value IS NOT NULL
AND submission_date &gt; DATE_SUB(CURRENT_DATE, INTERVAL '7' DAY)
GROUP BY 1, 2
</code></pre>
<iframe src="https://sql.telemetry.mozilla.org/embed/query/63957/visualization/163467?api_key=gzCopzybDta4t3JGkx2urB9MQa67akAehJUybVdW&" width="100%" height="570" frameborder="0" scrolling="no"></iframe>
<h2 id="scheduling-27"><a class="header" href="#scheduling-27">Scheduling</a></h2>
<p>This dataset is updated daily via the
<a href="https://github.com/mozilla/telemetry-airflow">telemetry-airflow</a> infrastructure.
The job runs as part of the <a href="https://github.com/mozilla/telemetry-airflow/blob/master/dags/attitudes_daily.py"><code>attitudes_daily</code> DAG</a>.</p>
<h2 id="schema-19"><a class="header" href="#schema-19">Schema</a></h2>
<p>The data is partitioned by <code>submission_date</code>.</p>
<p>As of 2019-09-15, the current version of the <code>attitudes_daily</code> dataset is <code>v1</code>.</p>
<h1 id="code-reference-24"><a class="header" href="#code-reference-24">Code Reference</a></h1>
<p>This dataset is generated by
<a href="https://github.com/mozilla/bigquery-etl/blob/master/sql/moz-fx-data-shared-prod/telemetry_derived/attitudes_daily_v1/query.sql"><code>bigquery-etl</code></a>.
Refer to this repository for information on how to run or augment the dataset. You can view the task run status in Airflow <a href="https://workflow.telemetry.mozilla.org/tree?dag_id=attitudes_daily">here</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete/attitudes_daily/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="churn"><a class="header" href="#churn">Churn</a></h1>
<blockquote>
<p>As of 2019-08-21, this dataset has been deprecated and is no longer
maintained. See <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1561048">Bug 1561048</a>
for historical sources.</p>
</blockquote>
<ul>
<li><a href="datasets/obsolete/churn/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/obsolete/churn/reference.html#content">Content</a></li>
<li><a href="datasets/obsolete/churn/reference.html#background-and-caveats">Background and Caveats</a></li>
<li><a href="datasets/obsolete/churn/reference.html#accessing-the-data">Accessing the Data</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/churn/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/obsolete/churn/reference.html#example-queries">Example Queries</a></li>
<li><a href="datasets/obsolete/churn/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/obsolete/churn/reference.html#schema">Schema</a></li>
<li><a href="datasets/obsolete/churn/reference.html#code-reference">Code Reference</a></li>
</ul>
</li>
</ul>
<h1 id="introduction-31"><a class="header" href="#introduction-31">Introduction</a></h1>
<p>The churn dataset tracks the 7-day churn rate of telemetry profiles. This
dataset is generally used for analyzing cohort churn across segments and time.</p>
<h4 id="content-3"><a class="header" href="#content-3">Content</a></h4>
<p>Churn is the rate of attrition defined by <code>(clients seen in week N)/(clients seen in week 0)</code>
for groups of clients with some shared attributes. A group of clients with
shared attributes is called a <em>cohort</em>. The cohorts in this dataset are created
every week and can be tracked over time using the <code>acquisition_date</code> and the
weeks since acquisition or <code>current_week</code>.</p>
<p>The following example demonstrates the current logic for generating this
dataset. Each column represents the days since some arbitrary starting date.</p>
<div class="table-wrapper"><table><thead><tr><th>client</th><th>00</th><th>01</th><th>02</th><th>03</th><th>04</th><th>05</th><th>06</th><th>07</th><th>08</th><th>09</th><th>10</th><th>11</th><th>12</th><th>13</th><th>14</th></tr></thead><tbody>
<tr><td>A</td><td>X</td><td></td><td></td><td></td><td></td><td></td><td></td><td>X</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>
<tr><td>B</td><td></td><td>X</td><td>X</td><td>X</td><td>X</td><td>X</td><td>X</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>
<tr><td>C</td><td>X</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td>X</td></tr>
</tbody></table>
</div>
<p>All three clients are part of the same cohort. Client A is retained for weeks 0
and 1 since there is activity in both periods. A client only needs to show up
once in the period to be counted as retained. Client B is acquired in week 0 and
is active frequently but does not appear in following weeks. Client B is
considered churned on week 1. However, a client that is churned can become
retained again. Client C is considered churned on week 1 but retained on week 2.</p>
<p>The following table summarizes the daily activity above into a view where every
column represents the current week since the acquisition date.</p>
<div class="table-wrapper"><table><thead><tr><th>client</th><th>0</th><th>1</th><th>2</th></tr></thead><tbody>
<tr><td>A</td><td>X</td><td>X</td><td></td></tr>
<tr><td>B</td><td>X</td><td></td><td></td></tr>
<tr><td>C</td><td>X</td><td></td><td>X</td></tr>
</tbody></table>
</div>
<p>The clients are then grouped into cohorts by attributes. An attribute describes
a property of the cohort, such as the country of origin or the binary
distribution channel. Each group also contains descriptive aggregates of
engagement. Each metric describes the activity of a cohort, such as its size and
overall usage, at a given point in time.</p>
<h4 id="background-and-caveats-11"><a class="header" href="#background-and-caveats-11">Background and Caveats</a></h4>
<p>The original concept for churn is captured in <a href="https://mana.mozilla.org/wiki/display/FIREFOX/Project%3A+Firefox+Churn+v1.0">this Mana
page</a>.
The original derived data-set was created in <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1198537">bug
1198537</a>. The first
<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1389230">major revision (<code>v2</code>)</a> of
this data-set added attribution, search, and uri counts. The second <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1389231">major
revision (<code>v3</code>)</a> included
additional clients through the <code>new-profile</code> ping and adjusted the collection
window from 10 to 5 days.</p>
<ul>
<li>Each row in this dataset describes a unique segment of users
<ul>
<li>The number of rows is exponential with the number of dimensions</li>
<li>New fields should be added sparingly to account for data-set size</li>
</ul>
</li>
<li>The dataset lags by 10 days in order to account for submission latency
<ul>
<li>This value was determined to be the time needed for 99% of main pings to arrive at the
server. With the shutdown-ping sender, this has been reduced to 4 days.
However, <code>churn_v3</code> still tracks releases older than Firefox 55.</li>
</ul>
</li>
<li>The start of the period is fixed to Sundays. Once it has been aggregated, the
period cannot be shifted due to the way clients are counted.
<ul>
<li>A supplementary 1-day <code>retention</code> dataset using HyperLogLog for client
counts is available for counting over arbitrary retention periods and date
offsets. Additionally, calculating churn or retention over specific cohorts
is tractable in STMO with <code>main_summary</code> or <code>clients_daily</code> datasets.</li>
</ul>
</li>
</ul>
<h4 id="accessing-the-data-12"><a class="header" href="#accessing-the-data-12">Accessing the Data</a></h4>
<p><code>churn</code> is available in STMO under Athena and Presto. The data is also
available in parquet for consumption by columnar data engines at
<code>s3://telemetry-parquet/churn/v3</code>.</p>
<h1 id="data-reference-22"><a class="header" href="#data-reference-22">Data Reference</a></h1>
<h2 id="example-queries-19"><a class="header" href="#example-queries-19">Example Queries</a></h2>
<p>This section walks through a typical query to generate data suitable for
visualization.</p>
<div class="table-wrapper"><table><thead><tr><th>field</th><th>type</th><th>description</th></tr></thead><tbody>
<tr><td><code>cohort_date</code></td><td>common, attribute</td><td>The start date bucket of the cohort. This is the week the client was acquired.</td></tr>
<tr><td><code>elapsed_periods</code></td><td>common, attribute</td><td>The number of periods that have elapsed since the cohort date. In this dataset, the retention period is 7 days.</td></tr>
<tr><td><code>channel</code></td><td>attribute</td><td>Part of the release train model. An attribute that distinguishes cohorts.</td></tr>
<tr><td><code>geo</code></td><td>filter attribute</td><td>Country code. Used to filter out all countries other than the 'US'</td></tr>
<tr><td><code>n_profiles</code></td><td>metric</td><td>Count of users in a cohort. Use sum to aggregate.</td></tr>
</tbody></table>
</div>
<p>First the fields are extracted and aliased for consistency. <code>cohort_date</code> and
<code>elapsed_periods</code> are common to most retention queries and are useful concepts
for building on other datasets.</p>
<pre><code class="language-sql">WITH extracted AS (
SELECT acquisition_period AS cohort_date,
current_week AS elapsed_periods,
n_profiles,
channel,
geo
FROM churn
),
</code></pre>
<p>The extracted table is filtered down to the attributes of interest. The cohorts
of interest originate in the US and are in the release or beta channels. Note
that <code>channel</code> here is the concatenation of the normalized channel and the
funnelcake id. Only cohorts appearing after August 6, 2017 are chosen to be in
this population.</p>
<pre><code class="language-sql"> population AS (
SELECT channel,
cohort_date,
elapsed_periods,
n_profiles
FROM extracted
WHERE geo = 'US'
AND channel IN ('release', 'beta')
AND cohort_date &gt; '20170806'
-- filter out noise from clients with incorrect dates
AND elapsed_periods &gt;= 0
AND elapsed_periods &lt; 12
),
</code></pre>
<p>The number of profiles is aggregated by the cohort dimensions. The cohort
acquisition date and elapsed periods since acquisition are fundamental to cohort
analysis.</p>
<pre><code class="language-sql"> cohorts AS (
SELECT channel,
cohort_date,
elapsed_periods,
sum(n_profiles) AS n_profiles
FROM population
GROUP BY 1, 2, 3
),
</code></pre>
<p>The resulting table has the following structure; it is sorted by the first three columns for demonstration.</p>
<div class="table-wrapper"><table><thead><tr><th><code>channel</code></th><th><code>cohort_date</code></th><th><code>elapsed_periods</code></th><th><code>n_profiles</code></th></tr></thead><tbody>
<tr><td>release</td><td>20170101</td><td>0</td><td>100</td></tr>
<tr><td>release</td><td>20170101</td><td>1</td><td>90</td></tr>
<tr><td>release</td><td>20170101</td><td>2</td><td>80</td></tr>
<tr><td>...</td><td>...</td><td>...</td><td>...</td></tr>
<tr><td>beta</td><td>20170128</td><td>10</td><td>25</td></tr>
</tbody></table>
</div>
<p>Finally, retention is calculated as the number of profiles at each
<code>elapsed_period</code> relative to the number in the initial period. This data can be
imported into a pivot table for further analysis.</p>
<pre><code class="language-sql">results AS (
SELECT c.*,
iv.n_profiles AS total_n_profiles,
(0.0+c.n_profiles)*100/iv.n_profiles AS percentage_n_profiles
FROM cohorts c
JOIN (
SELECT *
FROM cohorts
WHERE elapsed_periods = 0
) iv ON (
c.cohort_date = iv.cohort_date
AND c.channel = iv.channel
)
)
</code></pre>
<div class="table-wrapper"><table><thead><tr><th><code>channel</code></th><th><code>cohort_date</code></th><th><code>elapsed_periods</code></th><th><code>n_profiles</code></th><th><code>total_n_profiles</code></th><th><code>percentage_n_profiles</code></th></tr></thead><tbody>
<tr><td>release</td><td>20170101</td><td>0</td><td>100</td><td>100</td><td>100.0</td></tr>
<tr><td>release</td><td>20170101</td><td>1</td><td>90</td><td>100</td><td>90.0</td></tr>
<tr><td>release</td><td>20170101</td><td>2</td><td>80</td><td>100</td><td>80.0</td></tr>
<tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td></tr>
<tr><td>beta</td><td>20170128</td><td>10</td><td>25</td><td>50</td><td>50.0</td></tr>
</tbody></table>
</div>
<p>Obtain the results.</p>
<pre><code class="language-sql">SELECT *
FROM results
</code></pre>
<p>You may consider visualizing the results using cohort graphs, line charts, or pivot
tables. See <a href="https://sql.telemetry.mozilla.org/dashboard/firefox-telemetry-retention-dataset-example-usage">Firefox Telemetry Retention: Dataset Example Usage</a>
for more examples.</p>
<h2 id="scheduling-28"><a class="header" href="#scheduling-28">Scheduling</a></h2>
<p>The aggregated churn data is updated weekly on Wednesday.</p>
<h2 id="schema-20"><a class="header" href="#schema-20">Schema</a></h2>
<p>As of 2017-10-15, the current version of <code>churn</code> is <code>v3</code> and has a schema as follows:</p>
<pre><code>root
|-- channel: string (nullable = true)
|-- geo: string (nullable = true)
|-- is_funnelcake: string (nullable = true)
|-- acquisition_period: string (nullable = true)
|-- start_version: string (nullable = true)
|-- sync_usage: string (nullable = true)
|-- current_version: string (nullable = true)
|-- current_week: long (nullable = true)
|-- source: string (nullable = true)
|-- medium: string (nullable = true)
|-- campaign: string (nullable = true)
|-- content: string (nullable = true)
|-- distribution_id: string (nullable = true)
|-- default_search_engine: string (nullable = true)
|-- locale: string (nullable = true)
|-- is_active: string (nullable = true)
|-- n_profiles: long (nullable = true)
|-- usage_hours: double (nullable = true)
|-- sum_squared_usage_hours: double (nullable = true)
|-- total_uri_count: long (nullable = true)
|-- unique_domains_count_per_profile: double (nullable = true)
</code></pre>
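<p>As a rough sketch of how these columns can be combined, the query below computes the mean and standard deviation of usage hours per acquisition week and channel. It assumes that <code>usage_hours</code> and <code>sum_squared_usage_hours</code> are totals over the <code>n_profiles</code> clients counted in each row; treat it as illustrative rather than a canonical recipe.</p>
<pre><code class="language-sql">-- Sketch: mean and (population) standard deviation of usage hours per cohort,
-- assuming usage_hours and sum_squared_usage_hours are totals over n_profiles.
SELECT acquisition_period,
       channel,
       sum(n_profiles) AS n_profiles,
       sum(usage_hours) / sum(n_profiles) AS mean_usage_hours,
       sqrt(greatest(
         sum(sum_squared_usage_hours) / sum(n_profiles)
           - pow(sum(usage_hours) / sum(n_profiles), 2),
         0.0)) AS stddev_usage_hours
FROM churn
GROUP BY 1, 2
</code></pre>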
<h2 id="code-reference-25"><a class="header" href="#code-reference-25">Code Reference</a></h2>
<p>The script for generating <code>churn</code> currently lives in
<a href="https://github.com/mozilla/python_mozetl/tree/9217335652cad46940a51c7c2784cc5c6d3a00f4"><code>mozilla/python_mozetl</code></a>. The job can
be found in
<a href="https://github.com/mozilla/python_mozetl/blob/9217335652cad46940a51c7c2784cc5c6d3a00f4/mozetl/engagement/churn/job.py#L1-L27"><code>mozetl/engagement/churn</code></a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete/churn/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="client-count-daily-reference"><a class="header" href="#client-count-daily-reference">Client Count Daily Reference</a></h1>
<blockquote>
<p>As of 2019-04-10, this dataset has been deprecated and is no longer maintained. Please use <a href="datasets/obsolete/client_count_daily//datasets/bigquery/clients_last_seen/reference.html"><code>clients_last_seen</code></a> instead. See <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1543518">Bug 1543518</a> for more information.</p>
</blockquote>
<ul>
<li><a href="datasets/obsolete/client_count_daily/reference.html#replacement">Replacement</a></li>
<li><a href="datasets/obsolete/client_count_daily/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/obsolete/client_count_daily/reference.html#content">Content</a></li>
<li><a href="datasets/obsolete/client_count_daily/reference.html#background-and-caveats">Background and Caveats</a></li>
<li><a href="datasets/obsolete/client_count_daily/reference.html#accessing-the-data">Accessing the Data</a></li>
<li><a href="datasets/obsolete/client_count_daily/reference.html#further-reading">Further Reading</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/client_count_daily/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/obsolete/client_count_daily/reference.html#example-queries">Example Queries</a>
<ul>
<li><a href="datasets/obsolete/client_count_daily/reference.html#compute-dau-for-non-windows-clients-for-the-last-week">Compute DAU for non-windows clients for the last week</a></li>
<li><a href="datasets/obsolete/client_count_daily/reference.html#compute-wau-by-channel-for-the-last-week">Compute WAU by Channel for the last week</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/client_count_daily/reference.html#caveats">Caveats</a></li>
<li><a href="datasets/obsolete/client_count_daily/reference.html#schema">Schema</a></li>
</ul>
</li>
</ul>
<h1 id="replacement"><a class="header" href="#replacement">Replacement</a></h1>
<p>We've moved to calculating exact user counts based on
<a href="datasets/obsolete/client_count_daily//datasets/bigquery/clients_last_seen/reference.html"><code>clients_last_seen</code></a>.</p>
<h1 id="introduction-32"><a class="header" href="#introduction-32">Introduction</a></h1>
<p>The <code>client_count_daily</code> dataset is useful for estimating user counts over a few
<a href="https://github.com/mozilla/telemetry-airflow/blob/adfce4a30895faa607ccf586b292b61ad68d8f75/jobs/client_count_daily_view.sh">pre-defined dimensions</a>.</p>
<p>The <code>client_count_daily</code> dataset is similar to the deprecated
<a href="datasets/obsolete/client_count_daily//datasets/batch_view/client_count/reference.html"><code>client_count</code> dataset</a>
except that it is aggregated by submission date rather than activity date.</p>
<h4 id="content-4"><a class="header" href="#content-4">Content</a></h4>
<p>This dataset includes columns for a dozen factors and an HLL variable.
The <code>hll</code> column contains a
<a href="https://en.wikipedia.org/wiki/HyperLogLog">HyperLogLog</a>
variable, which is an approximation to the exact count.
The factor columns include <strong>submission</strong> date and the dimensions listed
<a href="https://github.com/mozilla/telemetry-airflow/blob/adfce4a30895faa607ccf586b292b61ad68d8f75/jobs/client_count_daily_view.sh">here</a>.
Each row represents one combination of the factor columns.</p>
<h4 id="background-and-caveats-12"><a class="header" href="#background-and-caveats-12">Background and Caveats</a></h4>
<p>It's important to understand that the <code>hll</code> column is <strong>not a standard count</strong>.
The <code>hll</code> variable avoids double-counting users when aggregating over multiple days.
The HyperLogLog variable is a far more efficient way to count distinct elements of a set,
but comes with some complexity.
To find the cardinality of an HLL use <code>cardinality(cast(hll AS HLL))</code>.
To find the union of two HLLs over different dates, use <code>merge(cast(hll AS HLL))</code>.
The <a href="https://sql.telemetry.mozilla.org/queries/81/source#129">Firefox ER Reporting Query (<code>STMO#81</code>)</a>
is a good example to review.
Finally, Roberto has a relevant write-up
<a href="https://ravitillo.wordpress.com/2016/04/12/measuring-product-engagment-at-scale/">here</a>.</p>
<h4 id="accessing-the-data-13"><a class="header" href="#accessing-the-data-13">Accessing the Data</a></h4>
<p>The data is available in STMO.
Take a look at <a href="https://sql.telemetry.mozilla.org/queries/81/source#129"><code>STMO#81</code></a>.</p>
<h4 id="further-reading-1"><a class="header" href="#further-reading-1">Further Reading</a></h4>
<h1 id="data-reference-23"><a class="header" href="#data-reference-23">Data Reference</a></h1>
<h2 id="example-queries-20"><a class="header" href="#example-queries-20">Example Queries</a></h2>
<h4 id="compute-dau-for-non-windows-clients-for-the-last-week-1"><a class="header" href="#compute-dau-for-non-windows-clients-for-the-last-week-1">Compute DAU for non-windows clients for the last week</a></h4>
<pre><code class="language-sql">WITH sample AS (
SELECT
os,
submission_date,
cardinality(merge(cast(hll AS HLL))) AS count
FROM client_count_daily
WHERE submission_date &gt;= DATE_FORMAT(CURRENT_DATE - INTERVAL '7' DAY, '%Y%m%d')
GROUP BY
submission_date,
os
)
SELECT
os,
-- formatting date as late as possible improves performance dramatically
date_parse(submission_date, '%Y%m%d') AS submission_date,
count
FROM sample
WHERE
count &gt; 10 -- remove outliers
AND lower(os) NOT LIKE '%windows%'
ORDER BY
os,
submission_date DESC
</code></pre>
<h4 id="compute-wau-by-channel-for-the-last-week-1"><a class="header" href="#compute-wau-by-channel-for-the-last-week-1">Compute WAU by Channel for the last week</a></h4>
<pre><code class="language-sql">WITH dau AS (
SELECT
normalized_channel,
submission_date,
merge(cast(hll AS HLL)) AS hll
FROM client_count_daily
-- 2 days of lag, 7 days of results, and 6 days preceding for WAU
WHERE submission_date &gt; DATE_FORMAT(CURRENT_DATE - INTERVAL '15' DAY, '%Y%m%d')
GROUP BY
submission_date,
normalized_channel
),
wau AS (
SELECT
normalized_channel,
submission_date,
cardinality(merge(hll) OVER (
PARTITION BY normalized_channel
ORDER BY submission_date
ROWS BETWEEN 6 PRECEDING AND 0 FOLLOWING
)) AS count
FROM dau
)
SELECT
normalized_channel,
-- formatting date as late as possible improves performance dramatically
date_parse(submission_date, '%Y%m%d') AS submission_date,
count
FROM wau
WHERE
count &gt; 10 -- remove outliers
AND submission_date &gt; DATE_FORMAT(CURRENT_DATE - INTERVAL '9' DAY, '%Y%m%d') -- only days that have a full WAU
</code></pre>
<h2 id="caveats-3"><a class="header" href="#caveats-3">Caveats</a></h2>
<p>The <code>hll</code> column does not produce an exact count. <code>hll</code> stands for
<a href="https://en.wikipedia.org/wiki/HyperLogLog">HyperLogLog</a>, a sophisticated
algorithm that allows for the counting of extremely high numbers of items,
sacrificing a small amount of accuracy in exchange for using much less memory
than a simple counting structure.</p>
<p>When counts are calculated over a column that may change over time, such as
<code>total_uri_count_threshold</code>, a client is counted in every group in which it
appears. Over longer windows, such as MAU, this is more likely to occur.</p>
<h2 id="schema-21"><a class="header" href="#schema-21">Schema</a></h2>
<p>The data is partitioned by <code>submission_date</code> which is formatted as <code>%Y%m%d</code>,
like <code>20180130</code>.</p>
<p>As of 2018-03-15, the current version of the <code>client_count_daily</code> dataset
is <code>v2</code>, and has a schema as follows:</p>
<pre><code>root
|-- app_name: string (nullable = true)
|-- app_version: string (nullable = true)
|-- country: string (nullable = true)
|-- devtools_toolbox_opened: boolean (nullable = true)
|-- e10s_enabled: boolean (nullable = true)
|-- hll: binary (nullable = true)
|-- locale: string (nullable = true)
|-- normalized_channel: string (nullable = true)
|-- os: string (nullable = true)
|-- os_version: string (nullable = true)
|-- top_distribution_id: string (nullable = true)
|-- total_uri_count_threshold: integer (nullable = true)
</code></pre>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete/client_count_daily/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><blockquote>
<p>As of 2019-10-23, this dataset has been deprecated and is no longer
maintained. See <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1585539">Bug 1585539</a>.</p>
</blockquote>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete/client_count/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="crash-aggregates"><a class="header" href="#crash-aggregates">Crash Aggregates</a></h1>
<blockquote>
<p>As of 2018-04-02, this dataset has been deprecated and is no longer maintained. See <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1388025">Bug 1388025</a> for more information.</p>
</blockquote>
<ul>
<li><a href="datasets/obsolete/crash_aggregates/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/obsolete/crash_aggregates/reference.html#rows-and-columns">Rows and Columns</a></li>
<li><a href="datasets/obsolete/crash_aggregates/reference.html#accessing-the-data">Accessing the Data</a></li>
<li><a href="datasets/obsolete/crash_aggregates/reference.html#further-reading">Further Reading</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/crash_aggregates/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/obsolete/crash_aggregates/reference.html#example-queries">Example Queries</a></li>
<li><a href="datasets/obsolete/crash_aggregates/reference.html#sampling">Sampling</a>
<ul>
<li><a href="datasets/obsolete/crash_aggregates/reference.html#invalid-pings">Invalid Pings</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/crash_aggregates/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/obsolete/crash_aggregates/reference.html#schema">Schema</a></li>
</ul>
</li>
</ul>
<h1 id="introduction-33"><a class="header" href="#introduction-33">Introduction</a></h1>
<p>The <code>crash_aggregates</code> dataset compiles crash statistics over various dimensions for each day.</p>
<h4 id="rows-and-columns-1"><a class="header" href="#rows-and-columns-1">Rows and Columns</a></h4>
<p>There's one column for each of the stratifying dimensions and the crash statistics.
Each row is a distinct set of dimensions, along with their associated crash stats.
Example stratifying dimensions include channel and country;
example statistics include usage hours and plugin crashes.</p>
<h4 id="accessing-the-data-14"><a class="header" href="#accessing-the-data-14">Accessing the Data</a></h4>
<p>This dataset is accessible via STMO.</p>
<p>The data is stored as a parquet table in S3 at the following address.</p>
<pre><code>s3://telemetry-parquet/crash_aggregates/v1/
</code></pre>
<h4 id="further-reading-2"><a class="header" href="#further-reading-2">Further Reading</a></h4>
<p>The technical documentation for this dataset can be found in the
<a href="https://github.com/mozilla/telemetry-batch-view/blob/0128b08/docs/CrashAggregateView.md">telemetry-batch-view documentation</a></p>
<h1 id="data-reference-24"><a class="header" href="#data-reference-24">Data Reference</a></h1>
<h2 id="example-queries-21"><a class="header" href="#example-queries-21">Example Queries</a></h2>
<p>Here's an example query that computes crash rates
for each channel (sorted by number of usage hours):</p>
<pre><code class="language-sql">SELECT dimensions['channel'] AS channel,
sum(stats['usage_hours']) AS usage_hours,
1000 * sum(stats['main_crashes']) / sum(stats['usage_hours']) AS main_crash_rate,
1000 * sum(stats['content_crashes']) / sum(stats['usage_hours']) AS content_crash_rate,
1000 * sum(stats['plugin_crashes']) / sum(stats['usage_hours']) AS plugin_crash_rate,
1000 * sum(stats['gmplugin_crashes']) / sum(stats['usage_hours']) AS gmplugin_crash_rate,
1000 * sum(stats['gpu_crashes']) / sum(stats['usage_hours']) AS gpu_crash_rate
FROM crash_aggregates
GROUP BY dimensions['channel']
ORDER BY -sum(stats['usage_hours'])
</code></pre>
<p>Main process crashes by build date and OS version.</p>
<pre><code class="language-sql">WITH channel_rates AS (
SELECT dimensions['build_id'] AS build_id,
SUM(stats['main_crashes']) AS main_crashes, -- total number of crashes
SUM(stats['usage_hours']) / 1000 AS usage_kilohours, -- thousand hours of usage
dimensions['os_version'] AS os_version -- os version
FROM crash_aggregates
WHERE dimensions['experiment_id'] is null -- not in an experiment
AND regexp_like(dimensions['build_id'], '^\d{14}$') -- validate build IDs
AND dimensions['build_id'] &gt; '20160201000000' -- only in the date range that we care about
GROUP BY dimensions['build_id'], dimensions['os_version']
)
SELECT cast(parse_datetime(build_id, 'yyyyMMddHHmmss') as date) as build_id, -- program build date
usage_kilohours, -- thousands of usage hours
os_version, -- os version
main_crashes / usage_kilohours AS main_crash_rate -- crash rate being defined as crashes per thousand usage hours
FROM channel_rates
WHERE usage_kilohours &gt; 100 -- only aggregates that have statistically significant usage hours
ORDER BY build_id ASC
</code></pre>
<h2 id="sampling-2"><a class="header" href="#sampling-2">Sampling</a></h2>
<h3 id="invalid-pings-1"><a class="header" href="#invalid-pings-1">Invalid Pings</a></h3>
<p>We ignore invalid pings in our processing. A ping is considered invalid if any of the following holds:</p>
<ul>
<li>The submission dates or activity dates are invalid or missing.</li>
<li>The build ID is malformed.</li>
<li>The <code>docType</code> field is missing or unknown.</li>
<li>The ping is a main ping without usage hours or a crash ping with usage hours.</li>
</ul>
<h2 id="scheduling-29"><a class="header" href="#scheduling-29">Scheduling</a></h2>
<p>The <code>crash_aggregates</code> job is run daily, at midnight UTC.
The job is scheduled on <a href="https://github.com/mozilla/telemetry-airflow">Airflow</a>.
The DAG is <a href="https://github.com/mozilla/telemetry-airflow/blob/d50b938/dags/crash_aggregates.py">here</a>.</p>
<h2 id="schema-22"><a class="header" href="#schema-22">Schema</a></h2>
<p>The <code>crash_aggregates</code> table has 4 commonly-used columns:</p>
<ul>
<li><code>submission_date</code> is the date pings were submitted for a particular aggregate.
<ul>
<li>For example, <code>select sum(stats['usage_hours']) from crash_aggregates where submission_date = '2016-03-15'</code> will give the total number of user hours represented by pings submitted on March 15, 2016.</li>
<li>The dataset is partitioned by this field. Queries that limit the possible values of <code>submission_date</code> can run significantly faster.</li>
</ul>
</li>
<li><code>activity_date</code> is the day when the activity being recorded took place.
<ul>
<li>For example, <code>select sum(stats['usage_hours']) from crash_aggregates where activity_date = '2016-03-15'</code> will give the total number of user hours represented by activities that took place on March 15, 2016.</li>
<li>This can be several days before the pings are actually submitted, so it will always be before or on its corresponding <code>submission_date</code>.</li>
<li>Therefore, queries that are sensitive to when measurements were taken on the client should prefer this field over <code>submission_date</code>.</li>
</ul>
</li>
<li><code>dimensions</code> is a map of all the other dimensions that we currently care about. These fields include:
<ul>
<li><code>dimensions['build_version']</code> is the program version, like <code>46.0a1</code>.</li>
<li><code>dimensions['build_id']</code> is the <code>YYYYMMDDhhmmss</code> timestamp the program was built, like <code>20160123180541</code>. This is also known as the <code>build ID</code> or <code>buildid</code>.</li>
<li><code>dimensions['channel']</code> is the channel, like <code>release</code> or <code>beta</code>.</li>
<li><code>dimensions['application']</code> is the program name, like <code>Firefox</code> or <code>Fennec</code>.</li>
<li><code>dimensions['os_name']</code> is the name of the OS the program is running on, like <code>Darwin</code> or <code>Windows_NT</code>.</li>
<li><code>dimensions['os_version']</code> is the version of the OS the program is running on.</li>
<li><code>dimensions['architecture']</code> is the architecture that the program was built for (not necessarily the one it is running on).</li>
<li><code>dimensions['country']</code> is the country code for the user (determined using geoIP), like <code>US</code> or <code>UK</code>.</li>
<li><code>dimensions['experiment_id']</code> is the identifier of the experiment being participated in, such as <code>e10s-beta46-noapz@experiments.mozilla.org</code>, or null if no experiment.</li>
<li><code>dimensions['experiment_branch']</code> is the branch of the experiment being participated in, such as <code>control</code> or <code>experiment</code>, or null if no experiment.</li>
<li><code>dimensions['e10s_enabled']</code> is whether E10s is enabled.</li>
<li><code>dimensions['gfx_compositor']</code> is the graphics backend compositor used by the program, such as <code>d3d11</code>, <code>opengl</code> and <code>simple</code>. Null values may be reported as <code>none</code> as well.</li>
<li>All of the above fields can potentially be blank, which means &quot;not present&quot;. That means that in the actual pings, the corresponding fields were null.</li>
</ul>
</li>
<li><code>stats</code> contains the aggregate values that we care about:
<ul>
<li><code>stats['usage_hours']</code> is the number of user-hours represented by the aggregate.</li>
<li><code>stats['main_crashes']</code> is the number of main process crashes represented by the aggregate (or just program crashes, in the non-E10S case).</li>
<li><code>stats['content_crashes']</code> is the number of content process crashes represented by the aggregate.</li>
<li><code>stats['plugin_crashes']</code> is the number of plugin process crashes represented by the aggregate.</li>
<li><code>stats['gmplugin_crashes']</code> is the number of Gecko media plugin (often abbreviated <code>GMPlugin</code>) process crashes represented by the aggregate.</li>
<li><code>stats['content_shutdown_crashes']</code> is the number of content process crashes that were caused by failure to shut down in a timely manner.</li>
<li><code>stats['gpu_crashes']</code> is the number of GPU process crashes represented by the aggregate.</li>
</ul>
</li>
</ul>
<p><code>TODO(harter)</code>: https://bugzilla.mozilla.org/show_bug.cgi?id=1361862</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete/crash_aggregates/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="crash-summary-reference"><a class="header" href="#crash-summary-reference">Crash Summary Reference</a></h1>
<ul>
<li><a href="datasets/obsolete/crash_summary/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/obsolete/crash_summary/reference.html#contents">Contents</a></li>
<li><a href="datasets/obsolete/crash_summary/reference.html#accessing-the-data">Accessing the Data</a></li>
<li><a href="datasets/obsolete/crash_summary/reference.html#further-reading">Further Reading</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/crash_summary/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/obsolete/crash_summary/reference.html#example-queries">Example Queries</a></li>
<li><a href="datasets/obsolete/crash_summary/reference.html#sampling">Sampling</a></li>
<li><a href="datasets/obsolete/crash_summary/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/obsolete/crash_summary/reference.html#schema">Schema</a></li>
</ul>
</li>
</ul>
<h1 id="introduction-34"><a class="header" href="#introduction-34">Introduction</a></h1>
<blockquote>
<p>As of 2019-11-06, this dataset has been deprecated and is no longer maintained. Please use the <code>telemetry.crash</code> table instead, which is generated directly from live pings and is much more complete. See <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1572069">Bug 1572069</a> for more information.</p>
</blockquote>
<p>The <code>crash_summary</code> table is a direct representation of a crash ping.</p>
<h4 id="contents-12"><a class="header" href="#contents-12">Contents</a></h4>
<p>The <code>crash_summary</code> table contains one row for each crash ping.
Each column represents one field from the crash ping payload,
though only a subset of all crash ping fields are included.</p>
<h4 id="accessing-the-data-15"><a class="header" href="#accessing-the-data-15">Accessing the Data</a></h4>
<p>The data is stored as a parquet table in S3 at the following address.</p>
<pre><code>s3://telemetry-parquet/crash_summary/v1/
</code></pre>
<p><code>crash_summary</code> is accessible through STMO.
Here's an <a href="https://sql.telemetry.mozilla.org/queries/4793/source">example query (<code>STMO#4793</code>)</a>.</p>
<h4 id="further-reading-3"><a class="header" href="#further-reading-3">Further Reading</a></h4>
<p>The technical documentation for <code>crash_summary</code> is located in the
<a href="https://github.com/mozilla/telemetry-batch-view/blob/master/docs/CrashSummary.md">telemetry-batch-view documentation</a>.</p>
<p>The code responsible for generating this dataset is
<a href="https://github.com/mozilla/telemetry-batch-view/blob/master/GRAVEYARD.md#crash-summary">here</a></p>
<h1 id="data-reference-25"><a class="header" href="#data-reference-25">Data Reference</a></h1>
<h2 id="example-queries-22"><a class="header" href="#example-queries-22">Example Queries</a></h2>
<p>Here is an example query to get the total number of main crashes by <code>gfx_compositor</code>:</p>
<pre><code class="language-sql">select gfx_compositor, count(*)
from crash_summary
where application = 'Firefox'
and (payload.processType IS NULL OR payload.processType = 'main')
group by gfx_compositor
</code></pre>
<h2 id="sampling-3"><a class="header" href="#sampling-3">Sampling</a></h2>
<p><code>CrashSummary</code> contains one record for every
<a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/crash-ping.html">crash ping</a>
submitted by Firefox.</p>
<h2 id="scheduling-30"><a class="header" href="#scheduling-30">Scheduling</a></h2>
<p>This dataset is updated daily, shortly after midnight UTC.
The job is scheduled on
<a href="https://github.com/mozilla/telemetry-airflow">telemetry-airflow</a>.
The DAG is <a href="https://github.com/mozilla/telemetry-airflow/blob/166e0a555ee2de0d3c7f0add1011f7771f7ea23d/dags/crash_summary.py">here</a>.</p>
<h2 id="schema-23"><a class="header" href="#schema-23">Schema</a></h2>
<pre><code>root
|-- client_id: string (nullable = true)
|-- normalized_channel: string (nullable = true)
|-- build_version: string (nullable = true)
|-- build_id: string (nullable = true)
|-- channel: string (nullable = true)
|-- crash_time: string (nullable = true)
|-- application: string (nullable = true)
|-- os_name: string (nullable = true)
|-- os_version: string (nullable = true)
|-- architecture: string (nullable = true)
|-- country: string (nullable = true)
|-- experiment_id: string (nullable = true)
|-- experiment_branch: string (nullable = true)
|-- experiments: map (nullable = true)
| |-- key: string
| |-- value: string (valueContainsNull = true)
|-- e10s_enabled: boolean (nullable = true)
|-- gfx_compositor: string (nullable = true)
|-- profile_created: integer (nullable = true)
|-- payload: struct (nullable = true)
| |-- crashDate: string (nullable = true)
| |-- processType: string (nullable = true)
| |-- hasCrashEnvironment: boolean (nullable = true)
| |-- metadata: map (nullable = true)
| | |-- key: string
| | |-- value: string (valueContainsNull = true)
| |-- version: integer (nullable = true)
|-- submission_date: string (nullable = true)
</code></pre>
<p>For more detail on where these fields come from in the
<a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/crash-ping.html">raw data</a>,
please look at the case classes
<a href="https://github.com/mozilla/telemetry-batch-view/blob/master/GRAVEYARD.md#crash-summary">in the <code>CrashSummaryView</code> code</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete/crash_summary/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="error-aggregates-reference"><a class="header" href="#error-aggregates-reference">Error Aggregates Reference</a></h1>
<blockquote>
<p>As of 2019-11-21, this dataset has been deprecated and is no longer maintained. See <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1594112">Bug 1594112</a> for more information.</p>
</blockquote>
<ul>
<li><a href="datasets/obsolete/error_aggregates/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/obsolete/error_aggregates/reference.html#contents">Contents</a></li>
<li><a href="datasets/obsolete/error_aggregates/reference.html#accessing-the-data">Accessing the data</a></li>
<li><a href="datasets/obsolete/error_aggregates/reference.html#further-reading">Further Reading</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/error_aggregates/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/obsolete/error_aggregates/reference.html#example-queries">Example Queries</a></li>
<li><a href="datasets/obsolete/error_aggregates/reference.html#sampling">Sampling</a>
<ul>
<li><a href="datasets/obsolete/error_aggregates/reference.html#data-sources">Data sources</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/error_aggregates/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/obsolete/error_aggregates/reference.html#schema">Schema</a></li>
</ul>
</li>
</ul>
<h1 id="introduction-35"><a class="header" href="#introduction-35">Introduction</a></h1>
<p>The <code>error_aggregates_v2</code> table contains counts of errors derived from main and crash
pings, aggregated every 5 minutes.
It is the dataset backing the old mission control view, but it may also be queried independently.</p>
<h4 id="contents-13"><a class="header" href="#contents-13">Contents</a></h4>
<p>The <code>error_aggregates_v2</code> table contains counts of various error measures (for
example: crashes, &quot;the slow script dialog showing&quot;), aggregated across each
unique set of dimensions (for example: channel, operating system) every 5
minutes. You can get an aggregated count for any particular set of dimensions
by summing using SQL.</p>
<h5 id="experiment-unpacking"><a class="header" href="#experiment-unpacking">Experiment unpacking</a></h5>
<p>It's important to note that when this dataset is written, pings from clients participating in an experiment
are aggregated under the <code>experiment_id</code> and <code>experiment_branch</code> dimensions for the experiment and branch
they are participating in. However, they are also aggregated with the rest of the population, where the values of
these dimensions are null.
Therefore, care must be taken when writing aggregate queries over the whole population: in these cases you need to
filter for <code>experiment_id IS NULL</code> and <code>experiment_branch IS NULL</code> in order to not double-count pings from experiments.</p>
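<p>In practice, this means whole-population aggregations should look something like the following sketch (the selected measures are illustrative):</p>
<pre><code class="language-sql">-- Sketch: aggregate over the whole population without double-counting
-- pings from experiments, by keeping only the "unpacked" rows.
SELECT window_start,
       channel,
       sum(main_crashes) AS main_crashes,
       sum(usage_hours) AS usage_hours
FROM error_aggregates_v2
WHERE experiment_id IS NULL
  AND experiment_branch IS NULL
GROUP BY 1, 2
</code></pre>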
<h4 id="accessing-the-data-16"><a class="header" href="#accessing-the-data-16">Accessing the data</a></h4>
<p>You can access the data via STMO. Choose <code>Athena</code> and then select the
<code>telemetry.error_aggregates_v2</code> table.</p>
<h4 id="further-reading-4"><a class="header" href="#further-reading-4">Further Reading</a></h4>
<p>The code responsible for generating this dataset is <a href="https://github.com/mozilla/telemetry-streaming/blob/master/src/main/scala/com/mozilla/telemetry/streaming/ErrorAggregator.scala">here</a>.</p>
<h1 id="data-reference-26"><a class="header" href="#data-reference-26">Data Reference</a></h1>
<h2 id="example-queries-23"><a class="header" href="#example-queries-23">Example Queries</a></h2>
<p>Getting a large number of different crash measures across many platforms and channels
(<a href="https://sql.telemetry.mozilla.org/queries/4769/source"><code>STMO#4769</code></a>):</p>
<pre><code class="language-sql">SELECT window_start,
build_id,
channel,
os_name,
version,
sum(usage_hours) AS usage_hours,
sum(main_crashes) AS main,
sum(content_crashes) AS content,
sum(gpu_crashes) AS gpu,
sum(plugin_crashes) AS plugin,
sum(gmplugin_crashes) AS gmplugin
FROM error_aggregates_v2
WHERE application = 'Firefox'
AND (os_name = 'Darwin' or os_name = 'Linux' or os_name = 'Windows_NT')
AND (channel = 'beta' or channel = 'release' or channel = 'nightly' or channel = 'esr')
AND build_id &gt; '201801'
AND window_start &gt; current_timestamp - (1 * interval '24' hour)
AND experiment_id IS NULL
AND experiment_branch IS NULL
GROUP BY window_start, channel, build_id, version, os_name
</code></pre>
<p>Get the number of <code>main_crashes</code> on Windows over a small interval
(<a href="https://sql.telemetry.mozilla.org/queries/51677"><code>STMO#51677</code></a>):</p>
<pre><code class="language-sql">SELECT window_start as time, sum(main_crashes) AS main_crashes
FROM error_aggregates_v2
WHERE application = 'Firefox'
AND os_name = 'Windows_NT'
AND channel = 'release'
AND version = '58.0.2'
AND window_start &gt; timestamp '2018-02-21'
AND window_end &lt; timestamp '2018-02-22'
AND experiment_id IS NULL
AND experiment_branch IS NULL
GROUP BY window_start
</code></pre>
<h2 id="sampling-4"><a class="header" href="#sampling-4">Sampling</a></h2>
<h3 id="data-sources-1"><a class="header" href="#data-sources-1">Data sources</a></h3>
<p>The aggregates in this data source are derived from main, crash and core <a href="datasets/obsolete/error_aggregates/../../pings.html">pings</a>:</p>
<ul>
<li>crash pings are used to count main and content crash events; all other errors from desktop clients (including all other crashes) are gathered from main pings</li>
<li>core pings are used to count usage hours, first subsessions, and unique clients.</li>
</ul>
<h2 id="scheduling-31"><a class="header" href="#scheduling-31">Scheduling</a></h2>
<p>The <code>error_aggregates</code> job runs continuously, using the Spark Streaming infrastructure.</p>
<h2 id="schema-24"><a class="header" href="#schema-24">Schema</a></h2>
<p>The <code>error_aggregates_v2</code> table has the following columns which define its dimensions:</p>
<ul>
<li><code>window_start</code>: beginning of interval when this sample was taken</li>
<li><code>window_end</code>: end of interval when this sample was taken (will always be 5 minutes more
than <code>window_start</code> for any given row)</li>
<li><code>submission_date_s3</code>: the date pings were submitted for a particular aggregate</li>
<li><code>channel</code>: the channel, like <code>release</code> or <code>beta</code></li>
<li><code>version</code>: the version e.g. <code>57.0.1</code></li>
<li><code>display_version</code>: like version, but includes beta number if applicable e.g. <code>57.0.1b4</code></li>
<li><code>build_id</code>: the <code>YYYYMMDDhhmmss</code> timestamp the program was built, like <code>20160123180541</code>. This is also known as the <code>build ID</code> or <code>buildid</code></li>
<li><code>application</code>: application name (e.g. <code>Firefox</code> or <code>Fennec</code>)</li>
<li><code>os_name</code>: name of the OS (e.g. <code>Darwin</code> or <code>Windows_NT</code>)</li>
<li><code>os_version</code>: version of the OS</li>
<li><code>architecture</code>: build architecture, e.g. <code>x86</code></li>
<li><code>country</code>: country code for the user (determined using geoIP), like <code>US</code> or <code>UK</code></li>
<li><code>experiment_id</code>: identifier of the experiment being participated in, such as <code>e10s-beta46-noapz@experiments.mozilla.org</code>, null if no experiment or for unpacked rows (see <a href="datasets/obsolete/error_aggregates/reference.html#experiment-unpacking">Experiment unpacking</a>)</li>
<li><code>experiment_branch</code>: the branch of the experiment being participated in, such as <code>control</code> or <code>experiment</code>, null if no experiment or for unpacked rows (see <a href="datasets/obsolete/error_aggregates/reference.html#experiment-unpacking">Experiment unpacking</a>)</li>
</ul>
<p>And these are the various measures we are counting:</p>
<ul>
<li><code>usage_hours</code>: number of usage hours (i.e. total number of session hours reported by the pings in this aggregate; note that this might include time when
people are not actively using the browser or their computer is asleep)</li>
<li><code>count</code>: number of pings processed in this aggregate</li>
<li><code>main_crashes</code>: number of main process crashes (or just program crashes, in the non-e10s case)</li>
<li><code>startup_crashes</code> : number of startup crashes</li>
<li><code>content_crashes</code>: number of content process crashes (<code>version &gt;= 58</code> only)</li>
<li><code>gpu_crashes</code>: number of GPU process crashes</li>
<li><code>plugin_crashes</code>: number of plugin process crashes</li>
<li><code>gmplugin_crashes</code>: number of Gecko media plugin (often abbreviated <code>GMPlugin</code>) process crashes</li>
<li><code>content_shutdown_crashes</code>: number of content process crashes that were caused by failure to shut down in a timely manner (<code>version &gt;= 58</code> only)</li>
<li><code>browser_shim_usage_blocked</code>: number of times a CPOW shim was blocked from being created by browser code</li>
<li><code>permissions_sql_corrupted</code>: number of times the permissions SQL error occurred (beta/nightly only)</li>
<li><code>defective_permissions_sql_removed</code>: number of times there was a removal of defective <code>permissions.sqlite</code> (beta/nightly only)</li>
<li><code>slow_script_notice_count</code>: number of times the slow script notice count was shown (beta/nightly only)</li>
<li><code>slow_script_page_count</code>: number of pages that trigger slow script notices (beta/nightly only)</li>
</ul>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete/error_aggregates/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="first-shutdown-summary"><a class="header" href="#first-shutdown-summary">First Shutdown Summary</a></h1>
<ul>
<li><a href="datasets/obsolete/first_shutdown_summary/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/obsolete/first_shutdown_summary/reference.html#contents">Contents</a></li>
<li><a href="datasets/obsolete/first_shutdown_summary/reference.html#background-and-caveats">Background and Caveats</a></li>
<li><a href="datasets/obsolete/first_shutdown_summary/reference.html#accessing-the-data">Accessing the Data</a></li>
</ul>
</li>
</ul>
<h1 id="introduction-36"><a class="header" href="#introduction-36">Introduction</a></h1>
<p>The <code>first_shutdown_summary</code> table is a summary of the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/first-shutdown-ping.html"><code>first-shutdown</code>
ping</a>.</p>
<h4 id="contents-14"><a class="header" href="#contents-14">Contents</a></h4>
<p>The first shutdown ping contains first session usage data. The
dataset has rows similar to the
<a href="datasets/obsolete/first_shutdown_summary//datasets/batch_view/new_profile/reference.html"><code>telemetry_new_profile_parquet</code></a>,
but in the shape of
<a href="datasets/obsolete/first_shutdown_summary//datasets/batch_view/main_summary/reference.html"><code>main_summary</code></a>.</p>
<h4 id="background-and-caveats-13"><a class="header" href="#background-and-caveats-13">Background and Caveats</a></h4>
<p>Ping latency was reduced through the
shutdown ping-sender mechanism in Firefox 55. To maintain consistent historical
behavior, the first main ping is not sent until the second start up. In Firefox 57, a
separate first-shutdown ping was created to evaluate first-shutdown behavior while maintaining backwards compatibility.</p>
<p>In many cases, the first-shutdown ping is a duplicate of the main ping. The first-shutdown summary can be used in conjunction with the main summary by taking the union and deduplicating on the <code>document_id</code>.</p>
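<p>A minimal sketch of that union-and-deduplicate pattern is shown below; the selected columns are illustrative and assume both tables share the <code>main_summary</code> shape described above.</p>
<pre><code class="language-sql">-- Sketch: combine first-shutdown and main pings, keeping one row per document_id.
WITH combined AS (
  SELECT document_id, client_id, submission_date_s3, subsession_length
  FROM main_summary
  UNION ALL
  SELECT document_id, client_id, submission_date_s3, subsession_length
  FROM first_shutdown_summary
)
SELECT document_id, client_id, submission_date_s3, subsession_length
FROM (
  SELECT *,
         row_number() OVER (PARTITION BY document_id ORDER BY submission_date_s3) AS rn
  FROM combined
) deduped
WHERE rn = 1
</code></pre>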
<h4 id="accessing-the-data-17"><a class="header" href="#accessing-the-data-17">Accessing the Data</a></h4>
<p>The data can be accessed as <code>first_shutdown_summary</code>.</p>
<p>The data is backfilled to 2017-09-22, the date of its first nightly appearance. This data should be available to all releases on and after Firefox 57.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete/first_shutdown_summary/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="heavy-users"><a class="header" href="#heavy-users">Heavy Users</a></h1>
<blockquote>
<p>As of 2018-05-18, this dataset has been deprecated and is no longer maintained. See <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1455314">Bug 1455314</a></p>
</blockquote>
<ul>
<li><a href="datasets/obsolete/heavy_users/reference.html#replacement">Replacement</a></li>
<li><a href="datasets/obsolete/heavy_users/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/obsolete/heavy_users/reference.html#contents">Contents</a></li>
<li><a href="datasets/obsolete/heavy_users/reference.html#background-and-caveats">Background and Caveats</a></li>
<li><a href="datasets/obsolete/heavy_users/reference.html#accessing-the-data">Accessing the Data</a></li>
<li><a href="datasets/obsolete/heavy_users/reference.html#further-reading">Further Reading</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/heavy_users/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/obsolete/heavy_users/reference.html#example-queries">Example Queries</a></li>
<li><a href="datasets/obsolete/heavy_users/reference.html#schema">Schema</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/heavy_users/reference.html#code-reference">Code Reference</a></li>
</ul>
<h1 id="replacement-1"><a class="header" href="#replacement-1">Replacement</a></h1>
<p>We've moved to assigning users an active tag based on <code>total_uri_count</code>.</p>
<p>The activity of a user based on <code>active_ticks</code> is available in <code>clients_daily</code>
in the <code>active_hours_sum</code> field, which is computed as <code>sum(active_ticks / 720)</code>.</p>
<p>To retrieve a client's 28-day <code>active_hours</code>, use the following query:</p>
<pre><code class="language-sql">SELECT submission_date_s3,
client_id,
SUM(active_hours_sum) OVER (PARTITION BY client_id
ORDER BY submission_date_s3 ASC
ROWS 27 PRECEDING) AS monthly_active_hours
FROM
clients_daily
</code></pre>
<h1 id="introduction-37"><a class="header" href="#introduction-37">Introduction</a></h1>
<p>The <code>heavy_users</code> table provides information about whether a given <code>client_id</code> is
considered a &quot;heavy user&quot; on each day (using submission date).</p>
<h4 id="contents-15"><a class="header" href="#contents-15">Contents</a></h4>
<p>The <code>heavy_users</code> table contains one row per client-day, where day is
<code>submission_date</code>. A client has a row for a specific <code>submission_date</code> if
they were active at all in the 28 day window ending on that <code>submission_date</code>.</p>
<p>A user is a &quot;heavy user&quot; as of day N if, for the 28 day period ending
on day N, the sum of their <code>active_ticks</code> is in the 90th percentile (or
above) of all clients during that period. For more analysis on this,
and a discussion of new profiles, see
<a href="datasets/obsolete/heavy_users/BROKEN:https://metrics.mozilla.com/protected/sguha/heavy/heavycutoffs5.html">this link</a>.</p>
<h4 id="background-and-caveats-14"><a class="header" href="#background-and-caveats-14">Background and Caveats</a></h4>
<ol>
<li>Data starts at 20170801. There is technically data in the table before
this, but the <code>heavy_user</code> column is <code>NULL</code> for those dates because it
needed to bootstrap the first 28 day window.</li>
<li>Because it is the top 10% of clients for each 28 day period, more
than 10% of clients active on a given <code>submission_date</code> will be
considered heavy users. If you join with another data source
(<code>main_summary</code>, for example), you may see a larger proportion of heavy
users than expected.</li>
<li>Each day has a separate, but related, set of heavy users. Initial
investigations show that approximately 97.5% of heavy users as of a
certain day are still considered heavy users as of the next day.</li>
<li>There is no &quot;fixing&quot; or weighting of new profiles - days before the
profile was created are counted as zero <code>active_ticks</code>. Analyses may
need to use the included <code>profile_creation_date</code> field to take this
into account.</li>
</ol>
<h4 id="accessing-the-data-18"><a class="header" href="#accessing-the-data-18">Accessing the Data</a></h4>
<p>The data is available both via STMO and Spark.</p>
<p>In Spark:</p>
<pre><code class="language-python">spark.read.parquet(&quot;s3://telemetry-parquet/heavy_users/v1&quot;)
</code></pre>
<p>In SQL:</p>
<pre><code class="language-sql">SELECT * FROM heavy_users LIMIT 3
</code></pre>
<h4 id="further-reading-5"><a class="header" href="#further-reading-5">Further Reading</a></h4>
<p>The code responsible for generating this dataset is
<a href="https://github.com/mozilla/telemetry-batch-view/blob/master/GRAVEYARD.md#heavy-users">here</a></p>
<h1 id="data-reference-27"><a class="header" href="#data-reference-27">Data Reference</a></h1>
<h2 id="example-queries-24"><a class="header" href="#example-queries-24">Example Queries</a></h2>
<p>Example queries:</p>
<ul>
<li><a href="https://sql.telemetry.mozilla.org/queries/47041/source#127382">Join <code>heavy_users</code> with <code>main_summary</code> to get distribution of <code>max_concurrent_tab_count</code> for heavy vs. non-heavy users (<code>STMO#47041</code>)</a></li>
<li><a href="https://sql.telemetry.mozilla.org/queries/47044/source#127385">Join <code>heavy_users</code> with <code>longitudinal</code> to get crash rates for heavy vs. non-heavy users (<code>STMO#47044</code>)</a></li>
</ul>
<h2 id="schema-25"><a class="header" href="#schema-25">Schema</a></h2>
<p>As of 2017-10-05, the current version of the <code>heavy_users</code> dataset is <code>v1</code>, and has a schema as follows:</p>
<pre><code>root
|-- client_id: string (nullable = true)
|-- sample_id: integer (nullable = true)
|-- profile_creation_date: long (nullable = true)
|-- active_ticks: long (nullable = true)
|-- active_ticks_period: long (nullable = true)
|-- heavy_user: boolean (nullable = true)
|-- prev_year_heavy_user: boolean (nullable = true)
|-- submission_date_s3: string (nullable = true)
</code></pre>
<h1 id="code-reference-26"><a class="header" href="#code-reference-26">Code Reference</a></h1>
<p>This dataset is generated by
<a href="https://github.com/mozilla/telemetry-batch-view/blob/master/GRAVEYARD.md#heavy-users">telemetry-batch-view</a>.
Refer to this repository for information on how to run or augment the dataset.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete/heavy_users/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="legacy-mobile-datasets"><a class="header" href="#legacy-mobile-datasets">Legacy Mobile Datasets</a></h1>
<p>Modern mobile products use the <a href="datasets/obsolete/legacy_mobile/../../../concepts/glean/glean.html">Glean SDK</a> and are thus documented by the <a href="https://dictionary.telemetry.mozilla.org">Glean Dictionary</a>.
This page documents some details about accessing data for legacy products based on older technologies.</p>
<h2 id="legacy-ping-tables"><a class="header" href="#legacy-ping-tables">Legacy ping tables</a></h2>
<p>Legacy (pre-Glean) mobile data is structured differently from desktop data. Instead of sending a <code>main</code> ping, mobile products provide the following key types of pings:</p>
<ul>
<li><code>core</code></li>
<li><code>events</code></li>
</ul>
<p>The core ping is sent once for each session. It includes a much smaller set of
metrics than the main ping because of network and data size constraints. All mobile apps send the core ping. For more information on the core ping, see the telemetry documentation <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/obsolete/core-ping.html">here</a>.</p>
<p>Event pings are not sent for all products. They are sent by Focus Android, Focus iOS, Klar, Firefox for FireTV, Firefox for Echo Show, and Firefox Lite.
Event pings are sent more frequently than core pings, at most once per 10 minute interval.
If a ping records 10,000 events, it is sent immediately unless it is within 10 minutes of the last event ping sent: in this case some data may be lost.</p>
<p>Mobile legacy event pings follow generally the same format as the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/event-ping.html">desktop <code>event</code> ping</a>.</p>
<p>Fennec (Firefox Android) does not send event pings. Instead, it includes a
<code>saved_session</code> ping with the same format as <code>main</code> pings. However, it is only
available for users who have installed a pre-release, plus a few users who have installed a release. In both cases, they must have opted in to telemetry collection.
Data from this collection must be treated with caution because it comes from a biased
population and therefore should not be used to draw conclusions about Fennec users.</p>
<p>For more information on the implementation of the event pings and to view event
descriptions, see the documentation for <a href="https://github.com/mozilla-mobile/focus-android/blob/master/docs/Telemetry.md">Focus</a>, <a href="https://github.com/mozilla-mobile/firefox-tv/blob/master/docs/telemetry.md">Firefox for FireTV</a>, and <a href="https://github.com/mozilla-mobile/firefox-echo-show/blob/master/docs/telemetry.md">Firefox for Echo Show</a>.</p>
<h3 id="core-ping-derived-datasets"><a class="header" href="#core-ping-derived-datasets">Core Ping Derived Datasets</a></h3>
<h4 id="telemetrycore"><a class="header" href="#telemetrycore"><code>telemetry.core</code></a></h4>
<p>For most analyses of mobile data, you need to use the <code>telemetry.core</code> table. It includes data for all the non-desktop Firefox applications that send core pings.</p>
<p>You need to filter on <code>app_name</code> and <code>os</code> because Firefox iOS and Firefox Android
have the same <code>app_name</code>. It is recommended that you always filter on <code>app_name</code>, <code>os</code>, app version (found as <code>metadata_app_version</code>), and release channel (found under <code>metadata</code> as <code>metadata.normalized_channel</code>).</p>
<p>Versioned tables are available for core ping storage for historical reference, but a table without a version suffix always represents an up-to-date table. It is recommended that you use the unversioned table, so you can be sure your analysis is based on up-to-date information.</p>
<p>The <code>seq</code> field indicates the order in which pings are sent. A record with <code>seq = 1</code> represents the first ping received for a client id and can be used as a proxy to identify new users.</p>
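<p>Putting the recommendations above together, a query against <code>telemetry.core</code> typically starts from a filter like the sketch below; the literal values are placeholders, and <code>client_id</code> is assumed to be the client identifier column.</p>
<pre><code class="language-sql">-- Sketch: recommended filters for telemetry.core, with seq = 1 as a proxy
-- for new profiles (literal filter values are illustrative).
SELECT count(DISTINCT client_id) AS new_profiles
FROM telemetry.core
WHERE app_name = 'Focus'
  AND os = 'Android'
  AND metadata_app_version = '8.0'
  AND metadata.normalized_channel = 'release'
  AND seq = 1
</code></pre>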
<h3 id="event-ping-derived-datasets"><a class="header" href="#event-ping-derived-datasets">Event Ping Derived Datasets</a></h3>
<p>There are two tables for mobile event data: <code>telemetry.focus_event</code> and <code>telemetry.mobile_event</code>.</p>
<p>As the name suggests, one table includes the event pings from Focus (iOS, Android
and Klar). The other table includes the event data for other apps. Both tables use the same format and columns.</p>
<h4 id="telemetrymobile_events"><a class="header" href="#telemetrymobile_events"><code>telemetry.mobile_events</code></a></h4>
<p>The <code>telemetry.mobile_events</code> table includes event data for Firefox for Fire TV, Firefox for Echo Show, and Firefox Lite. A metadata column with a list of metrics is also included.</p>
<p>As when querying <code>telemetry.core</code>, multiple applications are included in each table, so it is recommended that you filter on at least <code>app_name</code> and <code>os</code>. Note that these tables do not include an <code>app_version</code> field: if you want to filter or join on a specific version, you must first identify the corresponding <code>metadata.app_build_id</code>(s) for that <code>app_version</code> by contacting the engineering team that created the app.</p>
<p>A few other applications also send event data to this table, including Lockbox and FirefoxReality. For more information about the event data that is sent from these applications, see their documentation.</p>
<h4 id="telemetryfocus_events"><a class="header" href="#telemetryfocus_events"><code>telemetry.focus_events</code></a></h4>
<p>The <code>telemetry.focus_events</code> table includes event data for Focus Android, Focus iOS, and Klar.</p>
<p>As when querying <code>telemetry.core</code>, multiple apps are included in each table, so it is recommended that you filter on at least <code>app_name</code> and <code>os</code>. Note that these tables do not include an <code>app_version</code> field: if you want to filter or join on a specific version, you must first identify the corresponding <code>app_build_id</code>(s) for that <code>app_version</code> by contacting the engineering team that created the app.</p>
<p>A few other applications send data to this table. However, it is recommended that you use
this table only for analysis of event data from Focus and its related apps.</p>
<h3 id="notes-1"><a class="header" href="#notes-1">Notes</a></h3>
<p>Each app uses a unique set of release channels. Most apps include a <code>nightly</code>, <code>beta</code>, <code>release</code>, and an <code>other</code> channel. Each channel is used during various stages of development: generally users sign up to test a pre-release version (anything other than <code>release</code>). In Focus Android, the <code>beta</code> channel uses the same APK in the Google Play Store as the <code>release</code> channel. However, beta users get access to this version earlier than users who receive the final release.</p>
<p>As soon as the <code>release</code> version is published, beta users run the same version
of the app as users who received the final release, and the two groups become indistinguishable unless you perform a query that flags them by <code>client_id</code>. Beta releases have <code>normalized_channel</code> tagged as <code>release</code>. If you want to filter for beta users, you can only identify them by looking for a version number higher than the one assigned to the current official release, or for that version appearing before the official release date.</p>
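<p>A hedged sketch of that approach against <code>telemetry.core</code> follows; the app name and version threshold are placeholders, and the string comparison on <code>metadata_app_version</code> is only illustrative.</p>
<pre><code class="language-sql">-- Hedged sketch: isolate presumed beta users by looking for a version number
-- higher than the current official release ('8.0' is a placeholder).
SELECT COUNT(DISTINCT client_id) AS presumed_beta_clients
FROM telemetry.core
WHERE app_name = 'Focus'                      -- placeholder app
  AND os = 'Android'                          -- placeholder OS
  AND metadata.normalized_channel = 'release' -- beta builds also report 'release'
  AND metadata_app_version &gt; '8.0'            -- placeholder; beware lexicographic comparison
</code></pre>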
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete/legacy_mobile/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="longitudinal-reference"><a class="header" href="#longitudinal-reference">Longitudinal Reference</a></h1>
<ul>
<li><a href="datasets/obsolete/longitudinal/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/obsolete/longitudinal/reference.html#contents">Contents</a></li>
<li><a href="datasets/obsolete/longitudinal/reference.html#background-and-caveats">Background and Caveats</a></li>
<li><a href="datasets/obsolete/longitudinal/reference.html#accessing-the-data">Accessing the Data</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/longitudinal/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/obsolete/longitudinal/reference.html#sampling">Sampling</a>
<ul>
<li><a href="datasets/obsolete/longitudinal/reference.html#pings-within-last-6-months">Pings Within Last 6 Months</a></li>
<li><a href="datasets/obsolete/longitudinal/reference.html#1-sample">1% Sample</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/longitudinal/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/obsolete/longitudinal/reference.html#schema">Schema</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/longitudinal/reference.html#code-reference">Code Reference</a></li>
</ul>
<h1 id="introduction-38"><a class="header" href="#introduction-38">Introduction</a></h1>
<p>The <code>longitudinal</code> dataset is a 1% sample of main ping data
organized so that each row corresponds to a <code>client_id</code>.
If you're not sure which dataset to use for your analysis,
this is probably what you want.</p>
<h4 id="contents-16"><a class="header" href="#contents-16">Contents</a></h4>
<p>Each row in the <code>longitudinal</code> dataset represents one <code>client_id</code>,
which is approximately a user.
Each column represents a field from the main ping.
Most fields contain <strong>arrays of values</strong>, with one value for each ping associated with a <code>client_id</code>.
Using arrays gives you access to the raw data from each ping,
but they can be difficult to work with from SQL.
Here's a <a href="https://sql.telemetry.mozilla.org/queries/4188#table">query showing some sample data (<code>STMO#4188</code>)</a>
to help illustrate.</p>
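<p>For instance, here is a minimal sketch of pulling a single value out of an array column with Presto; <code>geo_country</code> is assumed to be an array-typed column, and the assumption that index 1 holds the most recent ping's value should be checked against <code>STMO#4188</code>.</p>
<pre><code class="language-sql">-- Hedged sketch: read one element of an array column per client.
SELECT client_id,
       geo_country[1] AS example_value -- Presto arrays are 1-indexed
FROM longitudinal
LIMIT 10
</code></pre>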
<h4 id="background-and-caveats-15"><a class="header" href="#background-and-caveats-15">Background and Caveats</a></h4>
<p>Think of the longitudinal table as wide and short.
The dataset contains more columns than <code>main_summary</code>
and down-samples to 1% of all clients to reduce query computation time and save resources.</p>
<p>In summary, the longitudinal table differs from <code>main_summary</code> in two important ways:</p>
<ul>
<li>The longitudinal dataset groups all data so that one row represents a <code>client_id</code></li>
<li>The longitudinal dataset samples to 1% of all <code>client_id</code>s</li>
</ul>
<p>Please note that this dataset only contains release (or opt-out) histograms and scalars.</p>
<h4 id="accessing-the-data-19"><a class="header" href="#accessing-the-data-19">Accessing the Data</a></h4>
<p>The <code>longitudinal</code> dataset is available in STMO,
though the array values can be difficult to work with in SQL.
Take a look at <a href="https://sql.telemetry.mozilla.org/queries/4189/source"><code>STMO#4189</code></a>.</p>
<p>The data is stored as a parquet table in S3 at the following address.</p>
<pre><code>s3://telemetry-parquet/longitudinal/
</code></pre>
<h1 id="data-reference-28"><a class="header" href="#data-reference-28">Data Reference</a></h1>
<h2 id="sampling-5"><a class="header" href="#sampling-5">Sampling</a></h2>
<h3 id="pings-within-last-6-months"><a class="header" href="#pings-within-last-6-months">Pings Within Last 6 Months</a></h3>
<p>The <code>longitudinal</code> dataset filters to <code>main</code> pings received within the last 6 months.</p>
<h3 id="1-sample"><a class="header" href="#1-sample">1% Sample</a></h3>
<p>The longitudinal dataset samples down to 1% of all clients in the above sample.
The sample is generated by the following process:</p>
<ul>
<li>Hash the <code>client_id</code> of each ping from the last 6 months.</li>
<li>Project that hash onto an integer from 1 to 100, inclusive.</li>
<li>Filter to pings whose hashed <code>client_id</code> matches a 'magic number' (in this case, 42).</li>
</ul>
<p>This process has a couple of nice properties:</p>
<ul>
<li>The sample is consistent over time.
The <code>longitudinal</code> dataset is regenerated weekly,
and with this process the clients included in each run are very similar.
The only change comes from never-before-seen clients,
or clients without a ping in the last 180 days.</li>
<li>We don't need to adjust the sample as new clients enter or exit our pool.</li>
</ul>
<p>More practically,
the sample is created by filtering to pings with <code>main_summary.sample_id == 42</code>.
If you're working with <code>main_summary</code>,
you can recreate this sample by applying the same filter manually, as sketched below.</p>
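<p>A minimal sketch, assuming <code>sample_id</code> is stored as a string in your environment (compare as an integer if it is not):</p>
<pre><code class="language-sql">-- Hedged sketch: recreate the longitudinal 1% sample from main_summary.
SELECT client_id,
       submission_date_s3
FROM main_summary
WHERE sample_id = '42'                 -- the longitudinal 'magic number'
  AND submission_date_s3 &gt;= '20190101' -- limit the scan; adjust as needed
</code></pre>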
<h2 id="scheduling-32"><a class="header" href="#scheduling-32">Scheduling</a></h2>
<p>The <code>longitudinal</code> job is run weekly, early on Sunday morning UTC.
The job is scheduled on <a href="https://github.com/mozilla/telemetry-airflow">Airflow</a>.
The DAG is <a href="https://github.com/mozilla/telemetry-airflow/blob/54cffc42a2ca24e46056b7030735f0d4d093c0c7/dags/longitudinal.py">here</a>.</p>
<h2 id="schema-26"><a class="header" href="#schema-26">Schema</a></h2>
<p><code>TODO(harter)</code>: https://bugzilla.mozilla.org/show_bug.cgi?id=1361862</p>
<h1 id="code-reference-27"><a class="header" href="#code-reference-27">Code Reference</a></h1>
<p>This dataset is generated by
<a href="https://github.com/mozilla/telemetry-batch-view/blob/master/GRAVEYARD.md#longitudinal">telemetry-batch-view</a>.
Refer to this repository for information on how to run or augment the dataset.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete/longitudinal/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="1-day-retention"><a class="header" href="#1-day-retention">1 Day Retention</a></h1>
<blockquote>
<p>As of 2019-08-13, this dataset has been deprecated and is no longer
maintained. See <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1571565">Bug 1571565</a>
for historical sources.</p>
</blockquote>
<ul>
<li><a href="datasets/obsolete/retention/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/obsolete/retention/reference.html#contents">Contents</a></li>
<li><a href="datasets/obsolete/retention/reference.html#background-and-caveats">Background and Caveats</a></li>
<li><a href="datasets/obsolete/retention/reference.html#accessing-the-data">Accessing the Data</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/retention/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/obsolete/retention/reference.html#example-queries">Example Queries</a></li>
<li><a href="datasets/obsolete/retention/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/obsolete/retention/reference.html#schema">Schema</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/retention/reference.html#code-reference">Code Reference</a></li>
</ul>
<h1 id="introduction-39"><a class="header" href="#introduction-39">Introduction</a></h1>
<p>The <code>retention</code> table provides client counts relevant to client retention at a
1-day granularity. The project is tracked in <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1381840">Bug 1381840</a>.</p>
<h3 id="contents-17"><a class="header" href="#contents-17">Contents</a></h3>
<p>The <code>retention</code> table contains a set of attribute columns used to specify a
cohort of users and a set of metric columns to describe cohort activity. Each
row contains a permutation of attributes, an approximate set of clients in a
cohort, and the aggregate engagement metrics.</p>
<p>This table uses the HyperLogLog (HLL) sketch to create an approximate set of
clients in a cohort. HLL allows counting across overlapping cohorts in a single
pass while avoiding the problem of double counting. This data structure has the
benefit of being compact and performant in the context of retention analysis,
at the expense of precision. For example, a 7-day retention count can be
obtained by aggregating over a week of retention data using the union
operation. With SQL primitives, this would instead require recalculating COUNT
DISTINCT over <code>client_id</code>s in the 7-day window.</p>
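<p>A minimal sketch of that union, assuming the predefined HLL functions described below are available and that <code>subsession_start</code> is an ISO-formatted date string:</p>
<pre><code class="language-sql">-- Hedged sketch: approximate the number of clients seen across a 7-day window.
SELECT cardinality(merge(cast(hll AS HLL))) AS clients_7d
FROM retention
WHERE subsession_start &gt;= '2017-07-01'
  AND subsession_start &lt; '2017-07-08'
</code></pre>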
<h4 id="background-and-caveats-16"><a class="header" href="#background-and-caveats-16">Background and Caveats</a></h4>
<ol>
<li>The data starts at 2017-03-06, the <a href="https://wiki.mozilla.org/RapidRelease/Calendar">merge date where Nightly started to
track Firefox 55 in Mozilla-Central</a>. However, there was
not a consistent view into the behavior of first session profiles until the
<a href="datasets/obsolete/retention//datasets/batch_view/new_profile/reference.html"><code>new_profile</code> ping</a>. This means much of the data is inaccurate
before 2017-06-26.</li>
<li>This dataset uses a 4-day reporting latency to aggregate at least 99% of the
data in a given submission date. This figure is derived from the
<a href="https://sql.telemetry.mozilla.org/dashboard/telemetry-health">telemetry-health measurements on submission latency</a>, with
the discussion in <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1407410">Bug 1407410</a>. This latency was reduced in
Firefox 55 with the introduction of the shutdown ping-sender mechanism.</li>
<li>Caution should be taken before adding new columns. Additional attribute
columns will grow the number of rows exponentially.</li>
<li>The number of HLL bits chosen for this dataset is 13. This means the default
size of the HLL object is 2^13 bits or 1KiB. This maintains about a 1% error
on average. See <a href="https://github.com/twitter/algebird/blob/develop/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala#L230-L255">this table from Algebird's HLL implementation</a> for
more details.</li>
</ol>
<h4 id="accessing-the-data-20"><a class="header" href="#accessing-the-data-20">Accessing the Data</a></h4>
<p>The data is primarily available through <a href="https://sql.telemetry.mozilla.org">STMO</a> via
the Presto source. This service has been configured to use predefined HLL
functions.</p>
<p>The column should first be cast to the HLL type. The scalar
<code>cardinality(&lt;hll_column&gt;)</code> function will approximate the number of unique
items per HLL object. The aggregate <code>merge(&lt;hll_column&gt;)</code> function will perform
the set union between all objects in a column.</p>
<p>Example: Cast the count column into the appropriate type.</p>
<pre><code class="language-sql">SELECT cast(hll as HLL) as n_profiles_hll FROM retention
</code></pre>
<p>Count the number of clients seen over all attribute combinations.</p>
<pre><code class="language-sql">SELECT cardinality(cast(hll as HLL)) FROM retention
</code></pre>
<p>Group-by and aggregate client counts over different release channels.</p>
<pre><code class="language-sql">SELECT channel, cardinality(merge(cast(hll AS HLL))
FROM retention
GROUP BY channel
</code></pre>
<p>The HyperLogLog library wrappers are available for use outside of the
configured STMO environment, <a href="https://github.com/mozilla/spark-hyperloglog"><code>spark-hyperloglog</code></a> and
<a href="https://github.com/vitillo/presto-hyperloglog"><code>presto-hyperloglog</code></a>.</p>
<p>Also see the <a href="datasets/obsolete/retention//datasets/obsolete/client_count_daily/reference.html"><code>client_count_daily</code> dataset</a>.</p>
<h1 id="data-reference-29"><a class="header" href="#data-reference-29">Data Reference</a></h1>
<h2 id="example-queries-25"><a class="header" href="#example-queries-25">Example Queries</a></h2>
<p>See the <a href="https://sql.telemetry.mozilla.org/dashboard/firefox-telemetry-retention-dataset-example-usage">Example Usage Dashboard</a> for more usages of datasets of
the same shape.</p>
<h2 id="scheduling-33"><a class="header" href="#scheduling-33">Scheduling</a></h2>
<p>The job is scheduled on Airflow on a daily basis after <code>main_summary</code> is run
for the day. This job requires both <code>mozetl</code> and <code>telemetry-batch-view</code> as
dependencies.</p>
<h2 id="schema-27"><a class="header" href="#schema-27">Schema</a></h2>
<p>As of 2017-10-10, the current version of <code>retention</code> is <code>v1</code> and has a schema
as follows:</p>
<pre><code>root
|-- subsession_start: string (nullable = true)
|-- profile_creation: string (nullable = true)
|-- days_since_creation: long (nullable = true)
|-- channel: string (nullable = true)
|-- app_version: string (nullable = true)
|-- geo: string (nullable = true)
|-- distribution_id: string (nullable = true)
|-- is_funnelcake: boolean (nullable = true)
|-- source: string (nullable = true)
|-- medium: string (nullable = true)
|-- content: string (nullable = true)
|-- sync_usage: string (nullable = true)
|-- is_active: boolean (nullable = true)
|-- hll: binary (nullable = true)
|-- usage_hours: double (nullable = true)
|-- sum_squared_usage_hours: double (nullable = true)
|-- total_uri_count: long (nullable = true)
|-- unique_domains_count: double (nullable = true)
</code></pre>
<h1 id="code-reference-28"><a class="header" href="#code-reference-28">Code Reference</a></h1>
<p>The ETL script for processing the data before aggregation is found in
<a href="https://github.com/mozilla/python_mozetl/blob/ba51f539e5f1218954b7f3536e96f50c57a1b55c/mozetl/engagement/retention/job.py"><code>mozetl.engagement.retention</code></a>. The aggregate job is found in
<a href="https://github.com/mozilla/telemetry-batch-view/blob/9428b1951545dcd7517a3e72c81e7891a6dfa1fa/src/main/scala/com/mozilla/telemetry/views/RetentionView.scala">telemetry-batch-view</a> as the <code>RetentionView</code>.</p>
<p>The <a href="https://github.com/acmiyaguchi/telemetry-airflow/blob/1b4b11d23cdd1191ed2d2be905f116d7c3c67533/jobs/retention.sh">runner script</a> performs all the necessary setup to run on
EMR. This script can be used to perform backfill.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete/retention/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="sync-summary-and-sync-flat-summary-reference"><a class="header" href="#sync-summary-and-sync-flat-summary-reference">Sync Summary and Sync Flat Summary Reference</a></h1>
<ul>
<li><a href="datasets/obsolete/sync_summary/reference.html#introduction">Introduction</a>
<ul>
<li><a href="datasets/obsolete/sync_summary/reference.html#which-dataset-should-i-use">Which dataset should I use?</a></li>
</ul>
</li>
<li><a href="datasets/obsolete/sync_summary/reference.html#data-reference">Data Reference</a>
<ul>
<li><a href="datasets/obsolete/sync_summary/reference.html#a-note-about-user-ids">A note about user IDs</a></li>
<li><a href="datasets/obsolete/sync_summary/reference.html#which-apps-send-sync-telemetry-what-about-fenix">Which apps send sync telemetry? What about Fenix?</a></li>
<li><a href="datasets/obsolete/sync_summary/reference.html#whats-an-engine">What's an engine?</a></li>
<li><a href="datasets/obsolete/sync_summary/reference.html#example-queries">Example Queries</a></li>
<li><a href="datasets/obsolete/sync_summary/reference.html#sampling">Sampling</a></li>
<li><a href="datasets/obsolete/sync_summary/reference.html#scheduling">Scheduling</a></li>
<li><a href="datasets/obsolete/sync_summary/reference.html#sync-summary-schema">Sync Summary Schema</a></li>
<li><a href="datasets/obsolete/sync_summary/reference.html#sync-flat-summary-schema">Sync Flat Summary Schema</a></li>
</ul>
</li>
</ul>
<h1 id="introduction-40"><a class="header" href="#introduction-40">Introduction</a></h1>
<p><em>Note: some of the information in this chapter duplicates the information found on the <a href="https://wiki.mozilla.org/CloudServices/Sync/ReDash">CloudServices/Sync/ReDash wiki page</a>. You can also find more detailed information about the data contained in the sync ping in the <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/sync-ping.html">sync ping documentation</a>.</em></p>
<p><code>sync_summary</code> and <code>sync_flat_summary</code> are the primary datasets that track the health of sync. <code>sync_flat_summary</code> is derived from <code>sync_summary</code> by unpacking/exploding the <code>engines</code> field of the latter, so they ultimately contain the same data (see below).</p>
<h2 id="which-dataset-should-i-use"><a class="header" href="#which-dataset-should-i-use">Which dataset should I use?</a></h2>
<p>Which dataset to use depends on whether you are interested in <em>per-engine</em> sync success or <em>per-sync</em> sync success (see below). If you are interested in whether a sync failed overall, regardless of which engine may have caused the failure, then you can use <code>sync_summary</code>. Otherwise, if you are interested in per-engine data, you should use <code>sync_flat_summary</code>.</p>
<p>If you aren't sure, or just trying to get acquainted, you should probably just use <code>sync_flat_summary</code>.</p>
<h1 id="data-reference-30"><a class="header" href="#data-reference-30">Data Reference</a></h1>
<h2 id="a-note-about-user-ids"><a class="header" href="#a-note-about-user-ids">A note about user IDs</a></h2>
<p>Unlike most other telemetry datasets, these do not contain the profile-level identifier <code>client_id</code>. Because you need to sign up for a <a href="https://www.mozilla.org/en-US/firefox/accounts/">Firefox Account</a> in order to use sync, these datasets instead include an anonymised version of the user's Firefox Account user id <code>uid</code> and an anonymised version of their individual devices' <code>device_id</code>s. Put another way, each <code>uid</code> can have many associated <code>device_id</code>s.</p>
<p><strong>Q:</strong> Why not include <code>client_id</code> in these datasets so that they can be joined on (e.g.) <code>main_summary</code>?</p>
<p><strong>A:</strong> We've had a policy to keep main browser telemetry separate from sync and FxA telemetry. This is in part because FxA <code>uid</code>s are ultimately associated with email addresses in the FxA database, and thus a breach of that database in combination with access to telemetry could in theory de-anonymise client-side browser metrics.</p>
<h2 id="which-apps-send-sync-telemetry-what-about-fenix"><a class="header" href="#which-apps-send-sync-telemetry-what-about-fenix">Which apps send sync telemetry? What about Fenix?</a></h2>
<p>Currently, Firefox for desktop, Firefox for iOS and Firefox for Android (fennec) all have sync implemented, and they all send sync telemetry. Though there are some differences in the telemetry that each application sends, it all ends up in the <code>sync_summary</code> and <code>sync_flat_summary</code> datasets.</p>
<p>Starting with Fenix, however, sync telemetry will start to be sent through <a href="https://github.com/mozilla-mobile/android-components/tree/master/components/service/glean">glean</a>. This means that, in all likelihood, Fenix sync telemetry will initially be segregated from existing sync telemetry (one reason is that current sync telemetry is on AWS while glean pings are ingested to GCP).</p>
<h2 id="whats-an-engine"><a class="header" href="#whats-an-engine">What's an engine?</a></h2>
<p>Firefox syncs many different types of browser data and (generally speaking) each of these data types is synced by its own engine. When the app triggers a &quot;sync&quot;, each engine makes its own determination of what needs to be synced (if anything). Many syncs can happen in a day (dozens or more on desktop, usually fewer on mobile). Telemetry about each sync is logged, and each <a href="https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/data/sync-ping.html">sync ping</a> (sent once a day, and whenever the user logs in or out of sync) contains information about multiple syncs. The Scala code responsible for creating the <code>sync_summary</code> dataset unpacks each sync ping into one row per sync. The resulting <code>engines</code> field is an array of &quot;engine records&quot;: data about how each engine performed during that sync. <code>sync_flat_summary</code> further unpacks/explodes the <code>engines</code> field, creating a dataset with one row per engine record.</p>
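<p>As a rough illustration of that explosion, here is a hedged Presto sketch that unpacks <code>engines</code> from <code>sync_summary</code> by hand; the alias list must name the engine struct fields in the order shown in the schema below, so double-check it before running.</p>
<pre><code class="language-sql">-- Hedged sketch: one output row per engine record, similar in spirit to sync_flat_summary.
SELECT uid,
       device_id,
       e_name   AS engine_name,
       e_status AS engine_status,
       e_took   AS engine_took
FROM telemetry.sync_summary
CROSS JOIN UNNEST(engines)
  AS t (e_name, e_took, e_status, e_failure_reason, e_incoming, e_outgoing, e_validation)
LIMIT 100
</code></pre>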
<p>Existing engines (<code>engine_name</code> in <code>sync_flat_summary</code>) are listed below with brief descriptions in cases where their name isn't transparent.</p>
<p>Note that not every device syncs each of these engines. They can be disabled individually and some are off by default.</p>
<ul>
<li><code>addons</code></li>
<li><code>addresses</code> mailing addresses e.g. for e-commerce; part of form autofill.</li>
<li><code>bookmarks</code></li>
<li><code>clients</code> non-user-facing list of the sync account's associated devices</li>
<li><code>creditcards</code> this used to be nightly only but was recently removed entirely</li>
<li><code>extension-storage</code> WebExtension storage, in support of the <code>storage.sync</code> WebExtension API.</li>
<li><code>history</code> browsing history.</li>
<li><code>passwords</code></li>
<li><code>forms</code> saved values in web forms</li>
<li><code>prefs</code> not all prefs are synced</li>
<li><code>tabs</code> note that this is not the same as the &quot;send tab&quot; feature; this is the engine that syncs the tabs you have open across your devices (used to populate the synced tabs sidebar). For data on the send-tab feature, use the <code>sync_events</code> dataset.</li>
</ul>
<h2 id="example-queries-26"><a class="header" href="#example-queries-26">Example Queries</a></h2>
<p>See <a href="https://sql.telemetry.mozilla.org/dashboard/sync-leif-status-dashboard-wip">this dashboard</a> to get a general sense of what this dataset is typically used for.</p>
<p>Here's an example of a query that will calculate the failure and success rates for a subset of engines per day.</p>
<pre><code class="language-sql">WITH
counts AS (
SELECT
submission_date_s3 AS day,
engine_name AS engine,
COUNT(*) AS total,
COUNT(CASE WHEN engine_status IS NOT NULL THEN true ELSE NULL END) AS count_errors,
/* note that `engine_status` is null on sync success. */
COUNT(CASE WHEN engine_status IS NULL THEN true ELSE NULL END) AS count_success
FROM telemetry.sync_flat_summary
WHERE engine_name IN ('bookmarks','history','tabs','addons','addresses','passwords','prefs')
AND cast(submission_date_s3 AS integer) &gt;= 20190101
GROUP BY 1,2
ORDER BY 1
),
rates AS (
SELECT
day,
engine,
total,
count_errors,
count_success,
CAST(count_errors AS double) / CAST(total AS double) * 100 AS error_rate,
CAST(count_success AS double) / CAST(total AS double) * 100 AS success_rate
FROM counts
ORDER BY 1
)
SELECT * FROM rates
</code></pre>
<h2 id="sampling-6"><a class="header" href="#sampling-6">Sampling</a></h2>
<p>Sadly, these datasets are not sampled. It should be possible to derive a <code>sample_id</code> on <code>uid</code>, however. Someone should do that because querying these datasets for long time horizons is very expensive.</p>
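<p>Until such a column exists, a hedged sketch of one way to approximate it in Presto is shown below; the hash-and-modulus choice is arbitrary, and any stable bucketing of <code>uid</code> would work.</p>
<pre><code class="language-sql">-- Hedged sketch: keep roughly 1% of Firefox Accounts by hashing the uid into 100 buckets.
SELECT *
FROM telemetry.sync_flat_summary
WHERE abs(from_big_endian_64(xxhash64(to_utf8(uid)))) % 100 = 42
  AND submission_date_s3 &gt;= '20190101'
</code></pre>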
<h2 id="scheduling-34"><a class="header" href="#scheduling-34">Scheduling</a></h2>
<p>This dataset was updated daily, shortly after midnight UTC.
The job was scheduled on <a href="https://github.com/mozilla/telemetry-airflow">Airflow</a>.
The DAG was <a href="https://github.com/mozilla/telemetry-airflow/blob/27d34a73db02131a39f469f3950c1da747bc8a95/dags/sync_view.py">here</a>.</p>
<h2 id="sync-summary-schema"><a class="header" href="#sync-summary-schema">Sync Summary Schema</a></h2>
<pre><code>root
|-- app_build_id: string (nullable = true)
|-- app_display_version: string (nullable = true)
|-- app_name: string (nullable = true)
|-- app_version: string (nullable = true)
|-- app_channel: string (nullable = true)
|-- uid: string
|-- device_id: string (nullable = true)
|-- when: integer
|-- took: integer
|-- why: string (nullable = true)
|-- failure_reason: struct (nullable = true)
| |-- name: string
| |-- value: string (nullable = true)
|-- status: struct (nullable = true)
| |-- sync: string (nullable = true)
| |-- status: string (nullable = true)
|-- devices: array (nullable = true)
| |-- element: struct (containsNull = false)
| | |-- id: string
| | |-- os: string
| | |-- version: string
|-- engines: array (nullable = true)
| |-- element: struct (containsNull = false)
| | |-- name: string
| | |-- took: integer
| | |-- status: string (nullable = true)
| | |-- failure_reason: struct (nullable = true)
| | | |-- name: string
| | | |-- value: string (nullable = true)
| | |-- incoming: struct (nullable = true)
| | | |-- applied: integer
| | | |-- failed: integer
| | | |-- new_failed: integer
| | | |-- reconciled: integer
| | |-- outgoing: array (nullable = true)
| | | |-- element: struct (containsNull = false)
| | | | |-- sent: integer
| | | | |-- failed: integer
| | |-- validation: struct (containsNull = false)
| | | |-- version: integer
| | | |-- checked: integer
| | | |-- took: integer
| | | |-- failure_reason: struct (nullable = true)
| | | | |-- name: string
| | | | |-- value: string (nullable = true)
| | | |-- problems: array (nullable = true)
| | | | |-- element: struct (containsNull = false)
| | | | | |-- name: string
| | | | | |-- count: integer
</code></pre>
<h2 id="sync-flat-summary-schema"><a class="header" href="#sync-flat-summary-schema">Sync Flat Summary Schema</a></h2>
<pre><code>root
|-- app_build_id: string (nullable = true)
|-- app_display_version: string (nullable = true)
|-- app_name: string (nullable = true)
|-- app_version: string (nullable = true)
|-- app_channel: string (nullable = true)
|-- os: string
|-- os_version: string
|-- os_locale: string
|-- uid: string
|-- device_id: string (nullable = true)
|-- when: integer
|-- took: integer
|-- failure_reason: struct (nullable = true)
| |-- name: string
| |-- value: string (nullable = true)
|-- status: struct (nullable = true)
| |-- sync: string (nullable = true)
| |-- status: string (nullable = true)
|-- why: string (nullable = true)
|-- devices: array (nullable = true)
| |-- element: struct (containsNull = false)
| | |-- id: string
| | |-- os: string
| | |-- version: string
|-- sync_id: string
|-- sync_day: string
|-- engine_name: string
|-- engine_took: integer
|-- engine_status: string (nullable = true)
|-- engine_failure_reason: struct (nullable = true)
| |-- name: string
| |-- value: string (nullable = true)
|-- engine_incoming_applied: integer (nullable = true)
|-- engine_incoming_failed: integer (nullable = true)
|-- engine_incoming_new_failed: integer (nullable = true)
|-- engine_incoming_reconciled: integer (nullable = true)
|-- engine_outgoing_batch_count: integer (nullable = true)
|-- engine_outgoing_batch_total_sent: integer (nullable = true)
|-- engine_outgoing_batch_total_failed: integer (nullable = true)
|-- submission_date_s3: string
</code></pre>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/datasets/obsolete/sync_summary/reference.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="contributing"><a class="header" href="#contributing">Contributing</a></h1>
<p>Documentation is critical to making a usable data platform.
In surveys of users of the Mozilla Data Platform,
the most common complaint has been lack of documentation.
It is therefore important to expand and improve the documentation as often as possible.</p>
<ul>
<li><a href="contributing/index.html#bug-reports">Bug reports</a></li>
<li><a href="contributing/index.html#fixing-minor-problems">Fixing minor problems</a></li>
<li><a href="contributing/index.html#building-the-documentation">Building the Documentation</a></li>
<li><a href="contributing/index.html#adding-a-new-article">Adding a new article</a></li>
<li><a href="contributing/index.html#review">Review</a></li>
<li><a href="contributing/index.html#supported-plugins">Supported Plugins</a>
<ul>
<li><a href="contributing/index.html#table-of-contents">Table of contents</a></li>
<li><a href="contributing/index.html#mermaid">Mermaid</a></li>
</ul>
</li>
<li><a href="contributing/index.html#publishing">Publishing</a></li>
</ul>
<h2 id="bug-reports"><a class="header" href="#bug-reports">Bug reports</a></h2>
<p>If you see an error in the documentation or want to extend a chapter,
<a href="https://bugzilla.mozilla.org/enter_bug.cgi?assigned_to=nobody%40mozilla.org&amp;bug_file_loc=http%3A%2F%2F&amp;bug_ignored=0&amp;bug_severity=normal&amp;bug_status=NEW&amp;cf_fx_iteration=---&amp;cf_fx_points=---&amp;component=Documentation%20and%20Knowledge%20Repo%20%28RTMO%29&amp;contenttypemethod=autodetect&amp;contenttypeselection=text%2Fplain&amp;defined_groups=1&amp;flag_type-4=X&amp;flag_type-607=X&amp;flag_type-800=X&amp;flag_type-803=X&amp;flag_type-916=X&amp;form_name=enter_bug&amp;maketemplate=Remember%20values%20as%20bookmarkable%20template&amp;op_sys=Linux&amp;priority=--&amp;product=Data%20Platform%20and%20Tools&amp;rep_platform=x86_64&amp;target_milestone=---&amp;version=unspecified">file a bug</a>.</p>
<h2 id="fixing-minor-problems"><a class="header" href="#fixing-minor-problems">Fixing minor problems</a></h2>
<p>For smaller issues (for example, a typo or minor inaccuracy), it is not necessary to file a bug or even
check out the source.
Instead, use the <code>Edit on GitHub</code> button on the bottom of any page, make your changes, and file a pull request entirely from the GitHub interface.</p>
<h2 id="building-the-documentation"><a class="header" href="#building-the-documentation">Building the Documentation</a></h2>
<p>This documentation is stored as <a href="https://commonmark.org/help/">CommonMark Markdown</a> in the
<a href="https://github.com/mozilla/data-docs"><code>data-docs</code> repository</a> on GitHub.
To build a local copy, fork the repository and check out your copy. Then, <a href="https://github.com/mozilla/data-docs/blob/master/README.md#building-the-documentation">see the README</a> for up-to-date information on how to build the documentation.</p>
<h2 id="adding-a-new-article"><a class="header" href="#adding-a-new-article">Adding a new article</a></h2>
<p>You should read the <a href="contributing/./style_guide.html">style guide</a> before adding a new article: it will help you write material that is more useful and fits cohesively into the rest of the documentation.</p>
<p>Be sure to link to your new article from <code>SUMMARY.md</code>, or mdBook will not render the file.</p>
<p>The structure of the repository is outlined in <a href="contributing/./structure.html">this article</a>.</p>
<p>This documentation is under active development,
so we may already be working on the documentation you need.
Take a look at
<a href="https://bugzilla.mozilla.org/buglist.cgi?product=Data%20Platform%20and%20Tools&amp;component=Documentation%20and%20Knowledge%20Repo%20%28RTMO%29&amp;resolution=---">this bug component</a>
to check.</p>
<h2 id="review"><a class="header" href="#review">Review</a></h2>
<p>Once you're happy with your contribution, open a pull request (PR). Give your PR a meaningful commit message
(see <a href="https://chris.beams.io/posts/git-commit/">this article on commit message guidelines</a> for some suggestions).
If there is a bug associated with your documentation, title it in the form of <code>Bug 1234 - &lt;descriptive one-liner&gt;</code> - that way, the <a href="https://github.com/mozilla/github-bugzilla-pr-linker">Bugzilla PR linker</a> will pick up the PR and attach it to the bug.</p>
<p>After filing your PR, assign the appropriate person for review (GitHub will usually provide some suggestions), assuming you have permissions to do so yourself.
If you do not have permission to assign a reviewer, see <a href="contributing/../concepts/getting_help.html">getting help</a>.</p>
<h2 id="supported-plugins"><a class="header" href="#supported-plugins">Supported Plugins</a></h2>
<h3 id="table-of-contents-23"><a class="header" href="#table-of-contents-23">Table of contents</a></h3>
<p>You can insert a table of contents in any article by using the <code>toc</code> shorthand. For example:</p>
<pre><code># My fine title
This article describes how to perform action X.
&lt;!-- toc --&gt;
## Section 1
...
## Section 2
</code></pre>
<p>For an example of what the rendered table of contents looks like, see the beginning of this article.</p>
<h3 id="mermaid"><a class="header" href="#mermaid">Mermaid</a></h3>
<p>You can use <a href="https://mermaidjs.github.io/"><code>mermaid.js</code></a> diagrams in code blocks. For example:</p>
<pre><code class="language-md">```mermaid
graph LR
you --&gt;|write|docs
docs --&gt; profit!
```
</code></pre>
<p>... is rendered as:</p>
<pre class="mermaid">graph LR
you --&gt;|write|docs
docs --&gt; profit!
</pre>
<h2 id="publishing"><a class="header" href="#publishing">Publishing</a></h2>
<p>The documentation is hosted on <a href="https://pages.github.com/">GitHub Pages</a>.</p>
<p>Updates to the documentation are automatically published to
<a href="https://docs.telemetry.mozilla.org">docs.telemetry.mozilla.org</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/contributing/index.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="style-guide"><a class="header" href="#style-guide">Style Guide</a></h1>
<p>These are some general style guidelines for articles which appear on <code>docs.telemetry.mozilla.org</code> (DTMO). Reading these guidelines can help you write content that is more accessible and easier to understand.</p>
<ul>
<li><a href="contributing/style_guide.html#audience">Audience</a></li>
<li><a href="contributing/style_guide.html#what-to-write">What to write</a></li>
<li><a href="contributing/style_guide.html#what-not-to-write">What not to write</a>
<ul>
<li><a href="contributing/style_guide.html#implementation-specific-articles">Implementation-specific articles</a></li>
<li><a href="contributing/style_guide.html#lists-of-links">Lists of links</a></li>
</ul>
</li>
<li><a href="contributing/style_guide.html#general-guidelines">General guidelines</a></li>
<li><a href="contributing/style_guide.html#writing-style">Writing style</a></li>
<li><a href="contributing/style_guide.html#colophon">Colophon</a></li>
</ul>
<h2 id="audience"><a class="header" href="#audience">Audience</a></h2>
<p><em>Data practitioners</em> at Mozilla represent the primary audience for DTMO. A &quot;data practitioner&quot; is someone who wants to inform themselves or others using Mozilla-collected data. Here are some real-world examples of this persona:</p>
<ul>
<li>A data scientist performing an experiment analysis</li>
<li>An analyst producing a report on the effectiveness of a recent marketing campaign</li>
<li>A Firefox engineer trying to measure the performance of a new feature</li>
<li>A technical product manager trying to understand the characteristics of a particular user segment</li>
<li>A quality assurance engineer trying to understand the severity and frequency of a new Firefox crash</li>
</ul>
<p>In general, you can assume that readers have at least some technical knowledge.
Different articles on DTMO may have different target audiences: when you write a new article, you should consider who you are writing it for and adjust your content appropriately.
For example, a new product manager may require more careful hand-holding (and links to relevant concept or reference material) than a data scientist with many years of experience at Mozilla.</p>
<p>Note that &quot;data engineers&quot; (the maintainers of the Mozilla data platform and tools) are <em>not</em> the target audience for DTMO, though they may find some resources here helpful in the course of their work.
If something is <em>only</em> of interest to a data engineer, it probably belongs elsewhere: see the note below on <a href="contributing/style_guide.html#implementation-specific-articles">implementation-specific articles</a>.</p>
<h2 id="what-to-write"><a class="header" href="#what-to-write">What to write</a></h2>
<p>There are three different types of documentation that are useful as part of a site like DTMO:</p>
<ul>
<li>Introductory material: Material intended to help people get their bearings with Mozilla's telemetry system.
A set of these articles form the <a href="contributing/../introduction/index.html">Introduction</a> section on this site.</li>
<li>Tutorials &amp; Cookbooks: Instructions on how to perform specific tasks using Mozilla's data platform. The focus here is on how to do things, rather than on what they are.</li>
<li>Reference: Reference material on either <a href="contributing/../datasets/reference.html">datasets</a> or the <a href="contributing/../reference/index.html">data platform itself</a>. The focus is on describing how things work or what they do, rather than how to use them.</li>
</ul>
<p>In general, the most useful documentation for newcomers is usually a cookbook or tutorial as <a href="https://stevelosh.com/blog/2013/09/teach-dont-tell/">they often don't know where to begin</a>. For advanced users, reference material may be more useful.</p>
<h2 id="what-not-to-write"><a class="header" href="#what-not-to-write">What <em>not</em> to write</a></h2>
<p>There are a few types of documentation that are less appropriate for a general-purpose reference like DTMO.</p>
<h3 id="implementation-specific-articles"><a class="header" href="#implementation-specific-articles">Implementation-specific articles</a></h3>
<p>Any articles that are specific to the implementation of particular data systems or tools should be published alongside the system or tool itself. Examples of this may include:</p>
<ul>
<li>Usage guides for command-line tools</li>
<li>In-depth documentation on architecture and design choices (beyond a general overview)</li>
<li>A general usage guide for a <a href="contributing/../introduction/tools.html">specific data tool</a></li>
</ul>
<p>In the past, this type of documentation has gone out of date quickly as people update the implementation while forgetting to update what has been written here.
Vast amounts of implementation detail can also be overwhelming to anyone who just wants to get an answer to a data-related question.</p>
<p>That said, it can sometimes be useful to provide a general overview of a topic on DTMO while saving the details for site-specific documentation.
A good example of this is the <a href="https://mozilla.github.io/gcp-ingestion/">gcp-ingestion</a> documentation: while we maintain <a href="contributing/../concepts/pipeline/gcp_data_pipeline.html">a high-level description of the data pipeline here</a>, details on the Beam-specific implementation are stored alongside the source on GitHub.</p>
<h3 id="lists-of-links"><a class="header" href="#lists-of-links">Lists of links</a></h3>
<p>Articles which simply link out to other resources are of limited value and tend to go out of date quickly. Instead, consider your motivation for producing the list and think about what your <a href="contributing/style_guide.html#audience">intended audience</a> might need. Concept, reference, or tutorial documentation need not be long to be helpful.</p>
<p>Of course, linking to other resources as part of other documentation is always okay.</p>
<h2 id="general-guidelines"><a class="header" href="#general-guidelines">General guidelines</a></h2>
<ul>
<li>Articles should be written in Markdown:
mdBook uses the <a href="https://commonmark.org/help/">CommonMark dialect</a>
as implemented by <a href="https://github.com/raphlinus/pulldown-cmark"><code>pulldown-cmark</code></a>,
which supports certain extensions including GitHub-flavored tables.</li>
<li>Limit lines to <strong>100 characters</strong> where possible.
Try to split lines at the end of sentences,
or use <a href="http://rhodesmill.org/brandon/2012/one-sentence-per-line/">Semantic Line Breaks</a>.
This makes it easier to reorganize your thoughts later.</li>
<li>This documentation is almost always read digitally.
Keep in mind that people read digital content much differently than other media.
Specifically, readers are going to skim your writing,
so make it easy to identify important information.
<ul>
<li>Use <strong>visual markup</strong> like <strong>bold text</strong>, <code>code blocks</code>, and section headers.</li>
<li>Avoid long paragraphs: short paragraphs that each describe one concept make finding important information easier.</li>
<li>When writing longer articles with many sections, use a <a href="contributing/./index.html#table-of-contents">table of contents</a> to help people quickly navigate to a section of interest.</li>
</ul>
</li>
<li>Use self-explanatory link descriptions.
<ul>
<li><strong>Do not</strong> use <code>here</code> or <code>this</code> as the description for an external link. It is repetitive across a site and does not explain where it leads to.</li>
<li>For links to queries on Redash, use <code>STMO#&lt;query id&gt;</code>. This keeps links consistent across the documentation and provides a hint about how old or new a query might be (newer queries have higher numeric IDs).</li>
</ul>
</li>
</ul>
<h2 id="writing-style"><a class="header" href="#writing-style">Writing style</a></h2>
<p>The following is a distillation of common best practices for technical writing in a resource like DTMO. Following these guidelines helps give our documentation a consistent voice and makes it easier to read:</p>
<ul>
<li>In general: use a friendly, conversational tone. This makes the documentation more approachable.</li>
<li>Avoid specifying particular people or teams unless absolutely necessary: change is constant at Mozilla, and the person or people who maintains a system today is not necessarily the same as yesterday.</li>
<li>If possible, use &quot;you&quot; when describing how to do something. Avoid use of first person (for example: &quot;I&quot;, &quot;we&quot;, &quot;our&quot;) as this emphasizes the writer rather than the reader and it is often unclear who &quot;I&quot;, &quot;we&quot; or &quot;our&quot; actually refer to.</li>
<li>Avoid unnecessary formalities like &quot;please&quot; or &quot;thank you&quot;.</li>
<li>Where it makes sense, use present tense (as opposed to future tense). For example, &quot;You need to perform the following tasks&quot; is preferable to &quot;You will need to perform the following tasks&quot;.</li>
<li>Where possible, avoid the passive voice: active voice sentences, which identify the agent of the action as the subject of the sentence, are generally clearer and less verbose. For example, &quot;Press OK to confirm changes&quot; is preferable to &quot;The system will confirm changes when you press OK&quot;. You can use a tool like <a href="https://github.com/btford/write-good">write-good</a> to identify uses of passive voice in your own work.</li>
</ul>
<p>For much more helpful advice on technical writing, you may wish to review the <a href="https://docs.openstack.org/doc-contrib-guide/writing-style/general-writing-guidelines.html">OpenStack General Writing Guidelines</a>, which inspired some of the above.</p>
<h2 id="colophon"><a class="header" href="#colophon">Colophon</a></h2>
<p>You can find more context for these guidelines in
<a href="http://blog.harterrt.com/lit-review.html">this literature review</a> and <a href="https://wrla.ch/blog/2020/05/a-principled-reorganization-of-docs-telemetry-mozilla-org/">this follow-up on organization and audience</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/contributing/style_guide.md">Edit this page on GitHub.</a></footer><div style="break-before: page; page-break-before: always;"></div><h1 id="documentation-structure"><a class="header" href="#documentation-structure">Documentation Structure</a></h1>
<p>The directory structure should feel comfortable to anyone who is familiar with the data platform:</p>
<pre><code>.
|--src
|--datasets - contains dataset level documentation
|--tools - contains tool level documentation
|--concepts - contains tutorials meant to introduce a new concept to the reader
|--cookbooks - focused code examples for reference
</code></pre>
<p>This documentation is meant to take the reader from beginner to expert.</p>
<ul>
<li>Getting Started: Introduces concepts and provides information on how to perform and complete a simple analysis, so the user understands the amount of work involved and what the data platform feels like. Primarily intended for people new to Mozilla's data platform.</li>
<li>Tutorials &amp; Cookbooks: Guides on how to perform specific tasks. Intended for all audiences.</li>
<li>Reference material: In-depth reference material on metrics and the data platform. Intended primarily for more advanced users.</li>
</ul>
<p>This document's structure is heavily influenced by
<a href="https://docs.djangoproject.com/en/1.11/internals/contributing/writing-documentation/">Django's Documentation Style Guide</a>.</p>
<footer id="open-on-gh">Found a bug? <a href="https://github.com/mozilla/data-docs/edit/main/src/contributing/structure.md">Edit this page on GitHub.</a></footer>
</main>
<nav class="nav-wrapper" aria-label="Page navigation">
<!-- Mobile navigation buttons -->
<div style="clear: both"></div>
</nav>
</div>
</div>
<nav class="nav-wide-wrapper" aria-label="Page navigation">
</nav>
</div>
<!-- Google Analytics Tag -->
<script>
var localAddrs = ["localhost", "127.0.0.1", ""];
// make sure we don't activate google analytics if the developer is
// inspecting the book locally...
if (localAddrs.indexOf(document.location.hostname) === -1) {
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-104326577-1', 'auto');
ga('send', 'pageview');
}
</script>
<script>
window.playground_copyable = true;
</script>
<script src="elasticlunr.min.js"></script>
<script src="mark.min.js"></script>
<script src="searcher.js"></script>
<script src="clipboard.min.js"></script>
<script src="highlight.js"></script>
<script src="book.js"></script>
<!-- Custom JS scripts -->
<script src="mermaid.min.js"></script>
<script src="mermaid-init.js"></script>
<script>
window.addEventListener('load', function() {
MathJax.Hub.Register.StartupHook('End', function() {
window.setTimeout(window.print, 100);
});
});
</script>
</div>
</body>
</html>