From fad5b77715db84ee267c09d7c1279feb6e04bdc8 Mon Sep 17 00:00:00 2001 From: Mauro Doglio Date: Fri, 24 Feb 2017 18:03:20 +0000 Subject: [PATCH] Update usage of Dataset.where in Telemetry Hello World --- tutorials/telemetry_hello_world.kp/REVISION | 2 +- .../telemetry_hello_world.kp/knowledge.md | 85 ++----------------- .../orig_src/Telemetry Hello World.ipynb | 12 +-- 3 files changed, 13 insertions(+), 86 deletions(-) diff --git a/tutorials/telemetry_hello_world.kp/REVISION b/tutorials/telemetry_hello_world.kp/REVISION index 56a6051..d8263ee 100644 --- a/tutorials/telemetry_hello_world.kp/REVISION +++ b/tutorials/telemetry_hello_world.kp/REVISION @@ -1 +1 @@ -1 \ No newline at end of file +2 \ No newline at end of file diff --git a/tutorials/telemetry_hello_world.kp/knowledge.md b/tutorials/telemetry_hello_world.kp/knowledge.md index 2426671..c658340 100644 --- a/tutorials/telemetry_hello_world.kp/knowledge.md +++ b/tutorials/telemetry_hello_world.kp/knowledge.md @@ -8,9 +8,8 @@ tags: - telemetry - spark created_at: 2016-03-10 00:00:00 -updated_at: 2016-11-29 15:07:12.387854 +updated_at: 2017-02-24 18:00:25.791035 tldr: Brief introduction to Spark and Telemetry in Python -thumbnail: images/output_23_0.png --- ### Telemetry Hello World @@ -30,9 +29,6 @@ from moztelemetry.dataset import Dataset %matplotlib inline ``` - Unable to parse whitelist (/home/hadoop/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable. - - ### Basics The goal of this example is to plot the startup distribution for each OS. Let's see how many parallel workers we have at our disposal: @@ -41,14 +37,6 @@ The goal of this example is to plot the startup distribution for each OS. Let's ```python sc.defaultParallelism ``` - - - - - 32 - - - Let's fetch 10% of Telemetry submissions for a given submission date... @@ -56,27 +44,12 @@ Let's fetch 10% of Telemetry submissions for a given submission date... Dataset.from_source("telemetry").schema ``` - - - - [u'submissionDate', - u'sourceName', - u'sourceVersion', - u'docType', - u'appName', - u'appUpdateChannel', - u'appVersion', - u'appBuildId'] - - - - ```python -pings = Dataset.from_source("telemetry") \ - .where(docType='main') \ - .where(submissionDate="20161101") \ - .where(appUpdateChannel="nightly") \ - .records(sc, sample=0.1) +pings = Dataset.from_source("telemetry").where( + docType='main', + submissionDate="20161101", + appUpdateChannel="nightly" +).records(sc, sample=0.1) ``` ... and extract only the attributes we need from the Telemetry submissions: @@ -110,14 +83,6 @@ How many pings are we looking at? ```python cached.count() ``` - - - - - 7132 - - - Let's group the startup timings by OS: @@ -134,11 +99,6 @@ frame.boxplot(return_type="axes") plt.ylabel("log10(firstPaint)") plt.show() ``` - - -![png](images/output_23_0.png) - - You can also create interactive plots with [plotly](https://plot.ly/): @@ -150,14 +110,6 @@ plt.ylabel("count") plt.xlabel("log10(firstPaint)") py.iplot_mpl(fig, strip_style=True) ``` - - - - - - - - ### Histograms Let's extract a histogram from the submissions: @@ -187,19 +139,6 @@ def aggregate_arrays(xs, ys): aggregate = histograms.map(lambda p: p["payload/histograms/GC_MARK_MS"]).reduce(aggregate_arrays) aggregate.plot(kind="bar", figsize=(15, 7)) ``` - - - - - - - - - - -![png](images/output_31_1.png) - - Keyed histograms follow a similar pattern. To extract a keyed histogram for which we know the key/label we are interested in: @@ -217,18 +156,6 @@ keys = keys.distinct().collect() ```python keys[:5] ``` - - - - - [u'firefox@zenmate.com', - u'jid1-f3mYMbCpz2AZYl@jetpack', - u'jid0-SQnwtgW1b8BsMB5PLV5WScEDWOjw@jetpack', - u'light_plugin_ACF0E80077C511E59DED005056C00008@kaspersky.com', - u'netvideohunter@netvideohunter.com'] - - - Retrieve the histograms for a set of labels: diff --git a/tutorials/telemetry_hello_world.kp/orig_src/Telemetry Hello World.ipynb b/tutorials/telemetry_hello_world.kp/orig_src/Telemetry Hello World.ipynb index b42a3a8..6d45dd5 100644 --- a/tutorials/telemetry_hello_world.kp/orig_src/Telemetry Hello World.ipynb +++ b/tutorials/telemetry_hello_world.kp/orig_src/Telemetry Hello World.ipynb @@ -106,11 +106,11 @@ }, "outputs": [], "source": [ - "pings = Dataset.from_source(\"telemetry\") \\\n", - " .where(docType='main') \\\n", - " .where(submissionDate=\"20161101\") \\\n", - " .where(appUpdateChannel=\"nightly\") \\\n", - " .records(sc, sample=0.1)" + "pings = Dataset.from_source(\"telemetry\").where(\n", + " docType='main',\n", + " submissionDate=\"20161101\",\n", + " appUpdateChannel=\"nightly\"\n", + ").records(sc, sample=0.1)" ] }, { @@ -419,7 +419,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 2.0 }, "file_extension": ".py", "mimetype": "text/x-python",