Add converted static html and docere report.json files

This commit is contained in:
Jeff Klukas 2018-10-16 13:55:08 -04:00
Родитель 92bd6284d4
Коммит fcf64bce64
120 изменённых файлов: 73650 добавлений и 0 удалений

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,14 @@
{
"title": "Query AMO with Add-on GUID",
"authors": [
"Ben Miroglio"
],
"tags": [
"AMO",
"add-ons",
"firefox-desktop"
],
"publish_date": "2017-01-09",
"updated_at": "2017-01-09",
"tldr": "Get metadata for an add-on through AMO given its GUID"
}

Просмотреть файл

@ -0,0 +1,623 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
/* Loading spinner: pinned to the middle of the viewport. top/left put the
   box's corner at the centre point and the negative margins pull it back by
   half of its own size so the box itself is centred. */
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
/* Font size applied to every element carrying the .table class. */
.table {
font-size: 14px;
}
/* Cap the width of elements carrying the .modal-content class. */
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>addon_aggregates derived dataset script</h1>
<span class="authors"><a href="/feed?authors=Ben+Miroglio">Ben Miroglio</a></span>
<span class="date_created">February 08, 2017</span>
<span class="date_updated">(Last Updated: May 03, 2017)</span>
<span class="tldr"><p>script to be run daily that constructs the addon_aggregates table in re:dash</p></span>
</div>
<h1 id="add-ons-2017-okr-data-collection">Add-ons 2017 OKR Data Collection</h1>
<p>Some OKRs for 2017 can be feasibly collected via the <code>addons</code> and <code>main_summary</code> tables. These tables are huge and aren't appropriate to query directly via re:dash. This script condenses these tables so that the result contains the least data possible to track the following OKRs:</p>
<ul>
<li><strong>OKR 1: Increase number of users who self-install an Add-on by 5%</strong></li>
<li><strong>OKR 2: Increase average number of add-ons per profile by 3%</strong></li>
<li><strong>OKR 3: Increase number of new Firefox users who install an add-on in first 14 days by 25%</strong></li>
</ul>
<p>These OKRs, in addition to other add-on metrics, are tracked via the <a href="https://sql.telemetry.mozilla.org/dashboard/add-on-okrs_1#edit_dashboard_dialog">Add-on OKRs Dashboard</a> in re:dash.</p>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">__future__</span> <span class="kn">import</span> <span class="n">division</span>
<span class="kn">import</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">as</span> <span class="nn">fun</span>
<span class="kn">import</span> <span class="nn">pyspark.sql.types</span> <span class="kn">as</span> <span class="nn">st</span>
<span class="kn">import</span> <span class="nn">math</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">datetime</span> <span class="kn">as</span> <span class="nn">dt</span>
<span class="n">sc</span><span class="o">.</span><span class="n">setLogLevel</span><span class="p">(</span><span class="s2">"INFO"</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">optimize_repartition</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">record_size</span><span class="p">,</span> <span class="n">partition_size</span><span class="o">=</span><span class="mi">280</span><span class="p">):</span>
<span class="sd">'''</span>
<span class="sd"> Repartitions a spark DataFrame &lt;df&gt; so that each partition is </span>
<span class="sd"> ~ &lt;partition_size&gt;MB, defaulting to 280MB. record_size must be </span>
<span class="sd"> estimated beforehand--i.e. write the dataframe to s3, get the size </span>
<span class="sd"> in bytes and divide by df.count(). </span>
<span class="sd"> Returns repartitioned dataframe if a repartition is necessary.</span>
<span class="sd"> '''</span>
<span class="n">total_records</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="k">print</span> <span class="s2">"-- Found {} records"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">total_records</span><span class="p">),</span>
<span class="c1">#convert megabytes to bytes</span>
<span class="n">partition_size</span> <span class="o">*=</span> <span class="mi">1000000</span>
<span class="n">records_per_partition</span> <span class="o">=</span> <span class="n">partition_size</span> <span class="o">/</span> <span class="n">record_size</span>
<span class="n">num_partitions</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">math</span><span class="o">.</span><span class="n">ceil</span><span class="p">(</span><span class="n">total_records</span> <span class="o">/</span> <span class="n">records_per_partition</span><span class="p">))</span>
<span class="k">if</span> <span class="n">num_partitions</span> <span class="o">!=</span> <span class="n">df</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">():</span>
<span class="k">print</span> <span class="s2">"-- Repartitioning with {} partitions"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">num_partitions</span><span class="p">)</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="n">num_partitions</span><span class="p">)</span>
<span class="k">return</span> <span class="n">df</span>
<span class="k">def</span> <span class="nf">get_env_date</span><span class="p">():</span>
<span class="sd">'''</span>
<span class="sd"> Returns environment date if it exists.</span>
<span class="sd"> otherwise returns yesterday's date</span>
<span class="sd"> '''</span>
<span class="n">yesterday</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">utcnow</span><span class="p">()</span> <span class="o">-</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">),</span> <span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="k">return</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'date'</span><span class="p">,</span> <span class="n">yesterday</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">get_dest</span><span class="p">(</span><span class="n">bucket</span><span class="p">,</span> <span class="n">prefix</span><span class="p">):</span>
<span class="sd">'''</span>
<span class="sd"> Uses environment bucket if it exists.</span>
<span class="sd"> Otherwise uses the bucket passed as a parameter</span>
<span class="sd"> '''</span>
<span class="n">bucket</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'bucket'</span><span class="p">,</span> <span class="n">bucket</span><span class="p">)</span>
<span class="k">return</span> <span class="s1">'/'</span><span class="o">.</span><span class="n">join</span><span class="p">([</span><span class="n">bucket</span><span class="p">,</span> <span class="n">prefix</span><span class="p">])</span>
<span class="c1"># I use -1 and 1 because it allows me to segment users </span>
<span class="c1"># into three groups for two different cases:</span>
<span class="c1">#</span>
<span class="c1"># **Case 1**: </span>
<span class="c1"># Users that have only foreign-installed add-ons, only self-installed add-ons, </span>
<span class="c1"># or a combination. Applying `bool_to_int()` on a `foreign_install` boolean, </span>
<span class="c1"># I can sum the resulting field grouped by `client_id` and `submission_date_s3` </span>
<span class="c1"># to identify these groups as 1, -1, and 0 respectively.</span>
<span class="c1">#</span>
<span class="c1"># **Case 2**: Users that have the default theme, a custom theme, </span>
<span class="c1"># or changed their theme (from default to custom or vice versa) on a given day: </span>
<span class="c1"># Applying `bool_to_int()` on a `has_custom_theme` boolean, I can sum the </span>
<span class="c1"># resulting field grouped by `client_id` and `submission_date_s3` </span>
<span class="c1"># to identify these groups as -1, 1, and 0 respectively.</span>
<span class="n">bool_to_int</span> <span class="o">=</span> <span class="n">fun</span><span class="o">.</span><span class="n">udf</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="mi">1</span> <span class="k">if</span> <span class="n">x</span> <span class="o">==</span> <span class="bp">True</span> <span class="k">else</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="n">st</span><span class="o">.</span><span class="n">IntegerType</span><span class="p">())</span>
</pre></div>
<p>Unless specified in the environment, the target date is yesterday, and the bucket used is passed as a string to <code>get_dest()</code></p>
<div class="codehilite"><pre><span></span><span class="n">target_date</span> <span class="o">=</span> <span class="n">get_env_date</span><span class="p">()</span>
<span class="n">dest</span> <span class="o">=</span> <span class="n">get_dest</span><span class="p">(</span><span class="n">bucket</span><span class="o">=</span><span class="s2">"telemetry-parquet"</span><span class="p">,</span> <span class="n">prefix</span><span class="o">=</span><span class="s2">"addons/agg/v1"</span><span class="p">)</span>
</pre></div>
<h1 id="load-addons-and-main_summary-for-yesterday-unless-specified-in-the-environment">Load <code>addons</code> and <code>main_summary</code> for yesterday (unless specified in the environment)</h1>
<div class="codehilite"><pre><span></span><span class="n">addons</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s2">"s3://telemetry-parquet/addons/v2"</span><span class="p">)</span>
<span class="n">addons</span> <span class="o">=</span> <span class="n">addons</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">addons</span><span class="o">.</span><span class="n">submission_date_s3</span> <span class="o">==</span> <span class="n">target_date</span><span class="p">)</span> \
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">addons</span><span class="o">.</span><span class="n">is_system</span> <span class="o">==</span> <span class="bp">False</span><span class="p">)</span> \
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">addons</span><span class="o">.</span><span class="n">user_disabled</span> <span class="o">==</span> <span class="bp">False</span><span class="p">)</span> \
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">addons</span><span class="o">.</span><span class="n">app_disabled</span> <span class="o">==</span> <span class="bp">False</span><span class="p">)</span> \
<span class="n">ms</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s1">'mergeSchema'</span><span class="p">,</span> <span class="s1">'true'</span><span class="p">)</span>\
<span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s1">'s3://telemetry-parquet/main_summary/v4'</span><span class="p">)</span>
<span class="n">ms</span> <span class="o">=</span> <span class="n">ms</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">ms</span><span class="o">.</span><span class="n">submission_date_s3</span> <span class="o">==</span> <span class="n">target_date</span><span class="p">)</span>
</pre></div>
<h1 id="aggregate">Aggregate</h1>
<p>These are the aggregations / joins that we <strong>don't</strong> want to do in re:dash.</p>
<ul>
<li>The resulting table is one row per distinct client, day, channel, and install type</li>
<li>foreign_install = true -&gt; side-loaded add-on, foreign_install = false -&gt; self-installed add-on</li>
<li>Each client has a static field for profile_creation_date and min_install_day (earliest add-on installation date)</li>
<li>Each client has a daily field <code>user_type</code></li>
<li>1 -&gt; only foreign installed add-ons</li>
<li>-1 -&gt; only self-installed</li>
<li>0 -&gt; foreign installed and self installed</li>
<li>Each client has a daily field <code>has_custom_theme</code>.</li>
<li>1 -&gt; has a custom theme</li>
<li>-1 -&gt; has default theme</li>
<li>0 -&gt; changed from default to custom on this date</li>
<li>To facilitate total population percentages, each submission date/channel has two static fields</li>
<li>n_custom_theme_clients (# distinct clients on that day/channel with a custom theme)</li>
<li>n_clients (# distinct total clients on that date/channel)</li>
</ul>
<div class="codehilite"><pre><span></span><span class="o">%%</span><span class="n">time</span>
<span class="n">default_theme_id</span> <span class="o">=</span> <span class="s2">"{972ce4c6-7e08-4474-a285-3208198ce6fd}"</span>
<span class="c1"># count of distinct client submission_date, channel and install type</span>
<span class="n">count_by_client_day</span> <span class="o">=</span> <span class="n">addons</span>\
<span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="s1">'client_id'</span><span class="p">,</span> <span class="s1">'submission_date_s3'</span><span class="p">,</span> <span class="s1">'normalized_channel'</span><span class="p">,</span>
<span class="s1">'foreign_install'</span><span class="p">,</span> <span class="s1">'addon_id'</span><span class="p">])</span>\
<span class="o">.</span><span class="n">distinct</span><span class="p">()</span>\
<span class="o">.</span><span class="n">groupBy</span><span class="p">([</span><span class="s1">'client_id'</span><span class="p">,</span> <span class="s1">'submission_date_s3'</span><span class="p">,</span><span class="s1">'foreign_install'</span><span class="p">,</span> <span class="s1">'normalized_channel'</span><span class="p">])</span>\
<span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="c1"># count of clients that have only foreign_installed, only self_installed and both</span>
<span class="c1"># per day/channel</span>
<span class="n">user_types</span> <span class="o">=</span> <span class="n">count_by_client_day</span>\
<span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="s1">'client_id'</span><span class="p">,</span> <span class="s1">'submission_date_s3'</span><span class="p">,</span> <span class="s1">'normalized_channel'</span><span class="p">,</span>
<span class="n">bool_to_int</span><span class="p">(</span><span class="s1">'foreign_install'</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s1">'user_type'</span><span class="p">)])</span>\
<span class="o">.</span><span class="n">groupBy</span><span class="p">([</span><span class="s1">'client_id'</span><span class="p">,</span> <span class="s1">'submission_date_s3'</span><span class="p">,</span> <span class="s1">'normalized_channel'</span><span class="p">])</span>\
<span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="s1">'user_type'</span><span class="p">)</span>\
<span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">'sum(user_type)'</span><span class="p">,</span> <span class="s1">'user_type'</span><span class="p">)</span>
<span class="n">count_by_client_day</span> <span class="o">=</span> <span class="n">count_by_client_day</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">user_types</span><span class="p">,</span>
<span class="n">on</span><span class="o">=</span><span class="p">[</span><span class="s1">'client_id'</span><span class="p">,</span> <span class="s1">'submission_date_s3'</span><span class="p">,</span> <span class="s1">'normalized_channel'</span><span class="p">])</span>
<span class="c1"># does a client have a custom theme?</span>
<span class="c1"># aggregate distinct values on a day/channel, since a client could have</span>
<span class="c1"># changed from default to custom</span>
<span class="n">ms_has_theme</span> <span class="o">=</span> <span class="n">ms</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>\
<span class="n">ms</span><span class="o">.</span><span class="n">client_id</span><span class="p">,</span> <span class="n">ms</span><span class="o">.</span><span class="n">normalized_channel</span><span class="p">,</span> <span class="n">bool_to_int</span><span class="p">(</span><span class="n">ms</span><span class="o">.</span><span class="n">active_theme</span><span class="o">.</span><span class="n">addon_id</span> <span class="o">!=</span> <span class="n">default_theme_id</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s1">'has_custom_theme'</span><span class="p">))</span>\
<span class="o">.</span><span class="n">distinct</span><span class="p">()</span>\
<span class="o">.</span><span class="n">groupBy</span><span class="p">([</span><span class="s1">'client_id'</span><span class="p">,</span> <span class="s1">'normalized_channel'</span><span class="p">])</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="s1">'has_custom_theme'</span><span class="p">)</span> \
<span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">'sum(has_custom_theme)'</span><span class="p">,</span> <span class="s1">'has_custom_theme'</span><span class="p">)</span>
<span class="c1"># client_id, profile_creation_date and the earliest</span>
<span class="c1"># install day for an addon</span>
<span class="n">ms_install_days</span> <span class="o">=</span> <span class="n">ms</span>\
<span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="s1">'client_id'</span><span class="p">,</span> <span class="s1">'profile_creation_date'</span><span class="p">,</span>
<span class="n">fun</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s1">'active_addons'</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s1">'addons'</span><span class="p">)])</span>\
<span class="o">.</span><span class="n">groupBy</span><span class="p">([</span><span class="s1">'client_id'</span><span class="p">,</span> <span class="s1">'profile_creation_date'</span><span class="p">])</span>\
<span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="n">fun</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="s2">"addons.install_day"</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s1">'min_install_day'</span><span class="p">))</span>
<span class="c1"># combine data</span>
<span class="n">current</span> <span class="o">=</span> <span class="n">count_by_client_day</span>\
<span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">ms_install_days</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="s1">'client_id'</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s1">'left'</span><span class="p">)</span>\
<span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">ms_has_theme</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="p">[</span><span class="s1">'client_id'</span><span class="p">,</span> <span class="s1">'normalized_channel'</span><span class="p">],</span> <span class="n">how</span><span class="o">=</span><span class="s1">'left'</span><span class="p">)</span>\
<span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="s1">'submission_date_s3'</span><span class="p">)</span>
<span class="c1"># add total number of distinct clients per day/channel</span>
<span class="c1"># and total number of distinct clients with a custom theme per day/channel</span>
<span class="c1"># Note that we could see the same client on multiple channels</span>
<span class="c1"># so downstream analysis should be done within channel</span>
<span class="n">n_clients</span> <span class="o">=</span> <span class="n">ms</span><span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="s1">'client_id'</span><span class="p">,</span> <span class="s1">'normalized_channel'</span><span class="p">])</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span>\
<span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">'normalized_channel'</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>\
<span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">'count'</span><span class="p">,</span> <span class="s1">'n_clients'</span><span class="p">)</span>
<span class="n">n_custom_themes</span> <span class="o">=</span> <span class="n">ms_has_theme</span>\
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">ms_has_theme</span><span class="o">.</span><span class="n">has_custom_theme</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">)</span>\
<span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="s1">'client_id'</span><span class="p">,</span> <span class="s1">'normalized_channel'</span><span class="p">])</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span>\
<span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">'normalized_channel'</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>\
<span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">'count'</span><span class="p">,</span> <span class="s1">'n_custom_theme_clients'</span><span class="p">)</span>
<span class="n">current</span> <span class="o">=</span> <span class="n">current</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">n_custom_themes</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="s1">'normalized_channel'</span><span class="p">)</span>\
<span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">n_clients</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="s1">'normalized_channel'</span><span class="p">)</span>
<span class="n">current</span> <span class="o">=</span> <span class="n">current</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s1">'n_clients'</span><span class="p">,</span> <span class="n">current</span><span class="o">.</span><span class="n">n_clients</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">st</span><span class="o">.</span><span class="n">IntegerType</span><span class="p">()))</span>\
<span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s1">'n_custom_theme_clients'</span><span class="p">,</span> <span class="n">current</span><span class="o">.</span><span class="n">n_custom_theme_clients</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">st</span><span class="o">.</span><span class="n">IntegerType</span><span class="p">()))</span>
<span class="c1"># repartition data</span>
<span class="n">current</span> <span class="o">=</span> <span class="n">optimize_repartition</span><span class="p">(</span><span class="n">current</span><span class="p">,</span> <span class="n">record_size</span><span class="o">=</span><span class="mi">39</span><span class="p">)</span>
<span class="c1"># write to s3</span>
<span class="n">current</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">"parquet"</span><span class="p">)</span>\
<span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="s1">'s3://'</span> <span class="o">+</span> <span class="n">dest</span> <span class="o">+</span> <span class="s1">'/submission_date_s3={}'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">target_date</span><span class="p">),</span> <span class="n">mode</span><span class="o">=</span><span class="s1">'overwrite'</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">current</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span>
</pre></div>
</div>
</div>
</div>
</div>
</div></body></html>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
// Wire up the post search box: typeahead suggestions, navigation to the
// selected post, and a plain "Enter" search against the feed.
//
// Everything is guarded on the element actually existing. No #searchbar
// appears in the visible markup of this page, and the unguarded
// `$("#searchbar")[0].setSelectionRange(...)` throws a TypeError in that case,
// which aborts the rest of this script.
var $searchbar = $('#searchbar');
if ($searchbar.length) {
  // Put the caret at the end of any pre-filled query text.
  $searchbar[0].setSelectionRange(1000, 1000);

  $searchbar.typeahead({
    hint: false,
    highlight: true,
    minLength: 1
  },
  {
    name: 'knowledge_posts',
    limit: 10,
    // Text placed in the input once a suggestion is chosen.
    display: function (item) {
      return item.title + " - " + item.author;
    },
    templates: {
      empty: Handlebars.compile(
        '<div class="tt-not-found">' +
        'Unable to find any posts that match the current query' +
        '</div>'
      ),
      suggestion: function (data) {
        // Title/author come from the server response; escape them before
        // concatenating into markup so they cannot inject HTML.
        var escapeHtml = function (value) {
          return $('<div>').text(String(value)).html();
        };
        return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' +
          escapeHtml(data.title) + '</strong> – ' + escapeHtml(data.author) + '</p>';
      }
    },
    // Suggestions are fetched asynchronously; the synchronous callback is unused.
    source: function (q, sync, async) {
      // Encode the query so characters such as '&', '#' or '+' stay part of
      // the search term instead of altering the URL.
      $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
        {
          success: function (data, status) { async(JSON.parse(data)); }
        });
    }
  });

  // Picking a suggestion opens that post.
  $searchbar.on('typeahead:select', function (obj, datum, name) {
    window.location = '/post/' + encodeURIComponent(datum.path);
  });

  // Pressing Enter runs a feed search with the current text.
  $searchbar.keypress(function (event) {
    var keycode = event.keyCode || event.which;
    if (keycode === 13) {
      window.location = '/feed?filters=' + encodeURIComponent($searchbar.val());
    }
  });

  // Widen the suggestion menu to the input's width plus the menu's own
  // current outer width.
  var padding = $('.tt-menu').outerWidth();
  $('.tt-menu').width($searchbar.width() + padding + "px");
}
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Runs once the DOM is ready: initialises tooltips for this post and wires the
// three view-switching buttons.
//
// Uses the $(handler) form. The previous `$("document").ready(...)` passed the
// *string* "document" as a selector; it only worked because .ready() ignores
// the selected set, and that form is deprecated in jQuery 3.
$(function () {
  var is_webeditor = false;
  var post_id = "24";
  // NOTE(review): literal string "None", presumably emitted by the
  // server-side template for a missing value — confirm what tooltips.js expects.
  var id = "None";
  var post_path = "addons/okr-daily-script.kp";
  var data_repo_github_root = "";

  tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);

  // All three targets share the same encoded post path.
  var encoded_path = encodeURI(post_path);

  $(".btn-rendered").on("click", function () {
    document.location.href = "/post/" + encoded_path;
  });
  $(".btn-raw").on("click", function () {
    document.location.href = "/post/" + encoded_path + "?render=raw";
  });
  $(".btn-webeditor").on("click", function () {
    document.location.href = "/edit/" + encoded_path;
  });
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration for this page: TeX input via the tex2jax
// preprocessor, rendered with the HTML-CSS output processor.
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math is delimited by $...$ or \(...\)
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Display math is delimited by $$...$$ or \[...\]
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// Per the MathJax tex2jax docs, this lets \$ produce a literal dollar sign
// instead of opening a math span.
processEscapes: true
},
// Restrict the HTML-CSS output to the "TeX" font set.
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in 2017; load a pinned MathJax 2.x build from cdnjs instead. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js" type="text/javascript">
</script>
<script>
// Page initialisation for the rendered post view: marks the active view
// button, linkifies headers, and attaches the comment and tag-subscription
// handlers.
//
// Registered with $(handler) rather than $(document).on('ready', ...): the
// "ready" pseudo-event was removed in jQuery 3.0 and, even in older versions,
// never fires for handlers bound after the document is already ready.
$(function () {
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'addons/okr-daily-script.kp';

  $("#post_comment_btn").on('click', function () {
    // Declared locally; these were previously implicit globals.
    var comment_author = 'knowledge_default';
    var post_author = 'Ben Miroglio';
    var post_title = 'addon_aggregates derived dataset script';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): the page reloads as soon as postComment returns; if that
    // call is asynchronous the request could be cut short — confirm against
    // comments.js.
    location.reload();
  });

  // Delete buttons carry ids shaped like "delete_comment...__<comment id>";
  // the part after "__" is passed to deleteComment.
  $("[id^=delete_comment]").on("click", function () {
    var comment_id = this.id.split("__")[1];
    if (comment_id) {
      commentsJx.deleteComment(post_path, comment_id);
      location.reload();
    }
  });

  // Delegated from <body> so buttons matching the selector that are added
  // after this handler is bound still trigger it.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener(this);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")];
$.each(all_headers, function (index, headers) {
  $.each(headers, function (i, header) {
    var inner_html = header.innerHTML;

    // Prefer the header's own id: it is the fragment that actually targets this
    // header. The slug below strips digits and underscores, so it does not match
    // ids that contain them.
    var anchor = header.id;
    if (!anchor) {
      // Fallback for headers without an id: keep letters, hyphens and spaces,
      // lowercase the result and join the words with "-".
      var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "");
      anchor = inner_html_no_special.toLowerCase().split(" ").join("-");
    }

    // Build the link with DOM calls so the anchor never has to be spliced into
    // an HTML string.
    var link = document.createElement("a");
    link.setAttribute("href", "#" + anchor);
    link.setAttribute("class", "link-reset");
    link.innerHTML = inner_html;

    header.innerHTML = "";
    header.appendChild(link);
  });
});
//turn all the tags into links, similar to what's done on the feed page
// `tags`, `tags_list` and `subscriptions_list` stay script-level because the
// tag-editing code further down reads them.
var tags = $("#renderedMarkdown .metadata .tags")[0];
var tags_list = ['add-ons', 'okr', 'derived dataset'];
var subscriptions_list = [];
$.each(tags_list, function (i, tag) {
  // All per-tag values are declared locally so they do not leak into the global scope.
  var ahref = document.createElement("a");
  var e_tag = encodeURIComponent(tag);
  // NOTE(review): a string pattern replaces only the first "/" in the tag -
  // confirm whether tags with several slashes need every one replaced.
  var f_tag = tag.replace("/", "__");
  var tag_name = "#" + tag;
  var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag;

  // Popover configuration read from data attributes.
  ahref.setAttribute("data-container", "body");
  ahref.setAttribute("data-toggle", "popover");
  ahref.setAttribute("data-placement", "bottom");
  ahref.setAttribute("data-html", "true");
  ahref.setAttribute("data-tag-name", f_tag);

  // The popover body holds a subscribe or unsubscribe button, depending on
  // whether the tag is in subscriptions_list.
  if (subscriptions_list.indexOf(tag) >= 0) {
    ahref.setAttribute("class", "label label-subscribed pop");
    ahref.setAttribute("data-content", "<div class='content'>" +
      " <button class='btn btn-small btn-primary btn-unsubscribe'" +
      " title='' " +
      " id='" + tag_subscription_button_id_name + "'> " +
      " <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
      " </button> " +
      " </div>");
  } else {
    ahref.setAttribute("class", "label label-unsubscribed pop");
    ahref.setAttribute("data-content", "<div class='content'>" +
      " <button class='btn btn-small btn-default btn-subscribe'" +
      " title='' " +
      " id='" + tag_subscription_button_id_name + "'> " +
      " <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
      " </button> " +
      " </div>");
  }

  ahref.setAttribute("href", "/tag_pages?tag=" + e_tag);
  ahref.setAttribute("style", "font-weight:normal");

  // The first tag is preceded by the "Tags: " label and starts with a space.
  if (i == 0) {
    var colon = document.createElement("text");
    colon.innerHTML = "<b>Tags</b>: ";
    tags.appendChild(colon);
  }

  // Set the visible tag name as text, not markup, so characters such as "<" in
  // a tag are displayed literally.
  ahref.textContent = (i == 0 ? " " : "") + tag_name;
  tags.appendChild(ahref);

  // Comma separator after every tag except the last.
  if (i != tags_list.length - 1) {
    var comma = document.createElement("text");
    comma.innerText = ", ";
    tags.appendChild(comma);
  }
});
// Remove the node that directly follows the tag container, then append the
// "edit tags" icon to the container's markup.
tags.nextSibling.remove();
tags.innerHTML = tags.innerHTML + "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>";
// Hover-driven popovers for every ".pop" element. The popover is triggered
// manually so it can stay open while the pointer moves from the trigger onto
// the popover itself.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on({
    mouseenter: function () {
      var trigger = this;
      $(trigger).popover("show");
      // Hide again once the pointer leaves the popover.
      $(".popover").on("mouseleave", function () {
        $(trigger).popover('hide');
      });
    },
    mouseleave: function () {
      var trigger = this;
      // Wait 300ms, then hide unless the pointer is over a popover.
      setTimeout(function () {
        if ($(".popover:hover").length === 0) {
          $(trigger).popover("hide");
        }
      }, 300);
    }
  });
// Clicking the "edit tags" icon swaps the tag list for an inline form: a text
// input pre-filled with the current tags plus a save icon.
// NOTE(review): none of the names assigned below (previousSibling, tags_string, form,
// input, tags_text, icon_class, tag, tag_name) are declared with `var`, so they
// become globals.
$('#tooltip-edit_tags').click(function(){
// Hide the edit icon while editing.
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// NOTE(review): previousSibling is assigned here but not read anywhere in this handler.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
// Save icon; its id is what the save click handler below is bound to.
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Replaces every existing child of the tag container with a single space
// before the form is appended.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate the comma-separated tags, POST them, then reload the page.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// Overwrites the script-level tags_list with the untrimmed pieces.
tags_list = tags_string.split(",")
// Allowed characters: letters, digits, "-", "_", ":" and "/".
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The empty check runs before trimming, so a whitespace-only entry is not
// caught here; it fails the regex test below instead.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
// NOTE(review): the message says "only alphanumeric", but the regex also accepts - _ : and /.
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so the reload below only runs after
// the call has returned.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=addons/okr-daily-script.kp',
async: false
});
location.reload()
}
})
// Removes whatever node now follows the tag container.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// NOTE(review): this registers a second click handler for #tooltip-edit_tags from
// inside the outer click handler, so it is added again on every outer click.
// Presumably it targets the icon created by iconsJx.createEditTagsIcon() - confirm;
// this looks like an older and a newer implementation merged together.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// NOTE(review): tags_text is a jQuery object, so this only sets a property on the
// wrapper; the label text was already set by .html() above.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter inside the input saves the tags; returning false stops further
// handling of the key press.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "addons/okr-daily-script.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves the tags.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "addons/okr-daily-script.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Submitting saves the tags and, by returning false, suppresses the normal form
// submission. The selector is "form", so this binds to every form on the page.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "addons/okr-daily-script.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,745 @@
<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <title> Knowledge </title>

  <!-- Scripts are included up front because JavaScript embedded in posts was colliding. -->
  <script src="/static/modules/jquery/jquery.min.js"></script>
  <script src="/static/modules/tether/js/tether.min.js"></script>
  <script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
  <script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
  <script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
  <script src="/static/modules/handlebars/js/handlebars.js"></script>
  <script src="/static/js/helpers.js"></script>
  <script src="/static/modules/select2/js/select2.min.js"></script>
  <script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
  <script src="/static/modules/marked.js/marked.js"></script>
  <!-- require.js is needed for plotly but collides with several other JS packages,
       so it must stay the last JS package imported. -->
  <script src="/static/modules/require.js/require.min.js"></script>
  <!--[if lt IE 9]>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
  <![endif]-->

  <link rel="stylesheet" href="/static/modules/bootstrap/css/bootstrap.min.css">
  <link rel="stylesheet" href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css">
  <link rel="stylesheet" href="/static/css/custom.css">
  <link rel="stylesheet" href="/static/modules/select2/css/select2.min.css">
  <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700">
  <link rel="shortcut icon" href="/static/images/favicon.png">
  <link rel="stylesheet" href="/static/css/codehilite-friendly.css">

  <style>
    /* Loading spinner, centred in the viewport. */
    .spinner {
      position: fixed;
      top: 50%;
      left: 50%;
      margin-left: -50px; /* half the width of the spinner gif */
      margin-top: -50px; /* half the height of the spinner gif */
      text-align: center;
      z-index: 1234;
      overflow: auto;
      width: 100px; /* width of the spinner gif */
      height: 102px; /* height of the spinner gif, +2px to fix an IE8 issue */
    }

    .table {
      font-size: 14px;
    }

    .modal-content {
      max-width: 1024px;
    }
  </style>
</head>
<body>
<!-- Site navigation bar: logo, section links, "Write a Post!" button and the search box. -->
<div class="navbar navbar-knowledge" role="navigation">
  <div class="container page-container">
    <!-- The logo is the home link, so its alt text names the link destination. -->
    <a href="/" aria-selected="false" class="logo-image navbar-text" style="margin-top:16px; margin-left: -7px">
      <img width="125" src="/static/images/logo-white.svg" alt="Knowledge home">
    </a>
    <a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style="margin-top:18px">
      Home
    </a>
    <a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style="margin-top:18px">
      Favorites
    </a>
    <a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style="margin-top:18px">
      About
    </a>
    <a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style="margin-top:18px">
      Stats
    </a>
    <a id="webposts_tab" href="/create"
       class="btn btn-primary navbar-text pull-right"
       style="border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699">
      Write a Post!
    </a>
    <div class="pull-right">
      <div class="form-group">
        <!-- The placeholder is not an accessible name, so the field is labelled explicitly. -->
        <input class="form-control" id="searchbar" placeholder="Search for Knowledge" aria-label="Search for Knowledge" style="text-align:right">
      </div>
    </div>
  </div>
</div>
<div class="container page-container">
<br>
<!-- Post toolbar: view-mode buttons on the left, page-view counter and "like" icon on the right. -->
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
<!-- View switcher; presumably wired up by page script through the btn-rendered / btn-raw classes (confirm). -->
<div class="btn-group" role="group">
<button type="button" class="btn btn-default btn-rendered">
View Post
</button>
<button type="button" class="btn btn-default btn-raw">
View Raw Markdown
</button>
</div>
</div>
<!-- Empty spacer column. -->
<div class="col-md-2">
</div>
<div class="col-md-4 text-right">
<!-- NOTE(review): the icon and the counter div below share id="pageview_stats". Ids should be unique and an id lookup only reaches the first match; confirm which element scripts and CSS expect before renaming either. -->
<i class="glyphicon glyphicon-eye-open" id="pageview_stats" style='color: #9CA299'></i>
<div id="pageview_stats" style="display: inline-block">
Viewed 2 times by 1 different users
</div>
<!-- "Like" icon; its popover text comes from data-content. NOTE(review): an i element is not keyboard-focusable, so consider a button if this is meant to be clickable. -->
<i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop" style="font-size:16pt" id="tooltip-like" data-placement="bottom"
data-trigger="#tooltip-like"
data-container="body"
data-toggle="popover"
data-content='<div>Like This Post</div>'></i>
</div>
</div>
<!-- Empty full-width row. -->
<div class="row col-md-12">
</div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class='metadata'>
<h1>addon_aggregates derived dataset script</h1>
<span class='authors'><a href='/feed?authors=Ben+Miroglio'>Ben Miroglio</a></span>
<span class='date_created'>February 08, 2017</span>
<span class='date_updated'>(Last Updated: May 03, 2017)</span>
<span class='tldr'><p>script to be run daily that constructs the addon_aggregates table in re:dash</p></span>
<span class='tags'></span>
</div>
<h1 id="add-ons-2017-okr-data-collection">Add-ons 2017 OKR Data Collection</h1>
<p>Some OKRs for 2017 can be feasibly collected via the <code>addons</code> and <code>main_summary</code> tables. These tables are huge and aren&rsquo;t appropriate to query directly via re:dash. This script condenses these tables so that the result contains the least data possible to track the following OKRs:</p>
<ul>
<li><strong>OKR 1: Increase number of users who self-install an Add-on by 5%</strong></li>
<li><strong>OKR 2: Increase average number of add-ons per profile by 3%</strong></li>
<li><strong>OKR 3: Increase number of new Firefox users who install an add-on in first 14 days by 25%</strong></li>
</ul>
<p>These OKRs, in addition to other add-on metrics, are tracked via the <a href="https://sql.telemetry.mozilla.org/dashboard/add-on-okrs_1#edit_dashboard_dialog">Add-on OKRs Dashboard</a> in re:dash.</p>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">__future__</span> <span class="kn">import</span> <span class="n">division</span>
<span class="kn">import</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">as</span> <span class="nn">fun</span>
<span class="kn">import</span> <span class="nn">pyspark.sql.types</span> <span class="kn">as</span> <span class="nn">st</span>
<span class="kn">import</span> <span class="nn">math</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">datetime</span> <span class="kn">as</span> <span class="nn">dt</span>
<span class="n">sc</span><span class="o">.</span><span class="n">setLogLevel</span><span class="p">(</span><span class="s2">&quot;INFO&quot;</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">optimize_repartition</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">record_size</span><span class="p">,</span> <span class="n">partition_size</span><span class="o">=</span><span class="mi">280</span><span class="p">):</span>
<span class="sd">&#39;&#39;&#39;</span>
<span class="sd"> Repartitions a spark DataFrame &lt;df&gt; so that each partition is </span>
<span class="sd"> ~ &lt;partition_size&gt;MB, defaulting to 280MB. record_size must be </span>
<span class="sd"> estimated beforehand--i.e. write the dataframe to s3, get the size </span>
<span class="sd"> in bytes and divide by df.count(). </span>
<span class="sd"> Returns repartitioned dataframe if a repartition is necessary.</span>
<span class="sd"> &#39;&#39;&#39;</span>
<span class="n">total_records</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="k">print</span> <span class="s2">&quot;-- Found {} records&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">total_records</span><span class="p">),</span>
<span class="c1">#convert megabytes to bytes</span>
<span class="n">partition_size</span> <span class="o">*=</span> <span class="mi">1000000</span>
<span class="n">records_per_partition</span> <span class="o">=</span> <span class="n">partition_size</span> <span class="o">/</span> <span class="n">record_size</span>
<span class="n">num_partitions</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">math</span><span class="o">.</span><span class="n">ceil</span><span class="p">(</span><span class="n">total_records</span> <span class="o">/</span> <span class="n">records_per_partition</span><span class="p">))</span>
<span class="k">if</span> <span class="n">num_partitions</span> <span class="o">!=</span> <span class="n">df</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">():</span>
<span class="k">print</span> <span class="s2">&quot;-- Repartitioning with {} partitions&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">num_partitions</span><span class="p">)</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="n">num_partitions</span><span class="p">)</span>
<span class="k">return</span> <span class="n">df</span>
<span class="k">def</span> <span class="nf">get_env_date</span><span class="p">():</span>
<span class="sd">&#39;&#39;&#39;</span>
<span class="sd"> Returns environment date if it exists.</span>
<span class="sd"> otherwise returns yesterday&#39;s date</span>
<span class="sd"> &#39;&#39;&#39;</span>
<span class="n">yesterday</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">utcnow</span><span class="p">()</span> <span class="o">-</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">),</span> <span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;date&#39;</span><span class="p">,</span> <span class="n">yesterday</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">get_dest</span><span class="p">(</span><span class="n">bucket</span><span class="p">,</span> <span class="n">prefix</span><span class="p">):</span>
<span class="sd">&#39;&#39;&#39;</span>
<span class="sd"> Uses environment bucket if it exists.</span>
<span class="sd"> Otherwise uses the bucket passed as a parameter</span>
<span class="sd"> &#39;&#39;&#39;</span>
<span class="n">bucket</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;bucket&#39;</span><span class="p">,</span> <span class="n">bucket</span><span class="p">)</span>
<span class="k">return</span> <span class="s1">&#39;/&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">([</span><span class="n">bucket</span><span class="p">,</span> <span class="n">prefix</span><span class="p">])</span>
<span class="c1"># I use -1 and 1 because it allows me to segment users </span>
<span class="c1"># into three groups for two different cases:</span>
<span class="c1">#</span>
<span class="c1"># **Case 1**: </span>
<span class="c1"># Users that have only foreign-installed add-ons, only self-installed add-ons, </span>
<span class="c1"># or a combination. Applying `bool_to_int()` on a `foreign_install` boolean, </span>
<span class="c1"># I can sum the resulting field grouped by `client_id` and `submission_date_s3` </span>
<span class="c1"># to identify these groups as 1, -1, and 0 respectively.</span>
<span class="c1">#</span>
<span class="c1"># **Case 2**: Users that have the default theme, a custom theme, </span>
<span class="c1"># or changed their theme (from default to custom or vice versa) on a given day: </span>
<span class="c1"># Applying `bool_to_int()` on a `has_custom_theme` boolean, I can sum the </span>
<span class="c1"># resulting field grouped by `client_id` and `submission_date_s3` </span>
<span class="c1"># to identify these groups as -1, 1, and 0 respectively.</span>
<span class="n">bool_to_int</span> <span class="o">=</span> <span class="n">fun</span><span class="o">.</span><span class="n">udf</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="mi">1</span> <span class="k">if</span> <span class="n">x</span> <span class="o">==</span> <span class="bp">True</span> <span class="k">else</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="n">st</span><span class="o">.</span><span class="n">IntegerType</span><span class="p">())</span>
</pre></div>
<p>Unless specified in the environment, the target date is yesterday, and the bucket used is passed as a string to <code>get_dest()</code></p>
<div class="codehilite"><pre><span></span><span class="n">target_date</span> <span class="o">=</span> <span class="n">get_env_date</span><span class="p">()</span>
<span class="n">dest</span> <span class="o">=</span> <span class="n">get_dest</span><span class="p">(</span><span class="n">bucket</span><span class="o">=</span><span class="s2">&quot;telemetry-parquet&quot;</span><span class="p">,</span> <span class="n">prefix</span><span class="o">=</span><span class="s2">&quot;addons/agg/v1&quot;</span><span class="p">)</span>
</pre></div>
<h1 id="load-addons-and-main_summary-for-yesterday-unless-specified-in-the-environment">Load <code>addons</code> and <code>main_summary</code> for yesterday (unless specified in the environment)</h1>
<div class="codehilite"><pre><span></span><span class="n">addons</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s2">&quot;s3://telemetry-parquet/addons/v2&quot;</span><span class="p">)</span>
<span class="n">addons</span> <span class="o">=</span> <span class="n">addons</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">addons</span><span class="o">.</span><span class="n">submission_date_s3</span> <span class="o">==</span> <span class="n">target_date</span><span class="p">)</span> \
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">addons</span><span class="o">.</span><span class="n">is_system</span> <span class="o">==</span> <span class="bp">False</span><span class="p">)</span> \
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">addons</span><span class="o">.</span><span class="n">user_disabled</span> <span class="o">==</span> <span class="bp">False</span><span class="p">)</span> \
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">addons</span><span class="o">.</span><span class="n">app_disabled</span> <span class="o">==</span> <span class="bp">False</span><span class="p">)</span> \
<span class="n">ms</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s1">&#39;mergeSchema&#39;</span><span class="p">,</span> <span class="s1">&#39;true&#39;</span><span class="p">)</span>\
<span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s1">&#39;s3://telemetry-parquet/main_summary/v4&#39;</span><span class="p">)</span>
<span class="n">ms</span> <span class="o">=</span> <span class="n">ms</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">ms</span><span class="o">.</span><span class="n">submission_date_s3</span> <span class="o">==</span> <span class="n">target_date</span><span class="p">)</span>
</pre></div>
<h1 id="aggregate">Aggregate</h1>
<p>These are the aggregations / joins that we <strong>don&rsquo;t</strong> want to do in re:dash.</p>
<ul>
<li>The resulting table is one row per distinct client, day, channel, and install type</li>
<li>foreign_install = true -&gt; side-loaded add-on, foreign_install = false -&gt; self-installed add-on</li>
<li>Each client has a static field for profile_creation_date and min_install_day (earliest add-on installation date)</li>
<li>Each client has a daily field <code>user_type</code></li>
<li>1 -&gt; only foreign installed add-ons</li>
<li>-1 -&gt; only self-installed</li>
<li>0 -&gt; foreign installed and self installed</li>
<li>Each client has a daily field <code>has_custom_theme</code>.</li>
<li>1 -&gt; has a custom theme</li>
<li>-1 -&gt; has default theme</li>
<li>0 -&gt; changed from default to custom on this date</li>
<li>To facilitate total population percentages, each submission date/channel has two static fields</li>
<li>n_custom_theme_clients (# distinct clients on that day/channel with a custom theme)</li>
<li>n_clients (# distinct total clients on that date/channel)</li>
</ul>
<div class="codehilite"><pre><span></span><span class="o">%%</span><span class="n">time</span>
<span class="n">default_theme_id</span> <span class="o">=</span> <span class="s2">&quot;{972ce4c6-7e08-4474-a285-3208198ce6fd}&quot;</span>
<span class="c1"># count of distinct client submission_date, channel and install type</span>
<span class="n">count_by_client_day</span> <span class="o">=</span> <span class="n">addons</span>\
<span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="s1">&#39;client_id&#39;</span><span class="p">,</span> <span class="s1">&#39;submission_date_s3&#39;</span><span class="p">,</span> <span class="s1">&#39;normalized_channel&#39;</span><span class="p">,</span>
<span class="s1">&#39;foreign_install&#39;</span><span class="p">,</span> <span class="s1">&#39;addon_id&#39;</span><span class="p">])</span>\
<span class="o">.</span><span class="n">distinct</span><span class="p">()</span>\
<span class="o">.</span><span class="n">groupBy</span><span class="p">([</span><span class="s1">&#39;client_id&#39;</span><span class="p">,</span> <span class="s1">&#39;submission_date_s3&#39;</span><span class="p">,</span><span class="s1">&#39;foreign_install&#39;</span><span class="p">,</span> <span class="s1">&#39;normalized_channel&#39;</span><span class="p">])</span>\
<span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="c1"># count of clients that have only foreign_installed, only self_installed and both</span>
<span class="c1"># per day/channel</span>
<span class="n">user_types</span> <span class="o">=</span> <span class="n">count_by_client_day</span>\
<span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="s1">&#39;client_id&#39;</span><span class="p">,</span> <span class="s1">&#39;submission_date_s3&#39;</span><span class="p">,</span> <span class="s1">&#39;normalized_channel&#39;</span><span class="p">,</span>
<span class="n">bool_to_int</span><span class="p">(</span><span class="s1">&#39;foreign_install&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s1">&#39;user_type&#39;</span><span class="p">)])</span>\
<span class="o">.</span><span class="n">groupBy</span><span class="p">([</span><span class="s1">&#39;client_id&#39;</span><span class="p">,</span> <span class="s1">&#39;submission_date_s3&#39;</span><span class="p">,</span> <span class="s1">&#39;normalized_channel&#39;</span><span class="p">])</span>\
<span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="s1">&#39;user_type&#39;</span><span class="p">)</span>\
<span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">&#39;sum(user_type)&#39;</span><span class="p">,</span> <span class="s1">&#39;user_type&#39;</span><span class="p">)</span>
<span class="n">count_by_client_day</span> <span class="o">=</span> <span class="n">count_by_client_day</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">user_types</span><span class="p">,</span>
<span class="n">on</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;client_id&#39;</span><span class="p">,</span> <span class="s1">&#39;submission_date_s3&#39;</span><span class="p">,</span> <span class="s1">&#39;normalized_channel&#39;</span><span class="p">])</span>
<span class="c1"># does a client have a custom theme?</span>
<span class="c1"># aggregate distinct values on a day/channel, since a client could have</span>
<span class="c1"># changed from default to custom</span>
<span class="n">ms_has_theme</span> <span class="o">=</span> <span class="n">ms</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>\
<span class="n">ms</span><span class="o">.</span><span class="n">client_id</span><span class="p">,</span> <span class="n">ms</span><span class="o">.</span><span class="n">normalized_channel</span><span class="p">,</span> <span class="n">bool_to_int</span><span class="p">(</span><span class="n">ms</span><span class="o">.</span><span class="n">active_theme</span><span class="o">.</span><span class="n">addon_id</span> <span class="o">!=</span> <span class="n">default_theme_id</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s1">&#39;has_custom_theme&#39;</span><span class="p">))</span>\
<span class="o">.</span><span class="n">distinct</span><span class="p">()</span>\
<span class="o">.</span><span class="n">groupBy</span><span class="p">([</span><span class="s1">&#39;client_id&#39;</span><span class="p">,</span> <span class="s1">&#39;normalized_channel&#39;</span><span class="p">])</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="s1">&#39;has_custom_theme&#39;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">&#39;sum(has_custom_theme)&#39;</span><span class="p">,</span> <span class="s1">&#39;has_custom_theme&#39;</span><span class="p">)</span>
<span class="c1"># client_id, profile_creation_date and the earliest</span>
<span class="c1"># install day for an addon</span>
<span class="n">ms_install_days</span> <span class="o">=</span> <span class="n">ms</span>\
<span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="s1">&#39;client_id&#39;</span><span class="p">,</span> <span class="s1">&#39;profile_creation_date&#39;</span><span class="p">,</span>
<span class="n">fun</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s1">&#39;active_addons&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s1">&#39;addons&#39;</span><span class="p">)])</span>\
<span class="o">.</span><span class="n">groupBy</span><span class="p">([</span><span class="s1">&#39;client_id&#39;</span><span class="p">,</span> <span class="s1">&#39;profile_creation_date&#39;</span><span class="p">])</span>\
<span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="n">fun</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="s2">&quot;addons.install_day&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s1">&#39;min_install_day&#39;</span><span class="p">))</span>
<span class="c1"># combine data</span>
<span class="n">current</span> <span class="o">=</span> <span class="n">count_by_client_day</span>\
<span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">ms_install_days</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="s1">&#39;client_id&#39;</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s1">&#39;left&#39;</span><span class="p">)</span>\
<span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">ms_has_theme</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;client_id&#39;</span><span class="p">,</span> <span class="s1">&#39;normalized_channel&#39;</span><span class="p">],</span> <span class="n">how</span><span class="o">=</span><span class="s1">&#39;left&#39;</span><span class="p">)</span>\
<span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="s1">&#39;submission_date_s3&#39;</span><span class="p">)</span>
<span class="c1"># add total number of distinct clients per day/channel</span>
<span class="c1"># and total number of distinct clients with a custom theme per day/channel</span>
<span class="c1"># Note that we could see the same client on multiple channels</span>
<span class="c1"># so downstream analysis should be done within channel</span>
<span class="n">n_clients</span> <span class="o">=</span> <span class="n">ms</span><span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="s1">&#39;client_id&#39;</span><span class="p">,</span> <span class="s1">&#39;normalized_channel&#39;</span><span class="p">])</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span>\
<span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">&#39;normalized_channel&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>\
<span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">&#39;count&#39;</span><span class="p">,</span> <span class="s1">&#39;n_clients&#39;</span><span class="p">)</span>
<span class="n">n_custom_themes</span> <span class="o">=</span> <span class="n">ms_has_theme</span>\
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">ms_has_theme</span><span class="o">.</span><span class="n">has_custom_theme</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">)</span>\
<span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="s1">&#39;client_id&#39;</span><span class="p">,</span> <span class="s1">&#39;normalized_channel&#39;</span><span class="p">])</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span>\
<span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">&#39;normalized_channel&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>\
<span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">&#39;count&#39;</span><span class="p">,</span> <span class="s1">&#39;n_custom_theme_clients&#39;</span><span class="p">)</span>
<span class="n">current</span> <span class="o">=</span> <span class="n">current</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">n_custom_themes</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="s1">&#39;normalized_channel&#39;</span><span class="p">)</span>\
<span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">n_clients</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="s1">&#39;normalized_channel&#39;</span><span class="p">)</span>
<span class="n">current</span> <span class="o">=</span> <span class="n">current</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s1">&#39;n_clients&#39;</span><span class="p">,</span> <span class="n">current</span><span class="o">.</span><span class="n">n_clients</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">st</span><span class="o">.</span><span class="n">IntegerType</span><span class="p">()))</span>\
<span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s1">&#39;n_custom_theme_clients&#39;</span><span class="p">,</span> <span class="n">current</span><span class="o">.</span><span class="n">n_custom_theme_clients</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">st</span><span class="o">.</span><span class="n">IntegerType</span><span class="p">()))</span>
<span class="c1"># repartition data</span>
<span class="n">current</span> <span class="o">=</span> <span class="n">optimize_repartition</span><span class="p">(</span><span class="n">current</span><span class="p">,</span> <span class="n">record_size</span><span class="o">=</span><span class="mi">39</span><span class="p">)</span>
<span class="c1"># write to s3</span>
<span class="n">current</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;parquet&quot;</span><span class="p">)</span>\
<span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="s1">&#39;s3://&#39;</span> <span class="o">+</span> <span class="n">dest</span> <span class="o">+</span> <span class="s1">&#39;/submission_date_s3={}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">target_date</span><span class="p">),</span> <span class="n">mode</span><span class="o">=</span><span class="s1">&#39;overwrite&#39;</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">current</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span>
</pre></div>
</div>
</div>
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<h2> 0 Comments </h2>
</br>
<div>
<div class="row">
<div class="col-md-12">
<textarea class="form-control" type="text" id="comment-text" style="height:87px;" placeholder="Leave a comment..."></textarea>
</div>
</div>
</div>
</br>
<div>
<button class="btn btn-primary" id="post_comment_btn">Post Comment</button>
</div>
</div>
</div>
<br>
<div class="row">
<div class="col-md-12">
</div>
</br>
</div>
</div>
</div>
</div>
<!-- Page footer: Knowledge Repo attribution and version link, followed by the index-age text written as literal markup -->
<div class="footer">
Served with <span class="glyphicon glyphicon-heart"></span> by <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a> <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br />
<i title="Last checked for updates: 0 seconds ago">Last indexed: 14 hours ago</i>
</div>
<script type='text/javascript'>
// Search box wiring: typeahead suggestions, navigation to the selected post,
// and Enter-to-search.

// Put the caret at the end of any pre-filled query. Guarded so that a page
// without a #searchbar element does not throw and abort the rest of this script.
if ($('#searchbar').length) {
  $('#searchbar')[0].setSelectionRange(1000, 1000);
}
$('#searchbar').typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: 'knowledge_posts',
  limit: 10,
  // Text placed in the input when a suggestion is chosen
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
      'Unable to find any posts that match the current query' +
      '</div>'
    ),
    suggestion: function(data) {
      // Escape the server-supplied fields so a post title or author name
      // cannot inject markup into the suggestion list.
      var esc = Handlebars.Utils.escapeExpression;
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + esc(data.title) + '</strong> – ' + esc(data.author) + '</p>';
    }
  },
  // Suggestions are fetched asynchronously; the query is URL-encoded so that
  // characters such as "&", "#" and "+" reach the server intact.
  source: function(q, sync, async) {
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
    {
      success: function(data,status){ async(JSON.parse(data)); }
    })
  }
});
// Selecting a suggestion opens the corresponding post
$('#searchbar').on('typeahead:select', function(obj, datum, name) {
  window.location = '/post/'+encodeURIComponent(datum.path);
});
// Pressing Enter runs a feed search with the current (URL-encoded) query
$('#searchbar').keypress(function(event){
  var keycode = (event.keyCode ? event.keyCode : event.which);
  if(keycode == 13){
    window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
  }
});
// Size the suggestion menu relative to the search box width
var padding = $('.tt-menu').outerWidth()
$('.tt-menu').width($('#searchbar').width() + padding + "px")
</script>
<script src="/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// When the DOM is ready: initialise tooltips and wire the rendered / raw /
// web-editor view buttons to their URLs for this post.
$(function () {
  var postPath = "addons/okr-daily-script.kp";
  var encodedPath = encodeURI(postPath);

  // Arguments, in order: is_webeditor, post_id, id, data_repo_github_root
  tooltipsJx.initializeTooltips(false, "24", "None", "");

  $(".btn-rendered").on("click", function () {
    document.location.href = "/post/" + encodedPath;
  });
  $(".btn-raw").on("click", function () {
    document.location.href = "/post/" + encodedPath + "?render=raw";
  });
  $(".btn-webeditor").on("click", function () {
    document.location.href = "/edit/" + encodedPath;
  });
});
</script>
<script src="/static/js/helpers.js"></script>
<script src="/static/js/tags.js" type="text/javascript"></script>
<script src="/static/js/icons.js" type="text/javascript"></script>
<script src="/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration: TeX input rendered as HTML-CSS, with $...$ and \(...\)
// as inline delimiters and $$...$$ and \[...\] as display delimiters.
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in 2017; load a pinned MathJax 2.x build from cdnjs instead -->
<script type="text/javascript"
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js">
</script>
<script>
// Runs once the DOM is ready. Uses .ready() rather than .on('ready', ...):
// the event-style binding is deprecated and was removed in jQuery 3, where
// the handler would silently never run.
$(document).ready(function(){
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");
  // Initialize headers
  helpersJx.linkifyHeaders();
  // Initialize comments
  var post_path = 'addons/okr-daily-script.kp';
  $("#post_comment_btn").on('click', function(){
    // Declared locally; these were previously leaked as implicit globals
    var comment_author = 'knowledge_default';
    var post_author = 'Ben Miroglio';
    var post_title = 'addon_aggregates derived dataset script';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): the reload follows immediately; this presumably relies on
    // postComment completing synchronously - confirm in comments.js
    location.reload();
  });
  // Every element whose id starts with "delete_comment" is a delete button;
  // the comment id is the part of the element id after the "__" separator.
  var all_comment_delete_buttons = $("[id^=delete_comment]");
  $.each(all_comment_delete_buttons, function(i,v){
    $(v).on("click", function(){
      var id = v.id;
      var comment_id = id.split("__")[1];
      if(comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });
  // Delegated on <body> so subscription buttons created later are covered too
  $(document.body).on('click',"button[id^=tag-subscription]",function () {
    tagsJx.addTagSubscriptionListener($(this)[0]);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function(index, value){
  $.each(value, function(i, v){
    var inner_html = v.innerHTML
    // Declared locally; this was previously leaked as an implicit global
    var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "")
    // Prefer the header's own id when it has one, so the link is guaranteed to
    // resolve; otherwise fall back to a slug built from the header text
    // (letters, hyphens and spaces only, lower-cased, spaces turned into "-").
    var inner_link = "#" + (v.id ? v.id : inner_html_no_special.toLowerCase().split(" ").join("-"))
    v.innerHTML = "<a href='" + inner_link + "' class='link-reset'>" + inner_html + "</a>"
  })
})
//turn all the tags into links, similar to what's done on the feed page
// `tags` is the first element matching the selector; it is undefined when the
// page has no such element, in which case the appendChild calls below throw.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['add-ons', 'okr', 'derived dataset']
// Empty here, so every tag takes the "unsubscribed" branch below
var subscriptions_list = []
// Build one <a> per tag and append it to the tags container.
// NOTE(review): the variables assigned without `var` inside this callback
// become implicit globals.
$.each(tags_list, function(i,tag){
ahref = document.createElement("a")
// e_tag: URL-encoded tag, used in the link's query string
e_tag = encodeURIComponent(tag)
// f_tag: tag with "/" replaced by "__" (a string pattern replaces only the
// first occurrence); used in the button id and the data-tag-name attribute
f_tag = tag.replace("/", "__")
tag_name = "#" + tag
tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
// Bootstrap popover configuration carried as data attributes
ahref.setAttribute("data-container", "body")
ahref.setAttribute("data-toggle", "popover")
ahref.setAttribute("data-placement", "bottom")
ahref.setAttribute("data-html", "true")
ahref.setAttribute("data-tag-name", f_tag)
// The popover body is a single Subscribe or Unsubscribe button, depending on
// whether the tag is in subscriptions_list
if (subscriptions_list.indexOf(tag) >= 0) {
ahref.setAttribute("class", "label label-subscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-primary btn-unsubscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
" </button> " +
" </div>")
} else {
ahref.setAttribute("class", "label label-unsubscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-default btn-subscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
" </button> " +
" </div>")
}
ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
ahref.setAttribute("style", "font-weight:normal")
// Before the first tag, emit the "Tags: " prefix
if (i == 0){
ahref.innerHTML = " "
colon = document.createElement("text")
colon.innerHTML = "<b>Tags</b>: "
tags.appendChild(colon)
}
ahref.innerHTML = ahref.innerHTML + tag_name
tags.appendChild(ahref)
// Comma separator after every tag except the last
if (i != tags_list.length - 1){
comma = document.createElement("text")
comma.innerText = ", "
tags.appendChild(comma)
}
})
// Remove the node that immediately follows the tags container
// (throws if the container has no next sibling)
tags.nextSibling.remove()
// Append the edit-tags icon inside the tags container
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
// Tag popovers are triggered manually so that they stay open while the pointer
// is over either the tag label or the popover itself.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on({
    // Show on hover, and hide again once the pointer leaves the popover
    mouseenter: function () {
      var label = this;
      $(label).popover("show");
      $(".popover").on("mouseleave", function () {
        $(label).popover('hide');
      });
    },
    // After a short grace period, hide unless the pointer moved onto a popover
    mouseleave: function () {
      var label = this;
      setTimeout(function () {
        if ($(".popover:hover").length === 0) {
          $(label).popover("hide");
        }
      }, 300);
    }
  });
// Click handler for the edit-tags icon: replaces the tag links with an inline
// form for editing the comma-separated tag list.
// NOTE(review): this handler contains two implementations of the same feature,
// one built with raw DOM calls (first half) and one built with jQuery and the
// iconsJx/tagsJx helpers (second half). The second half runs inside this
// handler, so each click registers its handlers again. Looks like the remains
// of a merge - confirm which half is intended.
$('#tooltip-edit_tags').click(function(){
// Hide the edit icon while editing
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// NOTE(review): the names assigned without `var` below become implicit globals
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
// Build the form: "Tags: " label, text input pre-filled with the current tags,
// and a save icon
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Setting textContent discards everything currently inside the tags container
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate each comma-separated tag, then POST the raw string
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
tags_list = tags_string.split(",")
// Allowed characters after trimming: letters, digits, "-", "_", ":" and "/".
// Spaces inside a tag are not allowed by this pattern.
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The length check is on the untrimmed piece, so only a truly empty piece
// (e.g. from a trailing comma) is reported here
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// async: false makes the request synchronous, so it finishes before the
// page is reloaded below
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=addons/okr-daily-script.kp',
async: false
});
location.reload()
}
})
// Remove the node that immediately follows the tags container
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// presumably the icon created above carries id "tooltip-edit_tags" - verify in icons.js
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object, so this sets a plain property on the wrapper
// rather than changing the element's content
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter in the input saves the tags; returning false suppresses the default action
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "addons/okr-daily-script.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves the tags
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "addons/okr-daily-script.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Bound to every <form> on the page: save the tags instead of doing a native submit
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "addons/okr-daily-script.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>
</body>
</html>

Просмотреть файл

@ -0,0 +1,14 @@
{
"title": "addon_aggregates derived dataset script",
"authors": [
"Ben Miroglio"
],
"tags": [
"add-ons",
"okr",
"derived dataset"
],
"publish_date": "2017-02-08",
"updated_at": "2017-02-15",
"tldr": "script to be run daily that constructs the addon_aggregates table in re:dash"
}

604
bug1381516.kp/index.html Normal file
Просмотреть файл

@ -0,0 +1,604 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
/* Loading spinner: fixed in the middle of the viewport; the negative margins
   offset it by half of its own size */
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
/* Font size for elements with the .table class */
.table {
font-size: 14px;
}
/* Cap the width of modal content */
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>Bug 1381516 - How Bad Is Bug 1380880?</h1>
<span class="authors"><a href="/feed?authors=chutten">chutten</a></span>
<span class="date_created">July 17, 2017</span>
<span class="date_updated">(Last Updated: July 17, 2017)</span>
<span class="tldr"><p>How broadly and how deeply do the effects of bug 1380880 extend?</p></span>
</div>
<h3 id="how-many-keyed-histograms-have-identical-keys-across-processes">How many keyed histograms have identical keys across processes?</h3>
<p>In <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1380880">bug 1380880</a> :billm found that keyed histograms recorded on different processes would be aggregated together if their keys matched.</p>
<p>How often does this happen in practice? How long has this been happening?</p>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">ujson</span> <span class="kn">as</span> <span class="nn">json</span>
<span class="kn">import</span> <span class="nn">matplotlib.pyplot</span> <span class="kn">as</span> <span class="nn">plt</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">plotly.plotly</span> <span class="kn">as</span> <span class="nn">py</span>
<span class="kn">from</span> <span class="nn">plotly.graph_objs</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings_properties</span><span class="p">,</span> <span class="n">get_one_ping_per_client</span>
<span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="o">%</span><span class="n">matplotlib</span> <span class="n">inline</span>
</pre></div>
<h3 id="which-keyed-histograms-share-keys-across-processes">Which keyed histograms share keys across processes?</h3>
<p>The whole child-process client aggregation thing was introduced by <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1218576">bug 1218576</a> back in September of 2016 for Firefox 52. So that's the earliest this could have started.</p>
<div class="codehilite"><pre><span></span><span class="n">pings</span> <span class="o">=</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">"telemetry"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="s1">'main'</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">appVersion</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">"52"</span><span class="p">))</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">appUpdateChannel</span><span class="o">=</span><span class="s2">"nightly"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sample</span><span class="o">=</span><span class="mf">0.1</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>fetching 13254.61440MB in 54449 files...
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">set_of_hgram_key_tuples</span><span class="p">(</span><span class="n">payload</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">set</span><span class="p">((</span><span class="n">kh_name</span><span class="p">,</span> <span class="n">key</span><span class="p">)</span> <span class="k">for</span> <span class="p">(</span><span class="n">kh_name</span><span class="p">,</span> <span class="n">v</span><span class="p">)</span> <span class="ow">in</span> <span class="n">payload</span><span class="p">[</span><span class="s1">'keyedHistograms'</span><span class="p">]</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">v</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="k">def</span> <span class="nf">get_problem_combos</span><span class="p">(</span><span class="n">aping</span><span class="p">):</span>
<span class="n">parent_tuples</span> <span class="o">=</span> <span class="n">set_of_hgram_key_tuples</span><span class="p">(</span><span class="n">aping</span><span class="p">[</span><span class="s1">'payload'</span><span class="p">])</span>
<span class="n">child_tuples</span> <span class="o">=</span> <span class="p">[</span><span class="n">set_of_hgram_key_tuples</span><span class="p">(</span><span class="n">pp</span><span class="p">)</span> <span class="k">for</span> <span class="p">(</span><span class="n">process_name</span><span class="p">,</span> <span class="n">pp</span><span class="p">)</span> <span class="ow">in</span> <span class="n">aping</span><span class="p">[</span><span class="s1">'payload'</span><span class="p">]</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'processes'</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">if</span> <span class="s1">'keyedHistograms'</span> <span class="ow">in</span> <span class="n">pp</span><span class="p">]</span>
<span class="n">problem_combos</span> <span class="o">=</span> <span class="nb">set</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="o">*</span><span class="p">(</span><span class="n">child_tuples</span> <span class="o">+</span> <span class="p">[</span><span class="n">parent_tuples</span><span class="p">]))</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">child_tuples</span><span class="p">)</span> <span class="k">else</span> <span class="nb">set</span><span class="p">()</span>
<span class="k">return</span> <span class="n">problem_combos</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">problem_combos</span> <span class="o">=</span> <span class="n">pings</span><span class="o">.</span><span class="n">flatMap</span><span class="p">(</span><span class="n">get_problem_combos</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">problem_combos</span><span class="o">.</span><span class="n">cache</span><span class="p">()</span>
</pre></div>
<div class="codehilite"><pre><span></span>PythonRDD[15] at RDD at PythonRDD.scala:48
</pre></div>
<p>Alright, let's get a list of the most commonly-seen histograms:</p>
<div class="codehilite"><pre><span></span><span class="nb">sorted</span><span class="p">(</span><span class="n">problem_combos</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">c</span><span class="p">:</span> <span class="p">(</span><span class="n">c</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">countByKey</span><span class="p">()</span><span class="o">.</span><span class="n">iteritems</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">reverse</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>[(u'IPC_MESSAGE_SIZE', 396905),
(u'BLOCKED_ON_PLUGIN_MODULE_INIT_MS', 72248),
(u'SYNC_WORKER_OPERATION', 47653),
(u'MESSAGE_MANAGER_MESSAGE_SIZE2', 35884),
(u'CONTENT_LARGE_PAINT_PHASE_WEIGHT', 13846),
(u'MEDIA_CODEC_USED', 1030),
(u'CANVAS_WEBGL_FAILURE_ID', 289),
(u'VIDEO_INFERRED_DECODE_SUSPEND_PERCENTAGE', 288),
(u'VIDEO_HIDDEN_PLAY_TIME_PERCENTAGE', 288),
(u'VIDEO_INTER_KEYFRAME_MAX_MS', 208),
(u'CANVAS_WEBGL_ACCL_FAILURE_ID', 183),
(u'JS_TELEMETRY_ADDON_EXCEPTIONS', 150),
(u'VIDEO_SUSPEND_RECOVERY_TIME_MS', 117),
(u'VIDEO_INTER_KEYFRAME_AVERAGE_MS', 111),
(u'PRINT_DIALOG_OPENED_COUNT', 4),
(u'PRINT_COUNT', 2)]
</pre></div>
<p>More verbosely, what are the 20 most-commonly-seen histogram,key pairs:</p>
<div class="codehilite"><pre><span></span><span class="nb">sorted</span><span class="p">(</span><span class="n">problem_combos</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">c</span><span class="p">:</span> <span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">countByKey</span><span class="p">()</span><span class="o">.</span><span class="n">iteritems</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">reverse</span><span class="o">=</span><span class="bp">True</span><span class="p">)[:</span><span class="mi">20</span><span class="p">]</span>
</pre></div>
<div class="codehilite"><pre><span></span>[((u'IPC_MESSAGE_SIZE', u'PLayerTransaction::Msg_Update'), 185499),
((u'IPC_MESSAGE_SIZE', u'PBrowser::Msg_AsyncMessage'), 133954),
((u'IPC_MESSAGE_SIZE', u'PLayerTransaction::Msg_UpdateNoSwap'), 64489),
((u'SYNC_WORKER_OPERATION', u'WorkerCheckAPIExposureOnMainThread'), 41428),
((u'MESSAGE_MANAGER_MESSAGE_SIZE2', u'SessionStore:update'), 24408),
((u'BLOCKED_ON_PLUGIN_MODULE_INIT_MS', u'Shockwave Flash23.0.0.185'), 21854),
((u'BLOCKED_ON_PLUGIN_MODULE_INIT_MS', u'Shockwave Flash23.0.0.205'), 18713),
((u'IPC_MESSAGE_SIZE', u'PContent::Msg_AsyncMessage'), 12066),
((u'BLOCKED_ON_PLUGIN_MODULE_INIT_MS', u'Shockwave Flash23.0.0.162'), 11700),
((u'MESSAGE_MANAGER_MESSAGE_SIZE2', u'sdk/remote/process/message'), 7776),
((u'SYNC_WORKER_OPERATION', u'XHR'), 5866),
((u'BLOCKED_ON_PLUGIN_MODULE_INIT_MS', u'Shockwave Flash23.0.0.207'), 4580),
((u'CONTENT_LARGE_PAINT_PHASE_WEIGHT', u'flb,r'), 1978),
((u'CONTENT_LARGE_PAINT_PHASE_WEIGHT', u'dl,flb'), 1978),
((u'CONTENT_LARGE_PAINT_PHASE_WEIGHT', u'dl'), 1978),
((u'CONTENT_LARGE_PAINT_PHASE_WEIGHT', u'flb'), 1978),
((u'CONTENT_LARGE_PAINT_PHASE_WEIGHT', u'r'), 1978),
((u'CONTENT_LARGE_PAINT_PHASE_WEIGHT', u'dl,r'), 1978),
((u'CONTENT_LARGE_PAINT_PHASE_WEIGHT', u'dl,flb,r'), 1978),
((u'BLOCKED_ON_PLUGIN_MODULE_INIT_MS', u'Shockwave Flash22.0.0.209'), 1642)]
</pre></div>
<h4 id="has-this-been-a-problem-this-whole-time">Has this been a problem this whole time?</h4>
<p>From earlier we note that <code>IPC_MESSAGE_SIZE/PLayerTransaction::Msg_Update</code> is the most common “present on multiple processes” combination.</p>
<p>To see if we've had this problem the whole time, how many pings have these messages in both parent and content, and whose histograms have identical sums?</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">relevant_ping</span><span class="p">(</span><span class="n">p</span><span class="p">):</span>
<span class="n">parent</span> <span class="o">=</span> <span class="n">p</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'payload'</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'keyedHistograms'</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'IPC_MESSAGE_SIZE'</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'PLayerTransaction::Msg_Update'</span><span class="p">)</span>
<span class="n">content</span> <span class="o">=</span> <span class="n">p</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'payload'</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'processes'</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'content'</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'keyedHistograms'</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'IPC_MESSAGE_SIZE'</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'PLayerTransaction::Msg_Update'</span><span class="p">)</span>
<span class="k">return</span> <span class="n">parent</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span> <span class="ow">and</span> <span class="n">content</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span> <span class="ow">and</span> <span class="n">parent</span><span class="p">[</span><span class="s1">'sum'</span><span class="p">]</span> <span class="o">==</span> <span class="n">content</span><span class="p">[</span><span class="s1">'sum'</span><span class="p">]</span>
<span class="n">relevant_pings</span> <span class="o">=</span> <span class="n">pings</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">relevant_ping</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">relevant_pings</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
</pre></div>
<div class="codehilite"><pre><span></span>149126
</pre></div>
<p>Yup, it appears as though we’ve had this problem since nightly/52.</p>
<h3 id="how-about-recently">How about recently?</h3>
<div class="codehilite"><pre><span></span><span class="n">modern_pings</span> <span class="o">=</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">"telemetry"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="s1">'main'</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">submissionDate</span><span class="o">=</span><span class="s2">"20170716"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sample</span><span class="o">=</span><span class="mf">0.01</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>fetching 7012.25715MB in 1970 files...
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">modern_combos</span> <span class="o">=</span> <span class="n">modern_pings</span><span class="o">.</span><span class="n">flatMap</span><span class="p">(</span><span class="n">get_problem_combos</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">modern_combos</span><span class="o">.</span><span class="n">cache</span><span class="p">()</span>
</pre></div>
<div class="codehilite"><pre><span></span>PythonRDD[51] at RDD at PythonRDD.scala:48
</pre></div>
<div class="codehilite"><pre><span></span><span class="nb">sorted</span><span class="p">(</span><span class="n">modern_combos</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">c</span><span class="p">:</span> <span class="p">(</span><span class="n">c</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">countByKey</span><span class="p">()</span><span class="o">.</span><span class="n">iteritems</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">reverse</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>[(u'NOTIFY_OBSERVERS_LATENCY_MS', 72463),
(u'DOM_SCRIPT_SRC_ENCODING', 33021),
(u'FX_SESSION_RESTORE_CONTENT_COLLECT_DATA_MS', 30709),
(u'CONTENT_LARGE_PAINT_PHASE_WEIGHT', 11613),
(u'IPC_WRITE_MAIN_THREAD_LATENCY_MS', 11186),
(u'MAIN_THREAD_RUNNABLE_MS', 7872),
(u'IPC_READ_MAIN_THREAD_LATENCY_MS', 6646),
(u'SYNC_WORKER_OPERATION', 5614),
(u'IPC_SYNC_RECEIVE_MS', 4227),
(u'IPC_MESSAGE_SIZE', 3514),
(u'BLOCKED_ON_PLUGIN_MODULE_INIT_MS', 2377),
(u'IPC_SYNC_MESSAGE_MANAGER_LATENCY_MS', 902),
(u'IPC_SYNC_MAIN_LATENCY_MS', 833),
(u'IDLE_RUNNABLE_BUDGET_OVERUSE_MS', 701),
(u'MESSAGE_MANAGER_MESSAGE_SIZE2', 615),
(u'FX_TAB_REMOTE_NAVIGATION_DELAY_MS', 433),
(u'CANVAS_WEBGL_FAILURE_ID', 138),
(u'CANVAS_WEBGL_ACCL_FAILURE_ID', 110),
(u'MEDIA_CODEC_USED', 20),
(u'IPC_SYNC_LATENCY_MS', 9),
(u'VIDEO_HIDDEN_PLAY_TIME_PERCENTAGE', 8),
(u'VIDEO_INFERRED_DECODE_SUSPEND_PERCENTAGE', 8),
(u'PRINT_DIALOG_OPENED_COUNT', 2)]
</pre></div>
<div class="codehilite"><pre><span></span><span class="nb">sorted</span><span class="p">(</span><span class="n">modern_combos</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">c</span><span class="p">:</span> <span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">countByKey</span><span class="p">()</span><span class="o">.</span><span class="n">iteritems</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">reverse</span><span class="o">=</span><span class="bp">True</span><span class="p">)[:</span><span class="mi">20</span><span class="p">]</span>
</pre></div>
<div class="codehilite"><pre><span></span>[((u'DOM_SCRIPT_SRC_ENCODING', u'UTF-8'), 16824),
((u'DOM_SCRIPT_SRC_ENCODING', u'windows-1252'), 16165),
((u'NOTIFY_OBSERVERS_LATENCY_MS', u'cycle-collector-begin'), 13727),
((u'NOTIFY_OBSERVERS_LATENCY_MS', u'garbage-collection-statistics'), 13150),
((u'NOTIFY_OBSERVERS_LATENCY_MS', u'cycle-collector-forget-skippable'),
12719),
((u'NOTIFY_OBSERVERS_LATENCY_MS', u'inner-window-destroyed'), 8619),
((u'NOTIFY_OBSERVERS_LATENCY_MS', u'tab-content-frameloader-created'), 7924),
((u'FX_SESSION_RESTORE_CONTENT_COLLECT_DATA_MS', u'historychange'), 7537),
((u'FX_SESSION_RESTORE_CONTENT_COLLECT_DATA_MS', u'pageStyle'), 7390),
((u'FX_SESSION_RESTORE_CONTENT_COLLECT_DATA_MS', u'scroll'), 7389),
((u'FX_SESSION_RESTORE_CONTENT_COLLECT_DATA_MS', u'storage'), 7380),
((u'IPC_WRITE_MAIN_THREAD_LATENCY_MS', u'PLayerTransaction::Msg_Update'),
6284),
((u'SYNC_WORKER_OPERATION', u'WorkerCheckAPIExposureOnMainThread'), 4926),
((u'NOTIFY_OBSERVERS_LATENCY_MS', u'content-document-global-created'), 4486),
((u'IPC_SYNC_RECEIVE_MS', u'???'), 4227),
((u'IPC_READ_MAIN_THREAD_LATENCY_MS', u'PCompositorBridge::Msg_DidComposite'),
3523),
((u'NOTIFY_OBSERVERS_LATENCY_MS', u'document-element-inserted'), 3498),
((u'IPC_WRITE_MAIN_THREAD_LATENCY_MS',
u'PCompositorBridge::Msg_PTextureConstructor'),
2231),
((u'IPC_MESSAGE_SIZE', u'PBrowser::Msg_AsyncMessage'), 2083),
((u'IPC_READ_MAIN_THREAD_LATENCY_MS', u'PBrowser::Msg_AsyncMessage'), 2031)]
</pre></div>
<p>The behaviour still exists, though this suggests that plugins and ipc messages are now less common. Instead we see more latency probes.</p>
</div>
</div>
</div>
</div>
</div>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
// Wire up the navbar search box: typeahead suggestions fetched from the
// server, navigation to the chosen post, and Enter-to-search.

// Put the caret at the end of any pre-filled query. The lookup is guarded so
// that a page without a search box does not throw here and abort the rest of
// this script.
if ($('#searchbar').length) {
  $('#searchbar')[0].setSelectionRange(1000, 1000);
}

$('#searchbar').typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: 'knowledge_posts',
  limit: 10,
  // Text written into the input for a given suggestion.
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
      'Unable to find any posts that match the current query' +
      '</div>'
    ),
    // The returned string is inserted as HTML, so the server-provided title
    // and author are escaped before being concatenated into the markup.
    suggestion: function (data) {
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' +
        Handlebars.escapeExpression(data.title) + '</strong> – ' +
        Handlebars.escapeExpression(data.author) + '</p>';
    }
  },
  // Asynchronous source: the query is URL-encoded so characters such as
  // "&", "#" and "+" reach the server as part of the search term.
  source: function (q, sync, async) {
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q), {
      // NOTE(review): assumes the endpoint returns its JSON as a plain string;
      // confirm against the server's response content type.
      success: function (data, status) { async(JSON.parse(data)); }
    });
  }
});

// Choosing a suggestion opens that post.
$('#searchbar').on('typeahead:select', function (obj, datum, name) {
  window.location = '/post/' + encodeURIComponent(datum.path);
});

// Pressing Enter runs a feed search with whatever is in the box.
$('#searchbar').keypress(function (event) {
  // jQuery normalizes event.which for key events; 13 is Enter.
  if (event.which === 13) {
    window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
  }
});

// Size the suggestion menu from the search box width plus the menu's own
// outer width.
var padding = $('.tt-menu').outerWidth();
$('.tt-menu').width($('#searchbar').width() + padding + "px");
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Once the DOM is ready, initialize the tooltips and make each view-mode
// button navigate to the corresponding URL for this post.
$(document).ready(function () {
  var webeditorMode = false;
  var postId = "31";
  var secondaryId = "None";
  var postPath = "bug1381516.kp";
  var githubRoot = "";

  tooltipsJx.initializeTooltips(webeditorMode, postId, secondaryId, githubRoot);

  // Button selector -> destination URL.
  var encodedPath = encodeURI(postPath);
  var destinations = {
    ".btn-rendered": "/post/" + encodedPath,
    ".btn-raw": "/post/" + encodedPath + "?render=raw",
    ".btn-webeditor": "/edit/" + encodedPath
  };

  $.each(destinations, function (selector, url) {
    $(selector).on("click", function () {
      document.location.href = url;
    });
  });
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax (v2 API) configuration for math embedded in the rendered post.
MathJax.Hub.Config({
// Load the tex2jax preprocessor, which finds TeX math by its delimiters.
extensions: ["tex2jax.js"],
// Read TeX input and render it with the HTML-CSS output processor.
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math is delimited by $...$ or \(...\).
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Display (block) math is delimited by $$...$$ or \[...\].
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// Per the tex2jax documentation, this lets \$ stand for a literal dollar
// sign, which matters here because single $ is a math delimiter.
processEscapes: true
},
// Only the "TeX" entry is listed as an available font for HTML-CSS output.
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in 2017; load MathJax from cdnjs instead.
     Pinned to a 2.x release because the configuration block uses the
     MathJax.Hub API, which MathJax 3 does not provide. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" type="text/javascript">
</script>
<script>
// Post-page setup that needs the full DOM: highlight the active view button,
// linkify headers, and wire up comment posting/deletion and tag-subscription
// buttons.
//
// Registered with .ready() rather than .on('ready', ...): binding the 'ready'
// event was deprecated in jQuery 1.8 and removed in 3.0, and such a handler
// never runs if it is attached after the DOM is already ready.
$(document).ready(function () {
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'bug1381516.kp';

  $("#post_comment_btn").on('click', function () {
    // Declared locally so they no longer leak as implicit globals.
    var comment_author = 'knowledge_default';
    var post_author = 'chutten';
    var post_title = 'Bug 1381516 - How Bad Is Bug 1380880?';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): the page reloads as soon as postComment returns; if that
    // call is asynchronous the request could be cut short. Confirm in
    // comments.js.
    location.reload();
  });

  // Delete buttons have ids starting with "delete_comment"; the comment id is
  // whatever follows the first "__" in the button id.
  $("[id^=delete_comment]").each(function (i, v) {
    $(v).on("click", function () {
      var comment_id = v.id.split("__")[1];
      if (comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });

  // Delegated on <body> so that subscription buttons created later (inside
  // popovers) are handled as well.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener(this);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function (index, value) {
  $.each(value, function (i, v) {
    var inner_html = v.innerHTML;

    // Prefer the id already present on the heading so the link points at an
    // anchor that actually exists. Otherwise derive a slug from the visible
    // text (not the markup, so nested tags and entities do not leak into it):
    // keep letters, hyphens and spaces, lowercase, and join words with "-".
    var slug = v.id;
    if (!slug) {
      slug = (v.textContent || "")
        .replace(/[^a-zA-Z\- ]/g, "")
        .toLowerCase()
        .split(" ")
        .join("-");
    }

    // Build the anchor through the DOM so the href never has to be quoted
    // inside an HTML string; the heading's original markup becomes its content.
    var link = document.createElement("a");
    link.setAttribute("href", "#" + slug);
    link.setAttribute("class", "link-reset");
    link.innerHTML = inner_html;

    v.innerHTML = "";
    v.appendChild(link);
  });
});
//turn all the tags into links, similar to what's done on the feed page
// `tags` is the tag container inside the post metadata; `tags_list` holds this
// post's tags and `subscriptions_list` the tags treated as subscribed. All
// three stay top-level because the tag-editing code further down uses them.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['investigation', 'keyed histograms', 'archaeology']
var subscriptions_list = []

// Skip rendering when the page has no tag container, so a missing element
// does not throw and stop the remainder of this script.
if (tags) {
  $.each(tags_list, function (i, tag) {
    // All per-tag values are locals; they were implicit globals before.
    var ahref = document.createElement("a");
    var e_tag = encodeURIComponent(tag);
    // NOTE(review): only the first "/" is replaced; confirm whether tags with
    // several slashes are expected by the code that reads this value.
    var f_tag = tag.replace("/", "__");
    var tag_name = "#" + tag;
    var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag;
    var subscribed = subscriptions_list.indexOf(tag) >= 0;

    ahref.setAttribute("data-container", "body");
    ahref.setAttribute("data-toggle", "popover");
    ahref.setAttribute("data-placement", "bottom");
    ahref.setAttribute("data-html", "true");
    ahref.setAttribute("data-tag-name", f_tag);

    // The popover body is a single button; only its classes, icon and label
    // differ between the subscribed and unsubscribed states.
    ahref.setAttribute("class", subscribed ? "label label-subscribed pop"
                                           : "label label-unsubscribed pop");
    ahref.setAttribute("data-content", "<div class='content'>" +
      " <button class='btn btn-small " +
      (subscribed ? "btn-primary btn-unsubscribe" : "btn-default btn-subscribe") + "'" +
      " title='' " +
      " id='" + tag_subscription_button_id_name + "'> " +
      " <i class='glyphicon " +
      (subscribed ? "glyphicon-remove-sign glyphicon-white" : "glyphicon-ok-sign glyphicon-filled") +
      "'></i>" + (subscribed ? "Unsubscribe" : "Subscribe") + " " +
      " </button> " +
      " </div>");

    ahref.setAttribute("href", "/tag_pages?tag=" + e_tag);
    ahref.setAttribute("style", "font-weight:normal");

    if (i == 0) {
      // The bold "Tags:" label goes in once, ahead of the first link.
      var colon = document.createElement("text");
      colon.innerHTML = "<b>Tags</b>: ";
      tags.appendChild(colon);
    }

    // The first link keeps its leading space. The tag is set as text rather
    // than HTML so characters such as "<" or "&" in a tag are shown literally.
    ahref.textContent = (i == 0 ? " " : "") + tag_name;
    tags.appendChild(ahref);

    if (i != tags_list.length - 1) {
      var comma = document.createElement("text");
      comma.innerText = ", ";
      tags.appendChild(comma);
    }
  });

  // Drop the node that follows the tag container, when there is one.
  if (tags.nextSibling) {
    tags.nextSibling.remove();
  }

  // Append the pencil icon that opens the tag editor.
  tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
}
// Hover-driven popovers for every ".pop" element. The popover is triggered
// manually so it can stay open while the pointer is over the popover itself.
$(".pop")
  .popover({ trigger: "manual" , html: true, animation:false, delay: 100})
  .on("mouseenter", function () {
    var trigger = this;
    $(trigger).popover("show");
    // Leaving the popover hides it.
    $(".popover").on("mouseleave", function () {
      $(trigger).popover('hide');
    });
  })
  .on("mouseleave", function () {
    var trigger = this;
    // After a short delay, hide unless the pointer is now over a popover.
    setTimeout(function () {
      if ($(".popover:hover").length) {
        return;
      }
      $(trigger).popover("hide");
    }, 300);
  });
// Click handler for the edit-tags pencil icon: swaps the rendered tag links
// for an inline form (text input + save icon) so the tags can be edited.
// NOTE(review): this handler contains two implementations, one nested inside
// the other (a hand-built form with its own validation and POST, followed by
// a jQuery/iconsJx/tagsJx version). It looks like two generations of the
// code were merged; confirm which one is meant to stay.
$('#tooltip-edit_tags').click(function(){
// Hide the pencil icon while editing.
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// Assigned without var (implicit global) and not read again in this block.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
// The names below are also assigned without var, so they are implicit globals.
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
// Pre-fill the input with the current tags, comma-separated.
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Replaces everything inside the tag container (links, separators and the
// pencil icon appended to it earlier) with a single space.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate the edited tags, POST them, then reload the page.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// Overwrites the outer tags_list with the raw comma-split pieces.
tags_list = tags_string.split(",")
// Allowed characters: letters, digits, "-", "_", ":" and "/". Spaces are not
// allowed, although this post's initial tag list contains
// 'keyed histograms'.
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The emptiness check runs on the untrimmed piece, so it only catches
// pieces that are exactly empty (for example after a trailing comma).
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
// The character check runs on the trimmed piece.
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
// The raw input string is what gets sent, not the trimmed pieces.
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so the reload below only runs after
// the call has returned.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=bug1381516.kp',
async: false
});
location.reload()
}
})
// Still inside the outer click handler: removes the node following the tag
// container on every click. NOTE(review): throws if nextSibling is null.
tags.nextSibling.remove()
// Allow user to edit tags
// A fresh edit icon from iconsJx is inserted right after the tag container.
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// NOTE(review): this registers another click handler from inside a click
// handler, so each run of the outer handler adds one more binding. Which
// element matches "#tooltip-edit_tags" here depends on the id that
// iconsJx.createEditTagsIcon() assigns - confirm in icons.js.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object, so this sets a plain property on the wrapper
// rather than on the element; the label text was already set via .html().
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter in the input saves the tags; returning false stops the default
// handling of the key press.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "bug1381516.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves the tags.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "bug1381516.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Bound to every <form> on the page, not only the one created above;
// returning false prevents the normal form submission.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "bug1381516.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,762 @@
<!DOCTYPE html>
<!-- lang tells assistive technology and browsers that the page is in English. -->
<html lang="en">
<head>
<meta charset="utf-8">
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="/static/modules/jquery/jquery.min.js"></script>
<script src="/static/modules/tether/js/tether.min.js"></script>
<script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="/static/modules/handlebars/js/handlebars.js"></script>
<script src="/static/js/helpers.js"></script>
<script src="/static/modules/select2/js/select2.min.js"></script>
<script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link rel="stylesheet" href="/static/modules/bootstrap/css/bootstrap.min.css">
<link rel="stylesheet" href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css">
<link rel="stylesheet" type='text/css' href="/static/css/custom.css">
<link rel="stylesheet" type='text/css' href="/static/modules/select2/css/select2.min.css">
<link href='https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700' rel='stylesheet' type='text/css'>
<link rel="shortcut icon" href="/static/images/favicon.png">
<link rel="stylesheet" href="/static/css/codehilite-friendly.css">
<style>
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="navbar navbar-knowledge" role="navigation">
<div class="container page-container">
<!-- Logo link to the home page. The image is the link's only content, so its
     alt text names the destination; img is a void element and takes no
     closing tag. -->
<a href="/" aria-selected="false" class="logo-image navbar-text" style='margin-top:16px; margin-left: -7px'>
<img width="125" src='/static/images/logo-white.svg' alt="Knowledge home">
</a>
<a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Home
</a>
<a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Favorites
</a>
<a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style='margin-top:18px'>
About
</a>
<a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Stats
</a>
<a id="webposts_tab" href="/create"
class="btn btn-primary navbar-text pull-right"
style='border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699'>
Write a Post!
</a>
<div class="pull-right">
<div class="form-group">
<!-- The placeholder is not an accessible name; aria-label supplies one, since
     the navbar has no visible label for this field. -->
<input class="form-control" id="searchbar" placeholder="Search for Knowledge" aria-label="Search for Knowledge" style="text-align:right">
</div>
</div>
</div>
</div>
<div class="container page-container">
<br>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
<div class="btn-group" role="group">
<button type="button" class="btn btn-default btn-rendered">
View Post
</button>
<button type="button" class="btn btn-default btn-raw">
View Raw Markdown
</button>
</div>
</div>
<div class="col-md-2">
</div>
<div class="col-md-4 text-right">
<i class="glyphicon glyphicon-eye-open" id="pageview_stats" style='color: #9CA299'></i>
<div id="pageview_stats" style="display: inline-block">
Viewed 3 times by 1 different users
</div>
<i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop" style="font-size:16pt" id="tooltip-like" data-placement="bottom"
data-trigger="#tooltip-like"
data-container="body"
data-toggle="popover"
data-content='<div>Like This Post</div>'></i>
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class='metadata'>
<h1>Bug 1381516 - How Bad Is Bug 1380880?</h1>
<span class='authors'><a href='/feed?authors=chutten'>chutten</a></span>
<span class='date_created'>July 17, 2017</span>
<span class='date_updated'>(Last Updated: July 17, 2017)</span>
<span class='tldr'><p>How broadly and how deeply do the effects of bug 1380880 extend?</p></span>
<span class='tags'></span>
</div>
<h3 id="how-many-keyed-histograms-have-identical-keys-across-processes">How many keyed histograms have identical keys across processes?</h3>
<p>In <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1380880">bug 1380880</a> :billm found that keyed histograms recorded on different processes would be aggregated together if their keys matched.</p>
<p>How often does this happen in practice? How long has this been happening?</p>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">ujson</span> <span class="kn">as</span> <span class="nn">json</span>
<span class="kn">import</span> <span class="nn">matplotlib.pyplot</span> <span class="kn">as</span> <span class="nn">plt</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">plotly.plotly</span> <span class="kn">as</span> <span class="nn">py</span>
<span class="kn">from</span> <span class="nn">plotly.graph_objs</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings_properties</span><span class="p">,</span> <span class="n">get_one_ping_per_client</span>
<span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="o">%</span><span class="n">matplotlib</span> <span class="n">inline</span>
</pre></div>
<h3 id="which-keyed-histograms-share-keys-across-processes">Which keyed histograms share keys across processes?</h3>
<p>The whole child-process client aggregation thing was introduced by <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1218576">bug 1218576</a> back in September of 2016 for Firefox 52. So that&rsquo;s the earliest this could have started.</p>
<div class="codehilite"><pre><span></span><span class="n">pings</span> <span class="o">=</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">&quot;telemetry&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="s1">&#39;main&#39;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">appVersion</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">&quot;52&quot;</span><span class="p">))</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">appUpdateChannel</span><span class="o">=</span><span class="s2">&quot;nightly&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sample</span><span class="o">=</span><span class="mf">0.1</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>fetching 13254.61440MB in 54449 files...
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">set_of_hgram_key_tuples</span><span class="p">(</span><span class="n">payload</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">set</span><span class="p">((</span><span class="n">kh_name</span><span class="p">,</span> <span class="n">key</span><span class="p">)</span> <span class="k">for</span> <span class="p">(</span><span class="n">kh_name</span><span class="p">,</span> <span class="n">v</span><span class="p">)</span> <span class="ow">in</span> <span class="n">payload</span><span class="p">[</span><span class="s1">&#39;keyedHistograms&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">v</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="k">def</span> <span class="nf">get_problem_combos</span><span class="p">(</span><span class="n">aping</span><span class="p">):</span>
<span class="n">parent_tuples</span> <span class="o">=</span> <span class="n">set_of_hgram_key_tuples</span><span class="p">(</span><span class="n">aping</span><span class="p">[</span><span class="s1">&#39;payload&#39;</span><span class="p">])</span>
<span class="n">child_tuples</span> <span class="o">=</span> <span class="p">[</span><span class="n">set_of_hgram_key_tuples</span><span class="p">(</span><span class="n">pp</span><span class="p">)</span> <span class="k">for</span> <span class="p">(</span><span class="n">process_name</span><span class="p">,</span> <span class="n">pp</span><span class="p">)</span> <span class="ow">in</span> <span class="n">aping</span><span class="p">[</span><span class="s1">&#39;payload&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;processes&#39;</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">if</span> <span class="s1">&#39;keyedHistograms&#39;</span> <span class="ow">in</span> <span class="n">pp</span><span class="p">]</span>
<span class="n">problem_combos</span> <span class="o">=</span> <span class="nb">set</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="o">*</span><span class="p">(</span><span class="n">child_tuples</span> <span class="o">+</span> <span class="p">[</span><span class="n">parent_tuples</span><span class="p">]))</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">child_tuples</span><span class="p">)</span> <span class="k">else</span> <span class="nb">set</span><span class="p">()</span>
<span class="k">return</span> <span class="n">problem_combos</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">problem_combos</span> <span class="o">=</span> <span class="n">pings</span><span class="o">.</span><span class="n">flatMap</span><span class="p">(</span><span class="n">get_problem_combos</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">problem_combos</span><span class="o">.</span><span class="n">cache</span><span class="p">()</span>
</pre></div>
<div class="codehilite"><pre><span></span>PythonRDD[15] at RDD at PythonRDD.scala:48
</pre></div>
<p>Alright, let&rsquo;s get a list of the most commonly-seen histograms:</p>
<div class="codehilite"><pre><span></span><span class="nb">sorted</span><span class="p">(</span><span class="n">problem_combos</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">c</span><span class="p">:</span> <span class="p">(</span><span class="n">c</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">countByKey</span><span class="p">()</span><span class="o">.</span><span class="n">iteritems</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">reverse</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>[(u&#39;IPC_MESSAGE_SIZE&#39;, 396905),
(u&#39;BLOCKED_ON_PLUGIN_MODULE_INIT_MS&#39;, 72248),
(u&#39;SYNC_WORKER_OPERATION&#39;, 47653),
(u&#39;MESSAGE_MANAGER_MESSAGE_SIZE2&#39;, 35884),
(u&#39;CONTENT_LARGE_PAINT_PHASE_WEIGHT&#39;, 13846),
(u&#39;MEDIA_CODEC_USED&#39;, 1030),
(u&#39;CANVAS_WEBGL_FAILURE_ID&#39;, 289),
(u&#39;VIDEO_INFERRED_DECODE_SUSPEND_PERCENTAGE&#39;, 288),
(u&#39;VIDEO_HIDDEN_PLAY_TIME_PERCENTAGE&#39;, 288),
(u&#39;VIDEO_INTER_KEYFRAME_MAX_MS&#39;, 208),
(u&#39;CANVAS_WEBGL_ACCL_FAILURE_ID&#39;, 183),
(u&#39;JS_TELEMETRY_ADDON_EXCEPTIONS&#39;, 150),
(u&#39;VIDEO_SUSPEND_RECOVERY_TIME_MS&#39;, 117),
(u&#39;VIDEO_INTER_KEYFRAME_AVERAGE_MS&#39;, 111),
(u&#39;PRINT_DIALOG_OPENED_COUNT&#39;, 4),
(u&#39;PRINT_COUNT&#39;, 2)]
</pre></div>
<p>More verbosely, what are the 20 most-commonly-seen histogram,key pairs:</p>
<div class="codehilite"><pre><span></span><span class="nb">sorted</span><span class="p">(</span><span class="n">problem_combos</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">c</span><span class="p">:</span> <span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">countByKey</span><span class="p">()</span><span class="o">.</span><span class="n">iteritems</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">reverse</span><span class="o">=</span><span class="bp">True</span><span class="p">)[:</span><span class="mi">20</span><span class="p">]</span>
</pre></div>
<div class="codehilite"><pre><span></span>[((u&#39;IPC_MESSAGE_SIZE&#39;, u&#39;PLayerTransaction::Msg_Update&#39;), 185499),
((u&#39;IPC_MESSAGE_SIZE&#39;, u&#39;PBrowser::Msg_AsyncMessage&#39;), 133954),
((u&#39;IPC_MESSAGE_SIZE&#39;, u&#39;PLayerTransaction::Msg_UpdateNoSwap&#39;), 64489),
((u&#39;SYNC_WORKER_OPERATION&#39;, u&#39;WorkerCheckAPIExposureOnMainThread&#39;), 41428),
((u&#39;MESSAGE_MANAGER_MESSAGE_SIZE2&#39;, u&#39;SessionStore:update&#39;), 24408),
((u&#39;BLOCKED_ON_PLUGIN_MODULE_INIT_MS&#39;, u&#39;Shockwave Flash23.0.0.185&#39;), 21854),
((u&#39;BLOCKED_ON_PLUGIN_MODULE_INIT_MS&#39;, u&#39;Shockwave Flash23.0.0.205&#39;), 18713),
((u&#39;IPC_MESSAGE_SIZE&#39;, u&#39;PContent::Msg_AsyncMessage&#39;), 12066),
((u&#39;BLOCKED_ON_PLUGIN_MODULE_INIT_MS&#39;, u&#39;Shockwave Flash23.0.0.162&#39;), 11700),
((u&#39;MESSAGE_MANAGER_MESSAGE_SIZE2&#39;, u&#39;sdk/remote/process/message&#39;), 7776),
((u&#39;SYNC_WORKER_OPERATION&#39;, u&#39;XHR&#39;), 5866),
((u&#39;BLOCKED_ON_PLUGIN_MODULE_INIT_MS&#39;, u&#39;Shockwave Flash23.0.0.207&#39;), 4580),
((u&#39;CONTENT_LARGE_PAINT_PHASE_WEIGHT&#39;, u&#39;flb,r&#39;), 1978),
((u&#39;CONTENT_LARGE_PAINT_PHASE_WEIGHT&#39;, u&#39;dl,flb&#39;), 1978),
((u&#39;CONTENT_LARGE_PAINT_PHASE_WEIGHT&#39;, u&#39;dl&#39;), 1978),
((u&#39;CONTENT_LARGE_PAINT_PHASE_WEIGHT&#39;, u&#39;flb&#39;), 1978),
((u&#39;CONTENT_LARGE_PAINT_PHASE_WEIGHT&#39;, u&#39;r&#39;), 1978),
((u&#39;CONTENT_LARGE_PAINT_PHASE_WEIGHT&#39;, u&#39;dl,r&#39;), 1978),
((u&#39;CONTENT_LARGE_PAINT_PHASE_WEIGHT&#39;, u&#39;dl,flb,r&#39;), 1978),
((u&#39;BLOCKED_ON_PLUGIN_MODULE_INIT_MS&#39;, u&#39;Shockwave Flash22.0.0.209&#39;), 1642)]
</pre></div>
<h4 id="has-this-been-a-problem-this-whole-time">Has this been a problem this whole time?</h4>
<p>From earlier we note that <code>IPC_MESSAGE_SIZE/PLayerTransaction::Msg_Update</code> is the most common &ldquo;present on multiple processes&rdquo; combination.</p>
<p>To see if we&rsquo;ve had this problem the whole time, how many pings have these messages in both parent and content, and whose histograms have identical sums?</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">relevant_ping</span><span class="p">(</span><span class="n">p</span><span class="p">):</span>
<span class="n">parent</span> <span class="o">=</span> <span class="n">p</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;payload&#39;</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;keyedHistograms&#39;</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;IPC_MESSAGE_SIZE&#39;</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;PLayerTransaction::Msg_Update&#39;</span><span class="p">)</span>
<span class="n">content</span> <span class="o">=</span> <span class="n">p</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;payload&#39;</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;processes&#39;</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;content&#39;</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;keyedHistograms&#39;</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;IPC_MESSAGE_SIZE&#39;</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;PLayerTransaction::Msg_Update&#39;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">parent</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span> <span class="ow">and</span> <span class="n">content</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span> <span class="ow">and</span> <span class="n">parent</span><span class="p">[</span><span class="s1">&#39;sum&#39;</span><span class="p">]</span> <span class="o">==</span> <span class="n">content</span><span class="p">[</span><span class="s1">&#39;sum&#39;</span><span class="p">]</span>
<span class="n">relevant_pings</span> <span class="o">=</span> <span class="n">pings</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">relevant_ping</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">relevant_pings</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
</pre></div>
<div class="codehilite"><pre><span></span>149126
</pre></div>
<p>Yup, it appears as though we&rsquo;ve had this problem since nightly/52.</p>
<h3 id="how-about-recently">How about recently?</h3>
<div class="codehilite"><pre><span></span><span class="n">modern_pings</span> <span class="o">=</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">&quot;telemetry&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="s1">&#39;main&#39;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">submissionDate</span><span class="o">=</span><span class="s2">&quot;20170716&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sample</span><span class="o">=</span><span class="mf">0.01</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>fetching 7012.25715MB in 1970 files...
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">modern_combos</span> <span class="o">=</span> <span class="n">modern_pings</span><span class="o">.</span><span class="n">flatMap</span><span class="p">(</span><span class="n">get_problem_combos</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">modern_combos</span><span class="o">.</span><span class="n">cache</span><span class="p">()</span>
</pre></div>
<div class="codehilite"><pre><span></span>PythonRDD[51] at RDD at PythonRDD.scala:48
</pre></div>
<div class="codehilite"><pre><span></span><span class="nb">sorted</span><span class="p">(</span><span class="n">modern_combos</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">c</span><span class="p">:</span> <span class="p">(</span><span class="n">c</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">countByKey</span><span class="p">()</span><span class="o">.</span><span class="n">iteritems</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">reverse</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>[(u&#39;NOTIFY_OBSERVERS_LATENCY_MS&#39;, 72463),
(u&#39;DOM_SCRIPT_SRC_ENCODING&#39;, 33021),
(u&#39;FX_SESSION_RESTORE_CONTENT_COLLECT_DATA_MS&#39;, 30709),
(u&#39;CONTENT_LARGE_PAINT_PHASE_WEIGHT&#39;, 11613),
(u&#39;IPC_WRITE_MAIN_THREAD_LATENCY_MS&#39;, 11186),
(u&#39;MAIN_THREAD_RUNNABLE_MS&#39;, 7872),
(u&#39;IPC_READ_MAIN_THREAD_LATENCY_MS&#39;, 6646),
(u&#39;SYNC_WORKER_OPERATION&#39;, 5614),
(u&#39;IPC_SYNC_RECEIVE_MS&#39;, 4227),
(u&#39;IPC_MESSAGE_SIZE&#39;, 3514),
(u&#39;BLOCKED_ON_PLUGIN_MODULE_INIT_MS&#39;, 2377),
(u&#39;IPC_SYNC_MESSAGE_MANAGER_LATENCY_MS&#39;, 902),
(u&#39;IPC_SYNC_MAIN_LATENCY_MS&#39;, 833),
(u&#39;IDLE_RUNNABLE_BUDGET_OVERUSE_MS&#39;, 701),
(u&#39;MESSAGE_MANAGER_MESSAGE_SIZE2&#39;, 615),
(u&#39;FX_TAB_REMOTE_NAVIGATION_DELAY_MS&#39;, 433),
(u&#39;CANVAS_WEBGL_FAILURE_ID&#39;, 138),
(u&#39;CANVAS_WEBGL_ACCL_FAILURE_ID&#39;, 110),
(u&#39;MEDIA_CODEC_USED&#39;, 20),
(u&#39;IPC_SYNC_LATENCY_MS&#39;, 9),
(u&#39;VIDEO_HIDDEN_PLAY_TIME_PERCENTAGE&#39;, 8),
(u&#39;VIDEO_INFERRED_DECODE_SUSPEND_PERCENTAGE&#39;, 8),
(u&#39;PRINT_DIALOG_OPENED_COUNT&#39;, 2)]
</pre></div>
<div class="codehilite"><pre><span></span><span class="nb">sorted</span><span class="p">(</span><span class="n">modern_combos</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">c</span><span class="p">:</span> <span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">countByKey</span><span class="p">()</span><span class="o">.</span><span class="n">iteritems</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">reverse</span><span class="o">=</span><span class="bp">True</span><span class="p">)[:</span><span class="mi">20</span><span class="p">]</span>
</pre></div>
<div class="codehilite"><pre><span></span>[((u&#39;DOM_SCRIPT_SRC_ENCODING&#39;, u&#39;UTF-8&#39;), 16824),
((u&#39;DOM_SCRIPT_SRC_ENCODING&#39;, u&#39;windows-1252&#39;), 16165),
((u&#39;NOTIFY_OBSERVERS_LATENCY_MS&#39;, u&#39;cycle-collector-begin&#39;), 13727),
((u&#39;NOTIFY_OBSERVERS_LATENCY_MS&#39;, u&#39;garbage-collection-statistics&#39;), 13150),
((u&#39;NOTIFY_OBSERVERS_LATENCY_MS&#39;, u&#39;cycle-collector-forget-skippable&#39;),
12719),
((u&#39;NOTIFY_OBSERVERS_LATENCY_MS&#39;, u&#39;inner-window-destroyed&#39;), 8619),
((u&#39;NOTIFY_OBSERVERS_LATENCY_MS&#39;, u&#39;tab-content-frameloader-created&#39;), 7924),
((u&#39;FX_SESSION_RESTORE_CONTENT_COLLECT_DATA_MS&#39;, u&#39;historychange&#39;), 7537),
((u&#39;FX_SESSION_RESTORE_CONTENT_COLLECT_DATA_MS&#39;, u&#39;pageStyle&#39;), 7390),
((u&#39;FX_SESSION_RESTORE_CONTENT_COLLECT_DATA_MS&#39;, u&#39;scroll&#39;), 7389),
((u&#39;FX_SESSION_RESTORE_CONTENT_COLLECT_DATA_MS&#39;, u&#39;storage&#39;), 7380),
((u&#39;IPC_WRITE_MAIN_THREAD_LATENCY_MS&#39;, u&#39;PLayerTransaction::Msg_Update&#39;),
6284),
((u&#39;SYNC_WORKER_OPERATION&#39;, u&#39;WorkerCheckAPIExposureOnMainThread&#39;), 4926),
((u&#39;NOTIFY_OBSERVERS_LATENCY_MS&#39;, u&#39;content-document-global-created&#39;), 4486),
((u&#39;IPC_SYNC_RECEIVE_MS&#39;, u&#39;???&#39;), 4227),
((u&#39;IPC_READ_MAIN_THREAD_LATENCY_MS&#39;, u&#39;PCompositorBridge::Msg_DidComposite&#39;),
3523),
((u&#39;NOTIFY_OBSERVERS_LATENCY_MS&#39;, u&#39;document-element-inserted&#39;), 3498),
((u&#39;IPC_WRITE_MAIN_THREAD_LATENCY_MS&#39;,
u&#39;PCompositorBridge::Msg_PTextureConstructor&#39;),
2231),
((u&#39;IPC_MESSAGE_SIZE&#39;, u&#39;PBrowser::Msg_AsyncMessage&#39;), 2083),
((u&#39;IPC_READ_MAIN_THREAD_LATENCY_MS&#39;, u&#39;PBrowser::Msg_AsyncMessage&#39;), 2031)]
</pre></div>
<p>The behaviour still exists, though this suggests that plugins and ipc messages are now less common. Instead we see more latency probes.</p>
</div>
</div>
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<h2> 0 Comments </h2>
<br>
<div>
<div class="row">
<div class="col-md-12">
<textarea class="form-control" id="comment-text" style="height:87px;" placeholder="Leave a comment..." aria-label="Comment"></textarea>
</div>
</div>
</div>
<br>
<div>
<button class="btn btn-primary" id="post_comment_btn">Post Comment</button>
</div>
</div>
</div>
<br>
<div class="row">
<div class="col-md-12">
</div>
<br>
</div>
</div>
</div>
</div>
<div class="footer">
Served with <span class="glyphicon glyphicon-heart"></span> by <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a> <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br />
<i title="Last checked for updates: 2 minutes ago">Last indexed: 14 hours ago</i>
</div>
<script type='text/javascript'>
// Navbar search box: typeahead suggestions plus Enter-to-search.
// Guarded because a page without a #searchbar element would otherwise throw on
// the [0].setSelectionRange access below and abort the rest of this script.
if ($("#searchbar").length) {
  // Put the caret after any pre-filled query (the range is clamped to the
  // end of the text for values shorter than 1000 characters).
  $("#searchbar")[0].setSelectionRange(1000, 1000);

  $('#searchbar').typeahead({
    hint: false,
    highlight: true,
    minLength: 1
  },
  {
    name: 'knowledge_posts',
    limit: 10,
    // Text placed in the input once a suggestion is chosen.
    display: function (item) {
      return item.title + " - " + item.author;
    },
    templates: {
      empty: Handlebars.compile(
        '<div class="tt-not-found">' +
        'Unable to find any posts that match the current query' +
        '</div>'
      ),
      suggestion: function (data) {
        // The title and author arrive from the search endpoint and are
        // concatenated into markup, so escape them first.
        var escape = Handlebars.Utils.escapeExpression;
        return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + escape(data.title) + '</strong> – ' + escape(data.author) + '</p>';
      }
    },
    // Asynchronous source: results are delivered through the `async` callback.
    source: function (q, sync, async) {
      // Encode the query so characters such as & # + cannot alter the URL.
      $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
        {
          success: function (data, status) { async(JSON.parse(data)); }
        });
    }
  });

  // Selecting a suggestion opens that post.
  $('#searchbar').bind('typeahead:select', function (obj, datum, name) {
    window.location = '/post/' + encodeURIComponent(datum.path);
  });

  // Pressing Enter runs a feed search with the typed text.
  $('#searchbar').keypress(function (event) {
    var keycode = (event.keyCode ? event.keyCode : event.which);
    if (keycode == '13') {
      window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
    }
  });

  // Size the suggestion menu from the search box width plus the menu's own
  // outer width.
  var padding = $('.tt-menu').outerWidth();
  $('.tt-menu').width($('#searchbar').width() + padding + "px");
}
</script>
<script src="/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// When the DOM is ready, initialise the tooltips for this post and make the
// rendered / raw / web-editor buttons navigate to the matching view.
$(document).ready(function () {
  var webeditorMode = false;
  var postId = "31";
  var id = "None";
  var postPath = "bug1381516.kp";
  var githubRoot = "";

  // The path is constant for the page, so encode it once up front.
  var encodedPath = encodeURI(postPath);

  tooltipsJx.initializeTooltips(webeditorMode, postId, id, githubRoot);

  $(".btn-rendered").on("click", function () {
    document.location.href = "/post/" + encodedPath;
  });

  $(".btn-raw").on("click", function () {
    document.location.href = "/post/" + encodedPath + "?render=raw";
  });

  $(".btn-webeditor").on("click", function () {
    document.location.href = "/edit/" + encodedPath;
  });
});
</script>
<script src="/static/js/helpers.js"></script>
<script src="/static/js/tags.js" type="text/javascript"></script>
<script src="/static/js/icons.js" type="text/javascript"></script>
<script src="/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration for the post body: TeX input located by the tex2jax
// preprocessor, rendered with the HTML-CSS output processor.
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math is delimited by $...$ or \(...\).
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Display math is delimited by $$...$$ or \[...\].
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// Per the MathJax tex2jax docs this lets \$ stand for a literal dollar sign
// instead of opening math; worth keeping since single $ is a delimiter here.
processEscapes: true
},
// Restrict the HTML-CSS output to the TeX font set.
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in 2017; load the 2.x build (required by the
     MathJax.Hub.Config call above) from cdnjs instead. -->
<script type="text/javascript"
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js">
</script>
<script>
// Post-page initialisation: mark the rendered view as active, linkify the
// headers, and wire the comment and tag-subscription controls.
// Registered with .ready() rather than .on('ready', ...): the event form was
// deprecated in jQuery 1.8, removed in 3.0, and never fires when it is bound
// after the document is already ready.
$(document).ready(function () {
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'bug1381516.kp';

  $("#post_comment_btn").on('click', function () {
    // Declared locally so they no longer leak onto the global object.
    var comment_author = 'knowledge_default';
    var post_author = 'chutten';
    var post_title = 'Bug 1381516 - How Bad Is Bug 1380880?';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): the reload runs immediately after the call; this is only
    // safe if postComment completes synchronously — confirm in comments.js.
    location.reload();
  });

  // Each delete button's id starts with "delete_comment"; the comment id is
  // the second "__"-separated piece of that id.
  $("[id^=delete_comment]").on("click", function () {
    var comment_id = this.id.split("__")[1];
    if (comment_id) {
      commentsJx.deleteComment(post_path, comment_id);
      location.reload();
    }
  });

  // Delegated on <body> so subscription buttons created later are handled too.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener($(this)[0]);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
$("h2, h3, h4, h5, h6").each(function () {
  var header = this;

  // Fallback anchor: the header's HTML reduced to letters, hyphens and
  // spaces, lower-cased, with spaces turned into hyphens.
  var slug = header.innerHTML
    .replace(/[^a-zA-Z\- ]/g, "")
    .toLowerCase()
    .split(" ")
    .join("-");

  // Prefer the header's own id when it has one: the derived slug drops digits
  // and picks up letters from any nested markup, so it can point at an anchor
  // that does not exist.
  var link = document.createElement("a");
  link.setAttribute("href", "#" + (header.id || slug));
  link.className = "link-reset";

  // Move the existing children into the link instead of re-parsing innerHTML.
  while (header.firstChild) {
    link.appendChild(header.firstChild);
  }
  header.appendChild(link);
});
//turn all the tags into links, similar to what's done on the feed page
// `tags`, `tags_list` and `subscriptions_list` stay top-level because the
// tag-editing handler later in this script reads them.
var tags = $("#renderedMarkdown .metadata .tags")[0];
var tags_list = ['investigation', 'keyed histograms', 'archaeology'];
var subscriptions_list = [];

// The metadata block may have no `.tags` element, in which case the lookup
// above yields undefined; skip tag rendering instead of throwing.
if (tags) {
  $.each(tags_list, function (i, tag) {
    // All per-tag values are local so they no longer leak as globals.
    var ahref = document.createElement("a");
    var e_tag = encodeURIComponent(tag);
    // NOTE(review): only the first "/" is replaced; confirm against the code
    // that decodes these ids before widening this to every occurrence.
    var f_tag = tag.replace("/", "__");
    var tag_name = "#" + tag;
    var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag;

    // Bootstrap popover configuration, carried as data attributes.
    ahref.setAttribute("data-container", "body");
    ahref.setAttribute("data-toggle", "popover");
    ahref.setAttribute("data-placement", "bottom");
    ahref.setAttribute("data-html", "true");
    ahref.setAttribute("data-tag-name", f_tag);

    // The popover body is a single subscribe or unsubscribe button whose id
    // matches the delegated "button[id^=tag-subscription]" click handler.
    if (subscriptions_list.indexOf(tag) >= 0) {
      ahref.setAttribute("class", "label label-subscribed pop");
      ahref.setAttribute("data-content", "<div class='content'>" +
        " <button class='btn btn-small btn-primary btn-unsubscribe'" +
        " title='' " +
        " id='" + tag_subscription_button_id_name + "'> " +
        " <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
        " </button> " +
        " </div>");
    } else {
      ahref.setAttribute("class", "label label-unsubscribed pop");
      ahref.setAttribute("data-content", "<div class='content'>" +
        " <button class='btn btn-small btn-default btn-subscribe'" +
        " title='' " +
        " id='" + tag_subscription_button_id_name + "'> " +
        " <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
        " </button> " +
        " </div>");
    }

    ahref.setAttribute("href", "/tag_pages?tag=" + e_tag);
    ahref.setAttribute("style", "font-weight:normal");

    // The first tag is preceded by the "Tags: " prefix.
    if (i == 0) {
      var colon = document.createElement("text");
      colon.innerHTML = "<b>Tags</b>: ";
      tags.appendChild(colon);
    }

    // Set as text (not HTML) so the tag name is never parsed as markup; the
    // first link keeps its leading space.
    ahref.textContent = (i == 0 ? " " : "") + tag_name;
    tags.appendChild(ahref);

    if (i != tags_list.length - 1) {
      var comma = document.createElement("text");
      comma.innerText = ", ";
      tags.appendChild(comma);
    }
  });

  // Remove the node that follows the tag container (presumably leftover
  // server-rendered tag text — TODO confirm), if there is one.
  if (tags.nextSibling) {
    tags.nextSibling.remove();
  }

  // Append the edit icon that opens the tag editor.
  tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>";
}

// Popovers are driven manually: shown on hover, and hidden when the pointer
// leaves both the tag and the popover itself.
$(".pop").popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on("mouseenter", function () {
    var _this = this;
    $(this).popover("show");
    $(".popover").on("mouseleave", function () {
      $(_this).popover('hide');
    });
  }).on("mouseleave", function () {
    var _this = this;
    // Give the pointer 300ms to reach the popover before hiding it.
    setTimeout(function () {
      if (!$(".popover:hover").length) {
        $(_this).popover("hide");
      }
    }, 300);
  });
// Click handler for the tag edit icon (#tooltip-edit_tags): replaces the tag
// list with an inline form and wires up saving.
// NOTE(review): the handler holds two implementations back to back — a
// plain-DOM form with inline validation and a direct POST, followed by a
// jQuery-built form that delegates to tagsJx.changeAndSaveTags. All of it,
// including the second click binding, runs on every click of the icon. This
// looks like two versions of the template merged together — confirm which one
// is intended before changing behaviour.
// None of the names assigned without `var` below are declared here, so they
// become implicit globals.
$('#tooltip-edit_tags').click(function(){
// Hide the edit icon while the form is shown.
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// Assigned but not read again inside this handler.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
// Save icon; its id is what the save click handler below is bound to.
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
// Text input pre-filled with the current comma-separated tags.
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Replaces every child of the tag container, tag links included, with a
// single space before the form is appended.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save: validate each comma-separated tag, POST the raw string, then reload.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// Overwrites the page-level tags_list with the edited values.
tags_list = tags_string.split(",")
// Allowed characters: letters, digits, "-", "_", ":" and "/". A tag with an
// inner space does not match this pattern.
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The emptiness check runs before trimming, so a whitespace-only entry
// reaches the pattern test below instead.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so the reload below runs only after
// the POST has completed.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=bug1381516.kp',
async: false
});
location.reload()
}
})
// Second implementation starts here; it still runs inside the outer handler.
// Removes the node that follows the tag container.
tags.nextSibling.remove()
// Allow user to edit tags
// NOTE(review): presumably createEditTagsIcon returns an element with id
// "tooltip-edit_tags", which the binding below relies on — confirm in icons.js.
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this sets a plain property on the
// wrapper and does not change the element; the .html() call above already
// set the label.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter in the input saves; returning false suppresses the default action.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "bug1381516.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "bug1381516.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// The "form" selector matches every form on the page, not just this one;
// returning false stops the native submit.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "bug1381516.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>
</body>
</html>

14
bug1381516.kp/report.json Normal file
Просмотреть файл

@ -0,0 +1,14 @@
{
"title": "Bug 1381516 - How Bad Is Bug 1380880?",
"authors": [
"chutten"
],
"tags": [
"investigation",
"keyed histograms",
"archaeology"
],
"publish_date": "2017-07-17",
"updated_at": "2017-07-17",
"tldr": "How broadly and how deeply do the effects of bug 1380880 extend?"
}

Просмотреть файл

@ -0,0 +1,16 @@
{
"title": "E10s Testing for Beta 51 week 4",
"authors": [
"rvitillo",
"dzeber",
"bmiroglio"
],
"tags": [
"e10s",
"experiment",
"add-ons"
],
"publish_date": "2017-01-10",
"updated_at": "2017-01-10",
"tldr": "Analysis of e10s experiment for profiles with and without add-ons"
}

Просмотреть файл

@ -0,0 +1,16 @@
{
"title": "E10s Testing for Beta 51 week 5",
"authors": [
"rvitillo",
"dzeber",
"bmiroglio"
],
"tags": [
"e10s",
"experiment",
"add-ons"
],
"publish_date": "2017-01-10",
"updated_at": "2017-01-10",
"tldr": "Analysis of e10s experiment for profiles with and without add-ons"
}

Просмотреть файл

@ -0,0 +1,16 @@
{
"title": "E10s Testing for Beta 51 week 6",
"authors": [
"rvitillo",
"dzeber",
"bmiroglio"
],
"tags": [
"e10s",
"experiment",
"add-ons"
],
"publish_date": "2017-01-10",
"updated_at": "2017-01-10",
"tldr": "Analysis of e10s experiment for profiles with and without add-ons"
}

Просмотреть файл

@ -0,0 +1,548 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
/* Loading spinner: fixed at the centre of the viewport. top/left 50% place
   its corner at the centre, and the negative margins pull it back by half
   its size. */
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
/* Font size for elements with the .table class. */
.table {
font-size: 14px;
}
/* Cap the width of .modal-content elements. */
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>Android Addons ETL job</h1>
<span class="authors"><a href="/feed?authors=Frank+Bertsch">Frank Bertsch</a></span>
<span class="date_created">February 17, 2017</span>
<span class="date_updated">(Last Updated: February 17, 2017)</span>
<span class="tldr"><p>This job takes the Fennec saved session pings and maps them to just client, submissionDate, activeAddons, and persona.</p></span>
</div>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">datetime</span> <span class="kn">as</span> <span class="nn">dt</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">operator</span>
<span class="kn">import</span> <span class="nn">ujson</span> <span class="kn">as</span> <span class="nn">json</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings</span><span class="p">,</span> <span class="n">get_pings_properties</span><span class="p">,</span> <span class="n">get_one_ping_per_client</span>
<span class="o">%</span><span class="n">pylab</span> <span class="n">inline</span>
</pre></div>
<p>Take the set of pings, make sure we have actual clientIds and remove duplicate pings.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">safe_str</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span>
<span class="sd">""" return the byte string representation of obj """</span>
<span class="k">if</span> <span class="n">obj</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">unicode</span><span class="p">(</span><span class="s2">""</span><span class="p">)</span>
<span class="k">return</span> <span class="nb">unicode</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">dedupe_pings</span><span class="p">(</span><span class="n">rdd</span><span class="p">):</span>
<span class="k">return</span> <span class="n">rdd</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="n">p</span><span class="p">[</span><span class="s2">"meta/clientId"</span><span class="p">]</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="s2">"meta/documentId"</span><span class="p">],</span> <span class="n">p</span><span class="p">))</span>\
<span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">dedupe_addons</span><span class="p">(</span><span class="n">rdd</span><span class="p">):</span>
<span class="k">return</span> <span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span> <span class="n">safe_str</span><span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span> <span class="o">+</span> <span class="n">safe_str</span><span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">3</span><span class="p">]),</span> <span class="n">p</span><span class="p">))</span>\
<span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
</pre></div>
<p>We're going to dump each event from the pings. Do a little empty data sanitization so we don't get NoneType errors during the dump. We create a JSON array of active experiments as part of the dump.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">clean</span><span class="p">(</span><span class="n">s</span><span class="p">):</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">s</span> <span class="o">=</span> <span class="n">s</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s2">"ascii"</span><span class="p">)</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>
<span class="k">return</span> <span class="n">s</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">s</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="k">else</span> <span class="bp">None</span>
<span class="k">except</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">None</span>
<span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="n">output</span> <span class="o">=</span> <span class="p">[]</span>
<span class="c1"># These should not be None since we filter those out &amp; ingestion process adds the data</span>
<span class="n">clientId</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"meta/clientId"</span><span class="p">]</span>
<span class="n">submissionDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">ping</span><span class="p">[</span><span class="s2">"meta/submissionDate"</span><span class="p">],</span> <span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">addonset</span> <span class="o">=</span> <span class="p">{}</span>
<span class="n">addons</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"environment/addons/activeAddons"</span><span class="p">]</span>
<span class="k">if</span> <span class="n">addons</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">for</span> <span class="n">addon</span><span class="p">,</span> <span class="n">desc</span> <span class="ow">in</span> <span class="n">addons</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">clean</span><span class="p">(</span><span class="n">desc</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"name"</span><span class="p">,</span> <span class="bp">None</span><span class="p">))</span>
<span class="k">if</span> <span class="n">name</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">addonset</span><span class="p">[</span><span class="n">name</span><span class="p">]</span> <span class="o">=</span> <span class="mi">1</span>
<span class="n">persona</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"environment/addons/persona"</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">addonset</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">or</span> <span class="n">persona</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">addonarray</span> <span class="o">=</span> <span class="bp">None</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">addonset</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">addonarray</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">addonset</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="n">output</span><span class="o">.</span><span class="n">append</span><span class="p">([</span><span class="n">clientId</span><span class="p">,</span> <span class="n">submissionDate</span><span class="p">,</span> <span class="n">addonarray</span><span class="p">,</span> <span class="n">persona</span><span class="p">])</span>
<span class="k">return</span> <span class="n">output</span>
</pre></div>
<p>Create a set of events from “saved-session” UI telemetry. Output the data to CSV or Parquet.</p>
<p>This script is designed to loop over a range of days and output a single day for the given channels. Use explicit date ranges for backfilling, or now() - 1day for automated runs.</p>
<div class="codehilite"><pre><span></span><span class="n">channels</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"nightly"</span><span class="p">,</span> <span class="s2">"aurora"</span><span class="p">,</span> <span class="s2">"beta"</span><span class="p">,</span> <span class="s2">"release"</span><span class="p">]</span>
<span class="n">batch_date</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'date'</span><span class="p">)</span>
<span class="k">if</span> <span class="n">batch_date</span><span class="p">:</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">batch_date</span><span class="p">,</span> <span class="s1">'%Y%m</span><span class="si">%d</span><span class="s1">'</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span> <span class="o">-</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="n">day</span> <span class="o">=</span> <span class="n">start</span>
<span class="k">while</span> <span class="n">day</span> <span class="o">&lt;=</span> <span class="n">end</span><span class="p">:</span>
<span class="k">for</span> <span class="n">channel</span> <span class="ow">in</span> <span class="n">channels</span><span class="p">:</span>
<span class="k">print</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">channel: "</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">", date: "</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">pings</span> <span class="o">=</span> <span class="n">get_pings</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">app</span><span class="o">=</span><span class="s2">"Fennec"</span><span class="p">,</span> <span class="n">channel</span><span class="o">=</span><span class="n">channel</span><span class="p">,</span>
<span class="n">submission_date</span><span class="o">=</span><span class="p">(</span><span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">),</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)),</span>
<span class="n">build_id</span><span class="o">=</span><span class="p">(</span><span class="s2">"20100101000000"</span><span class="p">,</span> <span class="s2">"99999999999999"</span><span class="p">),</span>
<span class="n">fraction</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="p">[</span><span class="s2">"meta/clientId"</span><span class="p">,</span>
<span class="s2">"meta/documentId"</span><span class="p">,</span>
<span class="s2">"meta/submissionDate"</span><span class="p">,</span>
<span class="s2">"environment/addons/activeAddons"</span><span class="p">,</span>
<span class="s2">"environment/addons/persona"</span><span class="p">])</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">dedupe_pings</span><span class="p">(</span><span class="n">subset</span><span class="p">)</span>
<span class="k">print</span> <span class="n">subset</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">rawAddons</span> <span class="o">=</span> <span class="n">subset</span><span class="o">.</span><span class="n">flatMap</span><span class="p">(</span><span class="n">transform</span><span class="p">)</span>
<span class="k">print</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">rawAddons count: "</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">rawAddons</span><span class="o">.</span><span class="n">count</span><span class="p">())</span>
<span class="k">print</span> <span class="n">rawAddons</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">uniqueAddons</span> <span class="o">=</span> <span class="n">dedupe_addons</span><span class="p">(</span><span class="n">rawAddons</span><span class="p">)</span>
<span class="k">print</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">uniqueAddons count: "</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">uniqueAddons</span><span class="o">.</span><span class="n">count</span><span class="p">())</span>
<span class="k">print</span> <span class="n">uniqueAddons</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">s3_output</span> <span class="o">=</span> <span class="s2">"s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mobile/android_addons"</span>
<span class="n">s3_output</span> <span class="o">+=</span> <span class="s2">"/v1/channel="</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">"/submission="</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"clientid"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"submissiondate"</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"addons"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"lwt"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">)</span>
<span class="p">])</span>
<span class="n">grouped</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">uniqueAddons</span><span class="p">,</span> <span class="n">schema</span><span class="p">)</span>
<span class="n">grouped</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">s3_output</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"overwrite"</span><span class="p">)</span>
<span class="n">day</span> <span class="o">+=</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
</div></body></html>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
// Site-wide search box: typeahead suggestions while typing, plus a full
// search when Enter is pressed. Relies on jQuery, typeahead.js and Handlebars
// having been loaded earlier on the page.

// Put the caret at the end of the search box (offset 1000 is past any realistic query).
$("#searchbar")[0].setSelectionRange(1000, 1000);

$('#searchbar').typeahead({
    hint: false,
    highlight: true,
    minLength: 1
},
{
    name: 'knowledge_posts',
    limit: 10,
    // Plain-text form of a suggestion.
    display: function (item) {
        return item.title + " - " + item.author;
    },
    templates: {
        empty: Handlebars.compile(
            '<div class="tt-not-found">' +
            'Unable to find any posts that match the current query' +
            '</div>'
        ),
        // title and author come from the server response, so they are
        // HTML-escaped before being concatenated into markup.
        suggestion: function (data) {
            return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' +
                Handlebars.escapeExpression(data.title) + '</strong> – ' +
                Handlebars.escapeExpression(data.author) + '</p>';
        }
    },
    // Fetch suggestions from the server; results are delivered through the
    // asynchronous callback only.
    source: function (q, sync, async) {
        // Encode the query so characters such as "&", "#" or "+" cannot
        // truncate or alter the request URL.
        $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q), {
            success: function (data, status) { async(JSON.parse(data)); }
        });
    }
});

// Picking a suggestion opens that post.
$('#searchbar').bind('typeahead:select', function (obj, datum, name) {
    window.location = '/post/' + encodeURIComponent(datum.path);
});

// Pressing Enter runs a feed search with the typed text as the filter.
$('#searchbar').keypress(function (event) {
    var keycode = (event.keyCode ? event.keyCode : event.which);
    if (keycode === 13) {
        // Encoded for the same reason as the typeahead query above.
        window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
    }
});

// Widen the suggestion menu by the search box's width.
var padding = $('.tt-menu').outerWidth();
$('.tt-menu').width($('#searchbar').width() + padding + "px");
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// When the DOM is ready: initialise the post tooltips and wire up the
// buttons that switch between the rendered, raw and web-editor views.
$("document").ready(function () {
    var encodedPostPath = encodeURI("etl/android-addons.kp");

    // Arguments, in order: is_webeditor, post_id, id, data_repo_github_root.
    tooltipsJx.initializeTooltips(false, "10", "None", "");

    // Builds a click handler that navigates the browser to `url`.
    var navigateTo = function (url) {
        return function () {
            document.location.href = url;
        };
    };

    $(".btn-rendered").on("click", navigateTo("/post/" + encodedPostPath));
    $(".btn-raw").on("click", navigateTo("/post/" + encodedPostPath + "?render=raw"));
    $(".btn-webeditor").on("click", navigateTo("/edit/" + encodedPostPath));
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration for math embedded in the post. It lives in a
// text/x-mathjax-config script, ahead of the MathJax.js include.
MathJax.Hub.Config({
// tex2jax is the extension that is configured by the `tex2jax` section below
extensions: ["tex2jax.js"],
// TeX as the input format, HTML-CSS as the output format
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// inline math delimiters: $...$ and \(...\)
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// display math delimiters: $$...$$ and \[...\]
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// NOTE(review): presumably lets "\$" stand for a literal dollar sign now that
// "$" is a math delimiter — confirm against the MathJax tex2jax options
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js" type="text/javascript">
</script>
<script>
// Post-page setup that needs the DOM: active-button state, header links,
// comment posting/deleting and tag-subscription buttons.
//
// Registered with .ready() instead of .on('ready', ...): binding the 'ready'
// event was deprecated in jQuery 1.8 and removed in jQuery 3.0, where a
// handler bound that way is never called.
$(document).ready(function () {
    // Make the Rendered Markdown Button active
    $(".btn-rendered").addClass("btn-active");

    // Initialize headers
    helpersJx.linkifyHeaders();

    // Initialize comments
    var post_path = 'etl/android-addons.kp';

    $("#post_comment_btn").on('click', function () {
        // Declared with `var` so they no longer leak as implicit globals.
        var comment_author = 'knowledge_default';
        var post_author = 'Frank Bertsch';
        var post_title = 'Android Addons ETL job';
        commentsJx.postComment(comment_author, post_author, post_title, post_path);
        location.reload();
    });

    var all_comment_delete_buttons = $("[id^=delete_comment]");
    $.each(all_comment_delete_buttons, function (i, v) {
        $(v).on("click", function () {
            // The comment id is the part of the element id after the first "__".
            var id = v.id;
            var comment_id = id.split("__")[1];
            if (comment_id) {
                commentsJx.deleteComment(post_path, comment_id);
                location.reload();
            }
        });
    });

    // Delegated from <body> so the handler applies to any matching button,
    // whenever it is added to the page.
    $(document.body).on('click', "button[id^=tag-subscription]", function () {
        tagsJx.addTagSubscriptionListener($(this)[0]);
    });
});
// Turn every h2-h6 heading into a link to its own anchor.
// h1 is the post title, so it is deliberately left out.
// The anchor is the heading's inner HTML with everything except letters,
// hyphens and spaces removed, lower-cased, with spaces replaced by hyphens.
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")];
$.each(all_headers, function (index, value) {
    $.each(value, function (i, v) {
        var inner_html = v.innerHTML;
        // Declared with `var`: this was previously an implicit global.
        var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "");
        var inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-");
        v.innerHTML = "<a href='" + inner_link + "' class=link-reset>" + inner_html + "</a>";
    });
});
// Render the post's tags as links, similar to what's done on the feed page.
// Each tag link carries a popover holding a Subscribe or Unsubscribe button.
//
// `tags`, `tags_list` and `subscriptions_list` stay top-level on purpose:
// the tag-editing handlers later in this script read and reassign them.
var tags = $("#renderedMarkdown .metadata .tags")[0];
var tags_list = ['mobile', 'etl'];
var subscriptions_list = [];

$.each(tags_list, function (i, tag) {
    // Per-tag values are declared with `var`; they used to leak as implicit
    // globals that were overwritten on every iteration.
    var ahref = document.createElement("a");
    var e_tag = encodeURIComponent(tag);
    // NOTE(review): a string pattern replaces only the first "/" in the tag —
    // confirm whether tags with several slashes should have all of them replaced.
    var f_tag = tag.replace("/", "__");
    var tag_name = "#" + tag;
    var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag;

    ahref.setAttribute("data-container", "body");
    ahref.setAttribute("data-toggle", "popover");
    ahref.setAttribute("data-placement", "bottom");
    ahref.setAttribute("data-html", "true");
    ahref.setAttribute("data-tag-name", f_tag);

    if (subscriptions_list.indexOf(tag) >= 0) {
        // Tag is in the subscription list: offer to unsubscribe.
        ahref.setAttribute("class", "label label-subscribed pop");
        ahref.setAttribute("data-content", "<div class='content'>" +
            " <button class='btn btn-small btn-primary btn-unsubscribe'" +
            " title='' " +
            " id='" + tag_subscription_button_id_name + "'> " +
            " <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
            " </button> " +
            " </div>");
    } else {
        // Tag is not in the subscription list: offer to subscribe.
        ahref.setAttribute("class", "label label-unsubscribed pop");
        ahref.setAttribute("data-content", "<div class='content'>" +
            " <button class='btn btn-small btn-default btn-subscribe'" +
            " title='' " +
            " id='" + tag_subscription_button_id_name + "'> " +
            " <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
            " </button> " +
            " </div>");
    }

    ahref.setAttribute("href", "/tag_pages?tag=" + e_tag);
    ahref.setAttribute("style", "font-weight:normal");

    if (i == 0) {
        // The first tag is preceded by the "Tags: " label.
        ahref.innerHTML = " ";
        var colon = document.createElement("text");
        colon.innerHTML = "<b>Tags</b>: ";
        tags.appendChild(colon);
    }
    ahref.innerHTML = ahref.innerHTML + tag_name;
    tags.appendChild(ahref);

    if (i != tags_list.length - 1) {
        // Comma separator between tags, but not after the last one.
        var comma = document.createElement("text");
        comma.innerText = ", ";
        tags.appendChild(comma);
    }
});

// Drop the node that directly follows the tags container.
tags.nextSibling.remove();
// Append the edit icon. Assigning innerHTML rebuilds the child elements, so
// this has to happen before any event handlers are attached to the tag links.
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>";
// Hover-driven popovers for every ".pop" element. The popover is opened and
// closed manually so that it stays visible while the pointer is over the
// popover itself, not only over the element that triggered it.
$(".pop")
    .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
    .on({
        mouseenter: function () {
            var $trigger = $(this);
            $trigger.popover("show");
            // Close again once the pointer leaves the popover bubble.
            $(".popover").on("mouseleave", function () {
                $trigger.popover('hide');
            });
        },
        mouseleave: function () {
            var $trigger = $(this);
            // Wait briefly so the pointer has time to move onto the popover;
            // hide only if no popover is being hovered by then.
            setTimeout(function () {
                if ($(".popover:hover").length === 0) {
                    $trigger.popover("hide");
                }
            }, 300);
        }
    });
// Clicking the edit-tags icon swaps the tag list for an inline form: a "Tags: "
// label, a text input pre-filled with the current tags, and a save icon.
// NOTE(review): this handler contains two tag-editor implementations (a plain-DOM
// one followed by a jQuery/iconsJx/tagsJx one); it looks like two template
// versions were merged — confirm which one is intended.
$('#tooltip-edit_tags').click(function(){
// hide the edit icon while the form is shown
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// NOTE(review): previousSibling is assigned but not read again in this handler;
// this and the assignments below have no `var`, so they create globals
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// replaces everything currently inside the tags container (links and edit icon)
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate each comma-separated tag, then POST the raw input string
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// reassigns the top-level tags_list
tags_list = tags_string.split(",")
// allowed characters: letters, digits, "-", "_", ":" and "/"
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// the length check runs before trim(), so a whitespace-only entry falls
// through to the regex check below
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/android-addons.kp',
// synchronous request — presumably so the reload below only happens once
// the save has completed; TODO confirm
async: false
});
location.reload()
}
})
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// NOTE(review): this registers a further click handler for #tooltip-edit_tags
// every time the outer handler runs — confirm that is intended
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// NOTE(review): tags_text is a jQuery object here, so this sets a property on
// the wrapper rather than on the element; the label text was already set by
// .html() above
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter in the input saves the tags; returning false stops the key's default action
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-addons.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// clicking the save icon saves the tags
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-addons.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// NOTE(review): the "form" selector matches every form on the page, not only
// the one built above — confirm
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-addons.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,666 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="/static/modules/jquery/jquery.min.js"></script>
<script src="/static/modules/tether/js/tether.min.js"></script>
<script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="/static/modules/handlebars/js/handlebars.js"></script>
<script src="/static/js/helpers.js"></script>
<script src="/static/modules/select2/js/select2.min.js"></script>
<script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link rel="stylesheet" href="/static/modules/bootstrap/css/bootstrap.min.css">
<link rel="stylesheet" href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css">
<link rel="stylesheet" type='text/css' href="/static/css/custom.css">
<link rel="stylesheet" type='text/css' href="/static/modules/select2/css/select2.min.css">
<link href='https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700' rel='stylesheet' type='text/css'>
<link rel="shortcut icon" href="/static/images/favicon.png">
<link rel="stylesheet" href="/static/css/codehilite-friendly.css">
<style>
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="navbar navbar-knowledge" role="navigation">
<div class="container page-container">
<a href="/" aria-selected="false" class="logo-image navbar-text" style='margin-top:16px; margin-left: -7px'>
<img width="125" src='/static/images/logo-white.svg' alt="Knowledge home">
</a>
<a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Home
</a>
<a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Favorites
</a>
<a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style='margin-top:18px'>
About
</a>
<a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Stats
</a>
<a id="webposts_tab" href="/create"
class="btn btn-primary navbar-text pull-right"
style='border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699'>
Write a Post!
</a>
<div class="pull-right">
<div class="form-group">
<input class="form-control" id="searchbar" placeholder="Search for Knowledge" style="text-align:right">
</div>
</div>
</div>
</div>
<div class="container page-container">
<br>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
<div class="btn-group" role="group">
<button type="button" class="btn btn-default btn-rendered">
View Post
</button>
<button type="button" class="btn btn-default btn-raw">
View Raw Markdown
</button>
</div>
</div>
<div class="col-md-2">
</div>
<div class="col-md-4 text-right">
<i class="glyphicon glyphicon-eye-open" id="pageview_stats" style='color: #9CA299'></i>
<div id="pageview_stats" style="display: inline-block">
Viewed 2 times by 1 different users
</div>
<i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop" style="font-size:16pt" id="tooltip-like" data-placement="bottom"
data-trigger="#tooltip-like"
data-container="body"
data-toggle="popover"
data-content='<div>Like This Post</div>'></i>
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class='metadata'>
<h1>Android Addons ETL job</h1>
<span class='authors'><a href='/feed?authors=Frank+Bertsch'>Frank Bertsch</a></span>
<span class='date_created'>February 17, 2017</span>
<span class='date_updated'>(Last Updated: February 17, 2017)</span>
<span class='tldr'><p>This job takes the Fennec saved session pings and maps them to just client, submissionDate, activeAddons, and persona.</p></span>
<span class='tags'></span>
</div>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">datetime</span> <span class="kn">as</span> <span class="nn">dt</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">operator</span>
<span class="kn">import</span> <span class="nn">ujson</span> <span class="kn">as</span> <span class="nn">json</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings</span><span class="p">,</span> <span class="n">get_pings_properties</span><span class="p">,</span> <span class="n">get_one_ping_per_client</span>
<span class="o">%</span><span class="n">pylab</span> <span class="n">inline</span>
</pre></div>
<p>Take the set of pings, make sure we have actual clientIds and remove duplicate pings.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">safe_str</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot; return the byte string representation of obj &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">obj</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">unicode</span><span class="p">(</span><span class="s2">&quot;&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="nb">unicode</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">dedupe_pings</span><span class="p">(</span><span class="n">rdd</span><span class="p">):</span>
<span class="k">return</span> <span class="n">rdd</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="n">p</span><span class="p">[</span><span class="s2">&quot;meta/clientId&quot;</span><span class="p">]</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="s2">&quot;meta/documentId&quot;</span><span class="p">],</span> <span class="n">p</span><span class="p">))</span>\
<span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">dedupe_addons</span><span class="p">(</span><span class="n">rdd</span><span class="p">):</span>
<span class="k">return</span> <span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span> <span class="n">safe_str</span><span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span> <span class="o">+</span> <span class="n">safe_str</span><span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">3</span><span class="p">]),</span> <span class="n">p</span><span class="p">))</span>\
<span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
</pre></div>
<p>We&rsquo;re going to dump the add-on data from each ping. Do a little empty data sanitization so we don&rsquo;t get NoneType errors during the dump. We create a JSON array of active add-on names as part of the dump.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">clean</span><span class="p">(</span><span class="n">s</span><span class="p">):</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">s</span> <span class="o">=</span> <span class="n">s</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s2">&quot;ascii&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>
<span class="k">return</span> <span class="n">s</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">s</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="k">else</span> <span class="bp">None</span>
<span class="k">except</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">None</span>
<span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="n">output</span> <span class="o">=</span> <span class="p">[]</span>
<span class="c1"># These should not be None since we filter those out &amp; ingestion process adds the data</span>
<span class="n">clientId</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;meta/clientId&quot;</span><span class="p">]</span>
<span class="n">submissionDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">ping</span><span class="p">[</span><span class="s2">&quot;meta/submissionDate&quot;</span><span class="p">],</span> <span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">addonset</span> <span class="o">=</span> <span class="p">{}</span>
<span class="n">addons</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;environment/addons/activeAddons&quot;</span><span class="p">]</span>
<span class="k">if</span> <span class="n">addons</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">for</span> <span class="n">addon</span><span class="p">,</span> <span class="n">desc</span> <span class="ow">in</span> <span class="n">addons</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">clean</span><span class="p">(</span><span class="n">desc</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;name&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">))</span>
<span class="k">if</span> <span class="n">name</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">addonset</span><span class="p">[</span><span class="n">name</span><span class="p">]</span> <span class="o">=</span> <span class="mi">1</span>
<span class="n">persona</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;environment/addons/persona&quot;</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">addonset</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">or</span> <span class="n">persona</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">addonarray</span> <span class="o">=</span> <span class="bp">None</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">addonset</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">addonarray</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">addonset</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="n">output</span><span class="o">.</span><span class="n">append</span><span class="p">([</span><span class="n">clientId</span><span class="p">,</span> <span class="n">submissionDate</span><span class="p">,</span> <span class="n">addonarray</span><span class="p">,</span> <span class="n">persona</span><span class="p">])</span>
<span class="k">return</span> <span class="n">output</span>
</pre></div>
<p>Create a set of add-on records from Fennec &ldquo;saved-session&rdquo; pings. Output the data to Parquet.</p>
<p>This script is designed to loop over a range of days and output a single day for the given channels. Use explicit date ranges for backfilling, or now() - &lsquo;1day&rsquo; for automated runs.</p>
<div class="codehilite"><pre><span></span><span class="n">channels</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;nightly&quot;</span><span class="p">,</span> <span class="s2">&quot;aurora&quot;</span><span class="p">,</span> <span class="s2">&quot;beta&quot;</span><span class="p">,</span> <span class="s2">&quot;release&quot;</span><span class="p">]</span>
<span class="n">batch_date</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;date&#39;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">batch_date</span><span class="p">:</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">batch_date</span><span class="p">,</span> <span class="s1">&#39;%Y%m</span><span class="si">%d</span><span class="s1">&#39;</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span> <span class="o">-</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="n">day</span> <span class="o">=</span> <span class="n">start</span>
<span class="k">while</span> <span class="n">day</span> <span class="o">&lt;=</span> <span class="n">end</span><span class="p">:</span>
<span class="k">for</span> <span class="n">channel</span> <span class="ow">in</span> <span class="n">channels</span><span class="p">:</span>
<span class="k">print</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">channel: &quot;</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">&quot;, date: &quot;</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">pings</span> <span class="o">=</span> <span class="n">get_pings</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">app</span><span class="o">=</span><span class="s2">&quot;Fennec&quot;</span><span class="p">,</span> <span class="n">channel</span><span class="o">=</span><span class="n">channel</span><span class="p">,</span>
<span class="n">submission_date</span><span class="o">=</span><span class="p">(</span><span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">),</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)),</span>
<span class="n">build_id</span><span class="o">=</span><span class="p">(</span><span class="s2">&quot;20100101000000&quot;</span><span class="p">,</span> <span class="s2">&quot;99999999999999&quot;</span><span class="p">),</span>
<span class="n">fraction</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;meta/clientId&quot;</span><span class="p">,</span>
<span class="s2">&quot;meta/documentId&quot;</span><span class="p">,</span>
<span class="s2">&quot;meta/submissionDate&quot;</span><span class="p">,</span>
<span class="s2">&quot;environment/addons/activeAddons&quot;</span><span class="p">,</span>
<span class="s2">&quot;environment/addons/persona&quot;</span><span class="p">])</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">dedupe_pings</span><span class="p">(</span><span class="n">subset</span><span class="p">)</span>
<span class="k">print</span> <span class="n">subset</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">rawAddons</span> <span class="o">=</span> <span class="n">subset</span><span class="o">.</span><span class="n">flatMap</span><span class="p">(</span><span class="n">transform</span><span class="p">)</span>
<span class="k">print</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">rawAddons count: &quot;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">rawAddons</span><span class="o">.</span><span class="n">count</span><span class="p">())</span>
<span class="k">print</span> <span class="n">rawAddons</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">uniqueAddons</span> <span class="o">=</span> <span class="n">dedupe_addons</span><span class="p">(</span><span class="n">rawAddons</span><span class="p">)</span>
<span class="k">print</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">uniqueAddons count: &quot;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">uniqueAddons</span><span class="o">.</span><span class="n">count</span><span class="p">())</span>
<span class="k">print</span> <span class="n">uniqueAddons</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">s3_output</span> <span class="o">=</span> <span class="s2">&quot;s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mobile/android_addons&quot;</span>
<span class="n">s3_output</span> <span class="o">+=</span> <span class="s2">&quot;/v1/channel=&quot;</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">&quot;/submission=&quot;</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;clientid&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;submissiondate&quot;</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;addons&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;lwt&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">)</span>
<span class="p">])</span>
<span class="n">grouped</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">uniqueAddons</span><span class="p">,</span> <span class="n">schema</span><span class="p">)</span>
<span class="n">grouped</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">s3_output</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">&quot;overwrite&quot;</span><span class="p">)</span>
<span class="n">day</span> <span class="o">+=</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
<!-- Comments: heading, comment box and submit button. The ids comment-text and
     post_comment_btn are kept unchanged because the page scripts look them up. -->
<div class="container-fluid">
  <div class="row">
    <div class="col-md-12">
      <h2> 0 Comments </h2>
      <br>
      <div>
        <div class="row">
          <div class="col-md-12">
            <!-- placeholder text is not a label, so give the control an accessible name -->
            <textarea class="form-control" id="comment-text" style="height:87px;" placeholder="Leave a comment..." aria-label="Comment"></textarea>
          </div>
        </div>
      </div>
      <br>
      <div>
        <button class="btn btn-primary" id="post_comment_btn" type="button">Post Comment</button>
      </div>
    </div>
  </div>
  <br>
  <div class="row">
    <div class="col-md-12">
    </div>
    <br>
  </div>
</div>
</div>
</div>
<!-- Page footer: attribution and index freshness. The heart icon carries meaning
     in the sentence, so it is exposed to assistive technology as an image. -->
<div class="footer">
  Served with <span class="glyphicon glyphicon-heart" role="img" aria-label="love"></span> by <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a> <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br>
  <i title="Last checked for updates: 4 minutes ago">Last indexed: 14 hours ago</i>
</div>
<script>
// Search bar behaviour:
//   * typeahead suggestions fetched from /ajax/index/typeahead
//   * selecting a suggestion opens that post
//   * pressing Enter opens the feed filtered by the typed text

// Move the caret to the end of any pre-filled query. Guarded so that a page
// without a search bar does not abort the rest of this script.
if ($("#searchbar").length) {
  $("#searchbar")[0].setSelectionRange(1000, 1000);
}

$('#searchbar').typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: 'knowledge_posts',
  limit: 10,
  // Text placed in the input when a suggestion is chosen.
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
      'Unable to find any posts that match the current query' +
      '</div>'
    ),
    // The returned string is inserted as HTML, so the server-provided fields
    // are escaped before being concatenated into the markup.
    suggestion: function(data) {
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' +
        Handlebars.escapeExpression(data.title) + '</strong> – ' +
        Handlebars.escapeExpression(data.author) + '</p>';
    }
  },
  // Asynchronous source: results are delivered through the async callback.
  source: function(q, sync, async) {
    // Encode the query so characters such as & # + survive as part of the value.
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
    {
      success: function(data,status){ async(JSON.parse(data)); }
    })
  }
});

// Choosing a suggestion navigates to the corresponding post.
$('#searchbar').on('typeahead:select', function(obj, datum, name) {
  window.location = '/post/'+encodeURIComponent(datum.path);
});

// Enter (key code 13) runs a feed search with the current input text.
$('#searchbar').keypress(function(event){
  var keycode = (event.keyCode ? event.keyCode : event.which);
  if (Number(keycode) === 13) {
    window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
  }
});

// Size the suggestion menu relative to the search bar width.
var padding = $('.tt-menu').outerWidth()
$('.tt-menu').width($('#searchbar').width() + padding + "px")
</script>
<script src="/static/js/tooltips.js" type="text/javascript"></script>
<script>
// Once the DOM is ready, initialise tooltips and wire the three view-mode
// buttons (rendered / raw / web editor) to their URLs for this post.
$(document).ready(function () {
  var post_path = "etl/android-addons.kp";
  var is_webeditor = false,
      post_id = "10",
      id = "None",
      data_repo_github_root = "";

  tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);

  // Navigate to the given URL prefix + encoded post path + optional suffix.
  function go(prefix, suffix) {
    document.location.href = prefix + encodeURI(post_path) + (suffix || "");
  }

  $(".btn-rendered").on("click", function () { go("/post/"); });
  $(".btn-raw").on("click", function () { go("/post/", "?render=raw"); });
  $(".btn-webeditor").on("click", function () { go("/edit/"); });
});
</script>
<script src="/static/js/helpers.js"></script>
<script src="/static/js/tags.js" type="text/javascript"></script>
<script src="/static/js/icons.js" type="text/javascript"></script>
<script src="/static/js/comments.js" type="text/javascript"></script>
<!-- MathJax: TeX input rendered as HTML-CSS, with $...$ and \(...\) as inline
     delimiters and $$...$$ and \[...\] as display delimiters. -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
  extensions: ["tex2jax.js"],
  jax: ["input/TeX", "output/HTML-CSS"],
  tex2jax: {
    inlineMath: [ ['$','$'], ["\\(","\\)"] ],
    displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
    processEscapes: true
  },
  "HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- Loaded from cdnjs: the cdn.mathjax.org host has been retired. The version is
     pinned to a 2.x release because the configuration above uses the v2 API. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js">
</script>
<script>
// DOM-ready initialisation for the post page: marks the active view button,
// linkifies headers, and wires the comment and tag-subscription buttons.
// Uses .ready() rather than a "ready" event binding, which jQuery 3 no longer fires.
$(document).ready(function(){
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");
  // Initialize headers
  helpersJx.linkifyHeaders();
  // Initialize comments
  var post_path = 'etl/android-addons.kp';
  $("#post_comment_btn").on('click', function(){
    var comment_author = 'knowledge_default';
    var post_author = 'Frank Bertsch';
    var post_title = 'Android Addons ETL job';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): the page reloads right after postComment returns; whether
    // the request has finished by then depends on commentsJx - confirm.
    location.reload();
  });
  // Delete buttons carry ids of the form "delete_comment...__<comment id>".
  var all_comment_delete_buttons = $("[id^=delete_comment]")
  $.each(all_comment_delete_buttons, function(i,v){
    $(v).on("click", function(){
      var id = v.id;
      var comment_id = id.split("__")[1];
      if(comment_id) {
        commentsJx.deleteComment(post_path, comment_id)
        location.reload();
      }
    });
  });
  // Delegated on <body> so it also covers subscription buttons created later
  // (they live inside popover content).
  $(document.body).on('click',"button[id^=tag-subscription]",function () {
    tagsJx.addTagSubscriptionListener($(this)[0]);
  });
})
//Turn all the headers to be links
//h1 = Title, don't want that
// Each h2-h6 has its contents wrapped in an in-page anchor. The fragment is the
// heading's inner HTML with everything except letters, hyphens and spaces
// removed, lowercased, and with spaces replaced by hyphens.
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function(index, value){
  $.each(value, function(i, v){
    var inner_html = v.innerHTML
    // Kept local so it does not become an implicit global.
    var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "")
    var inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-")
    v.innerHTML = "<a href='" + inner_link + "' class=link-reset>" + inner_html + "</a>"
  })
})
//turn all the tags into links, similar to what's done on the feed page
// Builds one <a> per entry of tags_list inside the first
// "#renderedMarkdown .metadata .tags" element. Each link points at
// /tag_pages?tag=<tag> and carries Bootstrap popover data attributes whose
// content is a Subscribe or Unsubscribe button.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['mobile', 'etl']
// Empty on this page, so every tag takes the "unsubscribed" branch below.
var subscriptions_list = []
$.each(tags_list, function(i,tag){
// NOTE: ahref, e_tag, f_tag, tag_name, tag_subscription_button_id_name, colon
// and comma are assigned without var and therefore become globals.
ahref = document.createElement("a")
e_tag = encodeURIComponent(tag)
// A string pattern replaces only the first "/" in the tag.
f_tag = tag.replace("/", "__")
tag_name = "#" + tag
// Button id format: tag-subscription-<index>__<f_tag>
tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
ahref.setAttribute("data-container", "body")
ahref.setAttribute("data-toggle", "popover")
ahref.setAttribute("data-placement", "bottom")
ahref.setAttribute("data-html", "true")
ahref.setAttribute("data-tag-name", f_tag)
if (subscriptions_list.indexOf(tag) >= 0) {
// Already subscribed: popover offers an Unsubscribe button.
ahref.setAttribute("class", "label label-subscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-primary btn-unsubscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
" </button> " +
" </div>")
} else {
// Not subscribed: popover offers a Subscribe button.
ahref.setAttribute("class", "label label-unsubscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-default btn-subscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
" </button> " +
" </div>")
}
ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
ahref.setAttribute("style", "font-weight:normal")
// Before the first tag, emit the "Tags: " prefix.
if (i == 0){
ahref.innerHTML = " "
colon = document.createElement("text")
colon.innerHTML = "<b>Tags</b>: "
tags.appendChild(colon)
}
ahref.innerHTML = ahref.innerHTML + tag_name
tags.appendChild(ahref)
// Separate tags with ", " except after the last one.
if (i != tags_list.length - 1){
comma = document.createElement("text")
comma.innerText = ", "
tags.appendChild(comma)
}
})
// Drop the node that follows the tag container, then append the edit-tags icon.
// Re-assigning innerHTML re-parses the container, so popovers are bound afterwards.
tags.nextSibling.remove();
tags.innerHTML = tags.innerHTML + "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>";

// Hover-driven popovers on the tag links: shown on mouseenter, hidden when the
// pointer leaves the popover, or 300 ms after leaving the tag if the pointer is
// not over a popover at that moment.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on({
    mouseenter: function () {
      var trigger = this;
      $(trigger).popover("show");
      $(".popover").on("mouseleave", function () {
        $(trigger).popover('hide');
      });
    },
    mouseleave: function () {
      var trigger = this;
      setTimeout(function () {
        if ($(".popover:hover").length === 0) {
          $(trigger).popover("hide");
        }
      }, 300);
    }
  });
// Click on the edit-tags icon: replaces the tag links with an inline form for
// editing the comma-separated tag list.
// NOTE(review): this handler contains two implementations. The first builds the
// form by hand and posts to /tag_list; further down, still inside this handler,
// a second click handler for the same icon is registered that delegates to
// iconsJx / tagsJx. This looks like two template versions merged together -
// confirm which one is intended before changing anything here.
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// NOTE: previousSibling, tags_string, form, input, tags_text and icon_class are
// assigned without var and therefore become globals.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Clears the container (including the tag links and the edit icon) before the
// form is inserted.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate every comma-separated tag, then POST the raw string.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// Overwrites the outer tags_list with the user-entered pieces.
tags_list = tags_string.split(",")
// Allowed characters: letters, digits, "-", "_", ":" and "/" (case-insensitive).
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The emptiness check runs on the untrimmed piece; the pattern check below
// runs on the trimmed one.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), followed by a page reload.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/android-addons.kp',
async: false
});
location.reload()
}
})
// From here on: the second implementation, still executed inside the outer
// click handler above.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this sets a plain property on the
// wrapper rather than on the element.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter in the input (key code 13) saves; returning false stops the default action.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-addons.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Save icon click saves as well.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-addons.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Bound to every <form> on the page, not only the tag form; saves instead of
// performing a native submit.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-addons.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>
</body>
</html>

Просмотреть файл

@ -0,0 +1,13 @@
{
"title": "Android Addons ETL job",
"authors": [
"Frank Bertsch"
],
"tags": [
"mobile",
"etl"
],
"publish_date": "2017-02-17",
"updated_at": "2017-02-17",
"tldr": "This job takes the Fennec saved session pings and maps them to just client, submissionDate, activeAddons, and persona."
}

Просмотреть файл

@ -0,0 +1,594 @@
<!DOCTYPE html>
<!-- lang lets assistive technology pick the right pronunciation; the title names the post itself. -->
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Android Clients ETL - Knowledge</title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>Android Clients ETL</h1>
<span class="authors"><a href="/feed?authors=Frank+Bertsch">Frank Bertsch</a></span>
<span class="date_created">February 09, 2017</span>
<span class="date_updated">(Last Updated: May 10, 2017)</span>
<span class="tldr"><p>This notebook maps Fennec saved_session pings to some useful information about clients. This is a 1:1 mapping.</p></span>
</div>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">datetime</span> <span class="kn">as</span> <span class="nn">dt</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">ujson</span> <span class="kn">as</span> <span class="nn">json</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings</span><span class="p">,</span> <span class="n">get_pings_properties</span>
<span class="o">%</span><span class="n">pylab</span> <span class="n">inline</span>
</pre></div>
<p>Take the set of pings, make sure we have actual clientIds and remove duplicate pings. We collect each unique ping.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">dedupe_pings</span><span class="p">(</span><span class="n">rdd</span><span class="p">):</span>
<span class="k">return</span> <span class="n">rdd</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="n">p</span><span class="p">[</span><span class="s2">"meta/clientId"</span><span class="p">]</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="s2">"meta/documentId"</span><span class="p">],</span> <span class="n">p</span><span class="p">))</span>\
<span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
</pre></div>
<p>Transform and sanitize the pings into arrays.</p>
<div class="codehilite"><pre><span></span><span class="c1"># bug 1362659 - int values exceeded signed 32 bit range</span>
<span class="n">MAX_INT</span> <span class="o">=</span> <span class="p">(</span><span class="mi">2</span><span class="o">**</span><span class="mi">31</span><span class="p">)</span><span class="o">-</span><span class="mi">1</span>
<span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="c1"># Should not be None since we filter those out.</span>
<span class="n">clientId</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"meta/clientId"</span><span class="p">]</span>
<span class="n">profileDate</span> <span class="o">=</span> <span class="bp">None</span>
<span class="n">profileDaynum</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"environment/profile/creationDate"</span><span class="p">]</span>
<span class="k">if</span> <span class="n">profileDaynum</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">try</span><span class="p">:</span>
<span class="c1"># Bad data could push profileDaynum &gt; 32767 (size of a C int) and throw exception</span>
<span class="n">profileDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="p">(</span><span class="mi">1970</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">profileDaynum</span><span class="p">))</span>
<span class="k">except</span><span class="p">:</span>
<span class="n">profileDate</span> <span class="o">=</span> <span class="bp">None</span>
<span class="c1"># Create date should already be in ISO format</span>
<span class="n">creationDate</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"creationDate"</span><span class="p">]</span>
<span class="k">if</span> <span class="n">creationDate</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="c1"># This is only accurate because we know the creation date is always in 'Z' (zulu) time.</span>
<span class="n">creationDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">ping</span><span class="p">[</span><span class="s2">"creationDate"</span><span class="p">],</span> <span class="s2">"%Y-%m-</span><span class="si">%d</span><span class="s2">T%H:%M:%S.</span><span class="si">%f</span><span class="s2">Z"</span><span class="p">)</span>
<span class="c1"># Added via the ingestion process so should not be None.</span>
<span class="n">submissionDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">ping</span><span class="p">[</span><span class="s2">"meta/submissionDate"</span><span class="p">],</span> <span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">appVersion</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"application/version"</span><span class="p">]</span>
<span class="n">osVersion</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"environment/system/os/version"</span><span class="p">]</span>
<span class="k">if</span> <span class="n">osVersion</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">osVersion</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">osVersion</span><span class="p">)</span> <span class="k">if</span> <span class="nb">int</span><span class="p">(</span><span class="n">osVersion</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">MAX_INT</span> <span class="k">else</span> <span class="bp">None</span>
<span class="n">locale</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"environment/settings/locale"</span><span class="p">]</span>
<span class="c1"># Truncate to 32 characters</span>
<span class="n">defaultSearch</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"environment/settings/defaultSearchEngine"</span><span class="p">]</span>
<span class="k">if</span> <span class="n">defaultSearch</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">defaultSearch</span> <span class="o">=</span> <span class="n">defaultSearch</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="mi">32</span><span class="p">]</span>
<span class="c1"># Build up the device string, truncating like we do in 'core' ping.</span>
<span class="n">device</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"environment/system/device/manufacturer"</span><span class="p">]</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"environment/system/device/model"</span><span class="p">]</span>
<span class="k">if</span> <span class="n">device</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span> <span class="ow">and</span> <span class="n">model</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">device</span> <span class="o">=</span> <span class="n">device</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="mi">12</span><span class="p">]</span> <span class="o">+</span> <span class="s2">"-"</span> <span class="o">+</span> <span class="n">model</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="mi">19</span><span class="p">]</span>
<span class="n">xpcomABI</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"application/xpcomAbi"</span><span class="p">]</span>
<span class="n">arch</span> <span class="o">=</span> <span class="s2">"arm"</span>
<span class="k">if</span> <span class="n">xpcomABI</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span> <span class="ow">and</span> <span class="s2">"x86"</span> <span class="ow">in</span> <span class="n">xpcomABI</span><span class="p">:</span>
<span class="n">arch</span> <span class="o">=</span> <span class="s2">"x86"</span>
<span class="c1"># Bug 1337896</span>
<span class="n">as_topsites_loader_time</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"payload/histograms/FENNEC_ACTIVITY_STREAM_TOPSITES_LOADER_TIME_MS"</span><span class="p">]</span>
<span class="n">topsites_loader_time</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"payload/histograms/FENNEC_TOPSITES_LOADER_TIME_MS"</span><span class="p">]</span>
<span class="k">if</span> <span class="n">as_topsites_loader_time</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">as_topsites_loader_time</span> <span class="o">=</span> <span class="nb">map</span><span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="n">as_topsites_loader_time</span><span class="o">.</span><span class="n">tolist</span><span class="p">())</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">([</span><span class="n">v</span> <span class="o">&gt;</span> <span class="n">MAX_INT</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">as_topsites_loader_time</span><span class="p">]):</span>
<span class="n">as_topsites_loader_time</span> <span class="o">=</span> <span class="bp">None</span>
<span class="k">if</span> <span class="n">topsites_loader_time</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">topsites_loader_time</span> <span class="o">=</span> <span class="nb">map</span><span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="n">topsites_loader_time</span><span class="o">.</span><span class="n">tolist</span><span class="p">())</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">([</span><span class="n">v</span> <span class="o">&gt;</span> <span class="n">MAX_INT</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">topsites_loader_time</span><span class="p">]):</span>
<span class="n">topsites_loader_time</span> <span class="o">=</span> <span class="bp">None</span>
<span class="k">return</span> <span class="p">[</span><span class="n">clientId</span><span class="p">,</span>
<span class="n">profileDate</span><span class="p">,</span>
<span class="n">submissionDate</span><span class="p">,</span>
<span class="n">creationDate</span><span class="p">,</span>
<span class="n">appVersion</span><span class="p">,</span>
<span class="n">osVersion</span><span class="p">,</span>
<span class="n">locale</span><span class="p">,</span>
<span class="n">defaultSearch</span><span class="p">,</span>
<span class="n">device</span><span class="p">,</span>
<span class="n">arch</span><span class="p">,</span>
<span class="n">as_topsites_loader_time</span><span class="p">,</span>
<span class="n">topsites_loader_time</span><span class="p">]</span>
</pre></div>
<p>Create a set of pings from “saved-session” to build a set of core client data. Output the data to CSV or Parquet.</p>
<p>This script is designed to loop over a range of days and output a single day for the given channels. Use explicit date ranges for backfilling, or now() - 1day for automated runs.</p>
<div class="codehilite"><pre><span></span><span class="n">channels</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"nightly"</span><span class="p">,</span> <span class="s2">"aurora"</span><span class="p">,</span> <span class="s2">"beta"</span><span class="p">,</span> <span class="s2">"release"</span><span class="p">]</span>
<span class="n">batch_date</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'date'</span><span class="p">)</span>
<span class="k">if</span> <span class="n">batch_date</span><span class="p">:</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">batch_date</span><span class="p">,</span> <span class="s1">'%Y%m</span><span class="si">%d</span><span class="s1">'</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span> <span class="o">-</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="n">day</span> <span class="o">=</span> <span class="n">start</span>
<span class="k">while</span> <span class="n">day</span> <span class="o">&lt;=</span> <span class="n">end</span><span class="p">:</span>
<span class="k">for</span> <span class="n">channel</span> <span class="ow">in</span> <span class="n">channels</span><span class="p">:</span>
<span class="k">print</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">channel: "</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">", date: "</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">pings</span> <span class="o">=</span> <span class="n">get_pings</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">app</span><span class="o">=</span><span class="s2">"Fennec"</span><span class="p">,</span> <span class="n">channel</span><span class="o">=</span><span class="n">channel</span><span class="p">,</span>
<span class="n">submission_date</span><span class="o">=</span><span class="p">(</span><span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">),</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)),</span>
<span class="n">build_id</span><span class="o">=</span><span class="p">(</span><span class="s2">"20100101000000"</span><span class="p">,</span> <span class="s2">"99999999999999"</span><span class="p">),</span>
<span class="n">fraction</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="p">[</span><span class="s2">"meta/clientId"</span><span class="p">,</span>
<span class="s2">"meta/documentId"</span><span class="p">,</span>
<span class="s2">"meta/submissionDate"</span><span class="p">,</span>
<span class="s2">"creationDate"</span><span class="p">,</span>
<span class="s2">"application/version"</span><span class="p">,</span>
<span class="s2">"environment/system/os/version"</span><span class="p">,</span>
<span class="s2">"environment/profile/creationDate"</span><span class="p">,</span>
<span class="s2">"environment/settings/locale"</span><span class="p">,</span>
<span class="s2">"environment/settings/defaultSearchEngine"</span><span class="p">,</span>
<span class="s2">"environment/system/device/model"</span><span class="p">,</span>
<span class="s2">"environment/system/device/manufacturer"</span><span class="p">,</span>
<span class="s2">"application/xpcomAbi"</span><span class="p">,</span>
<span class="s2">"payload/histograms/FENNEC_ACTIVITY_STREAM_TOPSITES_LOADER_TIME_MS"</span><span class="p">,</span>
<span class="s2">"payload/histograms/FENNEC_TOPSITES_LOADER_TIME_MS"</span><span class="p">])</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">dedupe_pings</span><span class="p">(</span><span class="n">subset</span><span class="p">)</span>
<span class="n">transformed</span> <span class="o">=</span> <span class="n">subset</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">transform</span><span class="p">)</span>
<span class="n">s3_output</span> <span class="o">=</span> <span class="s2">"s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mobile/android_clients"</span>
<span class="n">s3_output</span> <span class="o">+=</span> <span class="s2">"/v2/channel="</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">"/submission="</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"clientid"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"profiledate"</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"submissiondate"</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"creationdate"</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"appversion"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"osversion"</span><span class="p">,</span> <span class="n">IntegerType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"locale"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"defaultsearch"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"device"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"arch"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"fennec_activity_stream_topsites_loader_time_ms"</span><span class="p">,</span>
<span class="n">ArrayType</span><span class="p">(</span><span class="n">IntegerType</span><span class="p">()),</span>
<span class="bp">True</span>
<span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"fennec_topsites_loader_time_ms"</span><span class="p">,</span>
<span class="n">ArrayType</span><span class="p">(</span><span class="n">IntegerType</span><span class="p">()),</span>
<span class="bp">True</span>
<span class="p">)</span>
<span class="p">])</span>
<span class="n">grouped</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">transformed</span><span class="p">,</span> <span class="n">schema</span><span class="p">)</span>
<span class="n">grouped</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">s3_output</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"overwrite"</span><span class="p">)</span>
<span class="n">day</span> <span class="o">+=</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
</div>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
// Search bar wiring: typeahead suggestions, jump-to-post on select, and
// Enter-to-search. Pages without a #searchbar element skip all of it instead
// of throwing on `$("#searchbar")[0]` being undefined.
if ($('#searchbar').length) {
    // Move the caret to the end of any pre-filled query.
    $("#searchbar")[0].setSelectionRange(1000, 1000);
    $('#searchbar').typeahead({
        hint: false,
        highlight: true,
        minLength: 1
    },
    {
        name: 'knowledge_posts',
        limit: 10,
        display: function (item) {
            return item.title + " - " + item.author;
        },
        templates: {
            empty: Handlebars.compile(
                '<div class="tt-not-found">' +
                'Unable to find any posts that match the current query' +
                '</div>'
            ),
            suggestion: function(data) {
                // Titles and authors come back from the server; escape them
                // before splicing them into markup.
                var escapeHtml = function(text) {
                    return $('<div>').text(text == null ? '' : String(text)).html();
                };
                return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + escapeHtml(data.title) + '</strong> – ' + escapeHtml(data.author) + '</p>';
            }
        },
        source: function(q, sync, async) {
            // Encode the query so characters such as "&" or "#" stay inside
            // the `search` parameter.
            $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
            {
                success: function(data,status){ async(JSON.parse(data)); }
            })
        }
    });
    // Picking a suggestion opens that post.
    $('#searchbar').bind('typeahead:select', function(obj, datum, name) {
        window.location = '/post/'+encodeURIComponent(datum.path);
    });
    // Pressing Enter runs a feed search for the typed text.
    $('#searchbar').keypress(function(event){
        var keycode = (event.keyCode ? event.keyCode : event.which);
        if(keycode == '13'){
            window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val())
        }
    });
    // Widen the suggestion menu to the search bar's width plus the menu's own outer width.
    var padding = $('.tt-menu').outerWidth()
    $('.tt-menu').width($('#searchbar').width() + padding + "px")
}
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Once the DOM is ready, initialise the tooltips and wire the buttons that
// switch between the rendered, raw and web-editor views of this post.
$(function(){
    var is_webeditor = false;
    var post_id = "36";
    var id = "None";
    var post_path = "etl/android-clients.kp"
    var data_repo_github_root = ""
    tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);

    var encoded_path = encodeURI(post_path);
    // Button selector -> URL the button navigates to.
    var destinations = {
        ".btn-rendered": "/post/" + encoded_path,
        ".btn-raw": "/post/" + encoded_path + "?render=raw",
        ".btn-webeditor": "/edit/" + encoded_path
    };
    $.each(destinations, function(selector, url){
        $(selector).on("click", function(){
            document.location.href = url;
        })
    });
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration for the page: TeX input rendered through the HTML-CSS output jax.
MathJax.Hub.Config({
// Load the tex2jax extension, configured in the `tex2jax` section below.
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math is delimited by $...$ or \(...\).
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Display math is delimited by $$...$$ or \[...\].
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// NOTE(review): presumably lets "\$" stand for a literal dollar sign; confirm against the MathJax tex2jax docs.
processEscapes: true
},
// Only the "TeX" font set is listed as available for the HTML-CSS output.
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in April 2017; load a pinned MathJax 2.x release from cdnjs instead. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js" type="text/javascript">
</script>
<script>
// Page initialisation, run once the DOM is ready. `.ready()` is used rather
// than `.on('ready', ...)`: the event form was deprecated in jQuery 1.8 and
// removed in jQuery 3.0, where a handler bound that way never fires.
$(document).ready(function(){
    // Make the Rendered Markdown Button active
    $(".btn-rendered").addClass("btn-active");
    // Initialize headers
    helpersJx.linkifyHeaders();
    // Initialize comments
    var post_path = 'etl/android-clients.kp';
    $("#post_comment_btn").on('click', function(){
        // Declared locally so they no longer leak onto `window`.
        var comment_author = 'knowledge_default';
        var post_author = 'Frank Bertsch';
        var post_title = 'Android Clients ETL';
        commentsJx.postComment(comment_author, post_author, post_title, post_path);
        // NOTE(review): the reload follows immediately; this assumes postComment
        // finishes its request before returning - confirm in comments.js.
        location.reload();
    });
    // Delete buttons carry ids starting with "delete_comment"; the comment id
    // is the part after the "__" separator.
    var all_comment_delete_buttons = $("[id^=delete_comment]")
    $.each(all_comment_delete_buttons, function(i,v){
        $(v).on("click", function(){
            var id = v.id;
            var comment_id = id.split("__")[1];
            if(comment_id) {
                commentsJx.deleteComment(post_path, comment_id)
                location.reload();
            }
        });
    });
    // Delegated from <body> so it matches the subscribe buttons whenever
    // they are present in the page.
    $(document.body).on('click',"button[id^=tag-subscription]",function () {
        tagsJx.addTagSubscriptionListener($(this)[0]);
    });
})
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function(index, value){
    $.each(value, function(i, v){
        var inner_html = v.innerHTML
        // Slug for the fragment link: keep only letters, hyphens and spaces,
        // lower-case the result and turn spaces into hyphens.
        // Declared with `var` so it no longer leaks as an implicit global.
        var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "")
        var inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-")
        // Wrap the original header content in the link.
        v.innerHTML = "<a href='" + inner_link + "' class=link-reset>" + inner_html + "</a>"
    })
})
//turn all the tags into links, similar to what's done on the feed page
// `tags`, `tags_list` and `subscriptions_list` stay top-level so the rest of
// the script can keep reading them.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['mobile', 'fennec', 'etl']
var subscriptions_list = []
// Some pages have no `.tags` placeholder in the metadata block; skip the
// rendering there instead of throwing on `undefined` and aborting the script.
if (tags) {
    $.each(tags_list, function(i,tag){
        // All per-tag values are declared locally so they do not leak as globals.
        var ahref = document.createElement("a")
        var e_tag = encodeURIComponent(tag)
        // NOTE(review): only the first "/" is replaced; confirm whether tags with
        // several slashes need every occurrence mapped to "__".
        var f_tag = tag.replace("/", "__")
        var tag_name = "#" + tag
        var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
        // Bootstrap popover configuration, carried as data attributes.
        ahref.setAttribute("data-container", "body")
        ahref.setAttribute("data-toggle", "popover")
        ahref.setAttribute("data-placement", "bottom")
        ahref.setAttribute("data-html", "true")
        ahref.setAttribute("data-tag-name", f_tag)
        // The popover offers "Unsubscribe" for subscribed tags and "Subscribe" otherwise.
        if (subscriptions_list.indexOf(tag) >= 0) {
            ahref.setAttribute("class", "label label-subscribed pop")
            ahref.setAttribute("data-content", "<div class='content'>" +
                " <button class='btn btn-small btn-primary btn-unsubscribe'" +
                " title='' " +
                " id='" + tag_subscription_button_id_name + "'> " +
                " <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
                " </button> " +
                " </div>")
        } else {
            ahref.setAttribute("class", "label label-unsubscribed pop")
            ahref.setAttribute("data-content", "<div class='content'>" +
                " <button class='btn btn-small btn-default btn-subscribe'" +
                " title='' " +
                " id='" + tag_subscription_button_id_name + "'> " +
                " <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
                " </button> " +
                " </div>")
        }
        ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
        ahref.setAttribute("style", "font-weight:normal")
        // The first tag is preceded by the "Tags: " caption.
        if (i == 0){
            ahref.innerHTML = " "
            var colon = document.createElement("text")
            colon.innerHTML = "<b>Tags</b>: "
            tags.appendChild(colon)
        }
        ahref.innerHTML = ahref.innerHTML + tag_name
        tags.appendChild(ahref)
        // Comma-separate every tag except the last one.
        if (i != tags_list.length - 1){
            var comma = document.createElement("text")
            comma.innerText = ", "
            tags.appendChild(comma)
        }
    })
    // Drop the node that follows the tag container, when there is one.
    if (tags.nextSibling) {
        tags.nextSibling.remove()
    }
    // Append the edit icon that opens the tag editor.
    tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
}
// Hover-driven popovers for every ".pop" element. The popover trigger is set to
// "manual", so showing and hiding is done entirely by the handlers below.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on("mouseenter", function () {
    var anchor = this;
    $(anchor).popover("show");
    // Close the popover once the pointer leaves the popover element itself.
    $(".popover").on("mouseleave", function () {
      $(anchor).popover('hide');
    });
  })
  .on("mouseleave", function () {
    var anchor = this;
    // After 300ms, close unless some popover is being hovered at that moment.
    setTimeout(function () {
      if ($(".popover:hover").length === 0) {
        $(anchor).popover("hide");
      }
    }, 300);
  });
// Inline tag editor, opened by clicking the edit icon (#tooltip-edit_tags).
// The callback first runs a DOM-API implementation and then, still inside the
// same callback, sets up a second jQuery-based implementation of the editor.
// NOTE(review): the two implementations overlap (both create #change_tags and
// both bind a save handler) - confirm which one is meant to be live.
$('#tooltip-edit_tags').click(function(){
// Hide the edit icon while the editor is open.
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// NOTE(review): previousSibling, tags_string, form, input, tags_text and
// icon_class are assigned without var, so they become implicit globals.
// previousSibling is not read again inside this handler.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
// Save icon shown next to the input.
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
// Text input pre-filled with the current comma-separated tag list.
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Replace everything currently inside the tags container with the form.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate every comma-separated tag, then POST the raw input string.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// Assigns to tags_list, which this handler does not declare itself.
tags_list = tags_string.split(",")
// Accepts letters, digits, "-", "_", ":" and "/" (case-insensitive).
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
// NOTE(review): tag and tag_name are assigned without var (implicit globals).
tag = tags_list[i]
// The emptiness check runs before trimming, so a whitespace-only entry
// falls through to the pattern check below and is reported there.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
// NOTE(review): the message says "alphanumeric" but the pattern also
// allows "-", "_", ":" and "/".
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// async: false makes the request synchronous, so the reload below only
// happens once the POST has finished.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/android-clients.kp',
async: false
});
location.reload()
}
})
// From here on the callback sets up the second, jQuery-based editor.
// Remove the node that follows the tags container.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Adds a further click handler on #tooltip-edit_tags every time the outer
// handler runs.
// NOTE(review): presumably the icon created above carries that id - confirm
// in icons.js.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// NOTE(review): tags_text is a jQuery object here, so this only sets a plain
// property on the wrapper; the visible caption was already set by .html() above.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter (key code 13) in the input saves the tags; returning false cancels
// the default key handling.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-clients.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves the tags.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-clients.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Submitting saves the tags and cancels the native submit.
// NOTE(review): the "form" selector matches every form on the page, not
// only the one created above.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-clients.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,712 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="/static/modules/jquery/jquery.min.js"></script>
<script src="/static/modules/tether/js/tether.min.js"></script>
<script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="/static/modules/handlebars/js/handlebars.js"></script>
<script src="/static/js/helpers.js"></script>
<script src="/static/modules/select2/js/select2.min.js"></script>
<script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link rel="stylesheet" href="/static/modules/bootstrap/css/bootstrap.min.css">
<link rel="stylesheet" href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css">
<link rel="stylesheet" type='text/css' href="/static/css/custom.css">
<link rel="stylesheet" type='text/css' href="/static/modules/select2/css/select2.min.css">
<link href='https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700' rel='stylesheet' type='text/css'>
<link rel="shortcut icon" href="/static/images/favicon.png">
<link rel="stylesheet" href="/static/css/codehilite-friendly.css">
<style>
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="navbar navbar-knowledge" role="navigation">
<div class="container page-container">
<a href="/" aria-selected="false" class="logo-image navbar-text" style='margin-top:16px; margin-left: -7px'>
<img width="125" src='/static/images/logo-white.svg' alt="Knowledge Repo home">
</a>
<a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Home
</a>
<a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Favorites
</a>
<a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style='margin-top:18px'>
About
</a>
<a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Stats
</a>
<a id="webposts_tab" href="/create"
class="btn btn-primary navbar-text pull-right"
style='border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699'>
Write a Post!
</a>
<div class="pull-right">
<div class="form-group">
<input class="form-control" id="searchbar" placeholder="Search for Knowledge" style="text-align:right">
</div>
</div>
</div>
</div>
<div class="container page-container">
<br>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
<div class="btn-group" role="group">
<button type="button" class="btn btn-default btn-rendered">
View Post
</button>
<button type="button" class="btn btn-default btn-raw">
View Raw Markdown
</button>
</div>
</div>
<div class="col-md-2">
</div>
<div class="col-md-4 text-right">
<i class="glyphicon glyphicon-eye-open" id="pageview_stats" style='color: #9CA299'></i>
<div id="pageview_stats" style="display: inline-block">
Viewed 1 times by 1 different users
</div>
<i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop" style="font-size:16pt" id="tooltip-like" data-placement="bottom"
data-trigger="#tooltip-like"
data-container="body"
data-toggle="popover"
data-content='<div>Like This Post</div>'></i>
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class='metadata'>
<h1>Android Clients ETL</h1>
<span class='authors'><a href='/feed?authors=Frank+Bertsch'>Frank Bertsch</a></span>
<span class='date_created'>February 09, 2017</span>
<span class='date_updated'>(Last Updated: May 10, 2017)</span>
<span class='tldr'><p>This notebook maps Fennec saved_session pings to some useful information about clients. This is a 1:1 mapping.</p></span>
<span class='tags'></span>
</div>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">datetime</span> <span class="kn">as</span> <span class="nn">dt</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">ujson</span> <span class="kn">as</span> <span class="nn">json</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings</span><span class="p">,</span> <span class="n">get_pings_properties</span>
<span class="o">%</span><span class="n">pylab</span> <span class="n">inline</span>
</pre></div>
<p>Take the set of pings, make sure we have actual clientIds and remove duplicate pings. We collect each unique ping.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">dedupe_pings</span><span class="p">(</span><span class="n">rdd</span><span class="p">):</span>
<span class="k">return</span> <span class="n">rdd</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="n">p</span><span class="p">[</span><span class="s2">&quot;meta/clientId&quot;</span><span class="p">]</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="s2">&quot;meta/documentId&quot;</span><span class="p">],</span> <span class="n">p</span><span class="p">))</span>\
<span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
</pre></div>
<p>Transform and sanitize the pings into arrays.</p>
<div class="codehilite"><pre><span></span><span class="c1"># bug 1362659 - int values exceeded signed 32 bit range</span>
<span class="n">MAX_INT</span> <span class="o">=</span> <span class="p">(</span><span class="mi">2</span><span class="o">**</span><span class="mi">31</span><span class="p">)</span><span class="o">-</span><span class="mi">1</span>
<span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="c1"># Should not be None since we filter those out.</span>
<span class="n">clientId</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;meta/clientId&quot;</span><span class="p">]</span>
<span class="n">profileDate</span> <span class="o">=</span> <span class="bp">None</span>
<span class="n">profileDaynum</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;environment/profile/creationDate&quot;</span><span class="p">]</span>
<span class="k">if</span> <span class="n">profileDaynum</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">try</span><span class="p">:</span>
<span class="c1"># Bad data could push profileDaynum &gt; 32767 (size of a C int) and throw exception</span>
<span class="n">profileDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="p">(</span><span class="mi">1970</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">profileDaynum</span><span class="p">))</span>
<span class="k">except</span><span class="p">:</span>
<span class="n">profileDate</span> <span class="o">=</span> <span class="bp">None</span>
<span class="c1"># Create date should already be in ISO format</span>
<span class="n">creationDate</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;creationDate&quot;</span><span class="p">]</span>
<span class="k">if</span> <span class="n">creationDate</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="c1"># This is only accurate because we know the creation date is always in &#39;Z&#39; (zulu) time.</span>
<span class="n">creationDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">ping</span><span class="p">[</span><span class="s2">&quot;creationDate&quot;</span><span class="p">],</span> <span class="s2">&quot;%Y-%m-</span><span class="si">%d</span><span class="s2">T%H:%M:%S.</span><span class="si">%f</span><span class="s2">Z&quot;</span><span class="p">)</span>
<span class="c1"># Added via the ingestion process so should not be None.</span>
<span class="n">submissionDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">ping</span><span class="p">[</span><span class="s2">&quot;meta/submissionDate&quot;</span><span class="p">],</span> <span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">appVersion</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;application/version&quot;</span><span class="p">]</span>
<span class="n">osVersion</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;environment/system/os/version&quot;</span><span class="p">]</span>
<span class="k">if</span> <span class="n">osVersion</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">osVersion</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">osVersion</span><span class="p">)</span> <span class="k">if</span> <span class="nb">int</span><span class="p">(</span><span class="n">osVersion</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">MAX_INT</span> <span class="k">else</span> <span class="bp">None</span>
<span class="n">locale</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;environment/settings/locale&quot;</span><span class="p">]</span>
<span class="c1"># Truncate to 32 characters</span>
<span class="n">defaultSearch</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;environment/settings/defaultSearchEngine&quot;</span><span class="p">]</span>
<span class="k">if</span> <span class="n">defaultSearch</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">defaultSearch</span> <span class="o">=</span> <span class="n">defaultSearch</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="mi">32</span><span class="p">]</span>
<span class="c1"># Build up the device string, truncating like we do in &#39;core&#39; ping.</span>
<span class="n">device</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;environment/system/device/manufacturer&quot;</span><span class="p">]</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;environment/system/device/model&quot;</span><span class="p">]</span>
<span class="k">if</span> <span class="n">device</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span> <span class="ow">and</span> <span class="n">model</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">device</span> <span class="o">=</span> <span class="n">device</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="mi">12</span><span class="p">]</span> <span class="o">+</span> <span class="s2">&quot;-&quot;</span> <span class="o">+</span> <span class="n">model</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="mi">19</span><span class="p">]</span>
<span class="n">xpcomABI</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;application/xpcomAbi&quot;</span><span class="p">]</span>
<span class="n">arch</span> <span class="o">=</span> <span class="s2">&quot;arm&quot;</span>
<span class="k">if</span> <span class="n">xpcomABI</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span> <span class="ow">and</span> <span class="s2">&quot;x86&quot;</span> <span class="ow">in</span> <span class="n">xpcomABI</span><span class="p">:</span>
<span class="n">arch</span> <span class="o">=</span> <span class="s2">&quot;x86&quot;</span>
<span class="c1"># Bug 1337896</span>
<span class="n">as_topsites_loader_time</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;payload/histograms/FENNEC_ACTIVITY_STREAM_TOPSITES_LOADER_TIME_MS&quot;</span><span class="p">]</span>
<span class="n">topsites_loader_time</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;payload/histograms/FENNEC_TOPSITES_LOADER_TIME_MS&quot;</span><span class="p">]</span>
<span class="k">if</span> <span class="n">as_topsites_loader_time</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">as_topsites_loader_time</span> <span class="o">=</span> <span class="nb">map</span><span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="n">as_topsites_loader_time</span><span class="o">.</span><span class="n">tolist</span><span class="p">())</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">([</span><span class="n">v</span> <span class="o">&gt;</span> <span class="n">MAX_INT</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">as_topsites_loader_time</span><span class="p">]):</span>
<span class="n">as_topsites_loader_time</span> <span class="o">=</span> <span class="bp">None</span>
<span class="k">if</span> <span class="n">topsites_loader_time</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">topsites_loader_time</span> <span class="o">=</span> <span class="nb">map</span><span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="n">topsites_loader_time</span><span class="o">.</span><span class="n">tolist</span><span class="p">())</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">([</span><span class="n">v</span> <span class="o">&gt;</span> <span class="n">MAX_INT</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">topsites_loader_time</span><span class="p">]):</span>
<span class="n">topsites_loader_time</span> <span class="o">=</span> <span class="bp">None</span>
<span class="k">return</span> <span class="p">[</span><span class="n">clientId</span><span class="p">,</span>
<span class="n">profileDate</span><span class="p">,</span>
<span class="n">submissionDate</span><span class="p">,</span>
<span class="n">creationDate</span><span class="p">,</span>
<span class="n">appVersion</span><span class="p">,</span>
<span class="n">osVersion</span><span class="p">,</span>
<span class="n">locale</span><span class="p">,</span>
<span class="n">defaultSearch</span><span class="p">,</span>
<span class="n">device</span><span class="p">,</span>
<span class="n">arch</span><span class="p">,</span>
<span class="n">as_topsites_loader_time</span><span class="p">,</span>
<span class="n">topsites_loader_time</span><span class="p">]</span>
</pre></div>
<p>Create a set of pings from &ldquo;saved-session&rdquo; to build a set of core client data. Output the data to CSV or Parquet.</p>
<p>This script is designed to loop over a range of days and output a single day for the given channels. Use explicit date ranges for backfilling, or now() - &lsquo;1day&rsquo; for automated runs.</p>
<div class="codehilite"><pre><span></span><span class="n">channels</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;nightly&quot;</span><span class="p">,</span> <span class="s2">&quot;aurora&quot;</span><span class="p">,</span> <span class="s2">&quot;beta&quot;</span><span class="p">,</span> <span class="s2">&quot;release&quot;</span><span class="p">]</span>
<span class="n">batch_date</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;date&#39;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">batch_date</span><span class="p">:</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">batch_date</span><span class="p">,</span> <span class="s1">&#39;%Y%m</span><span class="si">%d</span><span class="s1">&#39;</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span> <span class="o">-</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="n">day</span> <span class="o">=</span> <span class="n">start</span>
<span class="k">while</span> <span class="n">day</span> <span class="o">&lt;=</span> <span class="n">end</span><span class="p">:</span>
<span class="k">for</span> <span class="n">channel</span> <span class="ow">in</span> <span class="n">channels</span><span class="p">:</span>
<span class="k">print</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">channel: &quot;</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">&quot;, date: &quot;</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">pings</span> <span class="o">=</span> <span class="n">get_pings</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">app</span><span class="o">=</span><span class="s2">&quot;Fennec&quot;</span><span class="p">,</span> <span class="n">channel</span><span class="o">=</span><span class="n">channel</span><span class="p">,</span>
<span class="n">submission_date</span><span class="o">=</span><span class="p">(</span><span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">),</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)),</span>
<span class="n">build_id</span><span class="o">=</span><span class="p">(</span><span class="s2">&quot;20100101000000&quot;</span><span class="p">,</span> <span class="s2">&quot;99999999999999&quot;</span><span class="p">),</span>
<span class="n">fraction</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;meta/clientId&quot;</span><span class="p">,</span>
<span class="s2">&quot;meta/documentId&quot;</span><span class="p">,</span>
<span class="s2">&quot;meta/submissionDate&quot;</span><span class="p">,</span>
<span class="s2">&quot;creationDate&quot;</span><span class="p">,</span>
<span class="s2">&quot;application/version&quot;</span><span class="p">,</span>
<span class="s2">&quot;environment/system/os/version&quot;</span><span class="p">,</span>
<span class="s2">&quot;environment/profile/creationDate&quot;</span><span class="p">,</span>
<span class="s2">&quot;environment/settings/locale&quot;</span><span class="p">,</span>
<span class="s2">&quot;environment/settings/defaultSearchEngine&quot;</span><span class="p">,</span>
<span class="s2">&quot;environment/system/device/model&quot;</span><span class="p">,</span>
<span class="s2">&quot;environment/system/device/manufacturer&quot;</span><span class="p">,</span>
<span class="s2">&quot;application/xpcomAbi&quot;</span><span class="p">,</span>
<span class="s2">&quot;payload/histograms/FENNEC_ACTIVITY_STREAM_TOPSITES_LOADER_TIME_MS&quot;</span><span class="p">,</span>
<span class="s2">&quot;payload/histograms/FENNEC_TOPSITES_LOADER_TIME_MS&quot;</span><span class="p">])</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">dedupe_pings</span><span class="p">(</span><span class="n">subset</span><span class="p">)</span>
<span class="n">transformed</span> <span class="o">=</span> <span class="n">subset</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">transform</span><span class="p">)</span>
<span class="n">s3_output</span> <span class="o">=</span> <span class="s2">&quot;s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mobile/android_clients&quot;</span>
<span class="n">s3_output</span> <span class="o">+=</span> <span class="s2">&quot;/v2/channel=&quot;</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">&quot;/submission=&quot;</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;clientid&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;profiledate&quot;</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;submissiondate&quot;</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;creationdate&quot;</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;appversion&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;osversion&quot;</span><span class="p">,</span> <span class="n">IntegerType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;locale&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;defaultsearch&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;device&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;arch&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;fennec_activity_stream_topsites_loader_time_ms&quot;</span><span class="p">,</span>
<span class="n">ArrayType</span><span class="p">(</span><span class="n">IntegerType</span><span class="p">()),</span>
<span class="bp">True</span>
<span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;fennec_topsites_loader_time_ms&quot;</span><span class="p">,</span>
<span class="n">ArrayType</span><span class="p">(</span><span class="n">IntegerType</span><span class="p">()),</span>
<span class="bp">True</span>
<span class="p">)</span>
<span class="p">])</span>
<span class="n">grouped</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">transformed</span><span class="p">,</span> <span class="n">schema</span><span class="p">)</span>
<span class="n">grouped</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">s3_output</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">&quot;overwrite&quot;</span><span class="p">)</span>
<span class="n">day</span> <span class="o">+=</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<h2> 0 Comments </h2>
<br>
<div>
<div class="row">
<div class="col-md-12">
<textarea class="form-control" type="text" id="comment-text" style="height:87px;" placeholder="Leave a comment..."></textarea>
</div>
</div>
</div>
<br>
<div>
<button class="btn btn-primary" id="post_comment_btn">Post Comment</button>
</div>
</div>
</div>
<br>
<div class="row">
<div class="col-md-12">
</div>
<br>
</div>
</div>
</div>
</div>
<div class="footer">
Served with <span class="glyphicon glyphicon-heart"></span> by <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a> <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br />
<i title="Last checked for updates: 4 minutes ago">Last indexed: 14 hours ago</i>
</div>
<script type='text/javascript'>
// Collapse the search box selection at offset 1000, i.e. at the end of any
// shorter text already in the box.
$("#searchbar")[0].setSelectionRange(1000, 1000);

// Search-as-you-type suggestions for the navbar search box.
$('#searchbar').typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: 'knowledge_posts',
  limit: 10,
  // String representation of a suggestion ("title - author").
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
      'Unable to find any posts that match the current query' +
      '</div>'
    ),
    suggestion: function(data) {
      // Titles and authors come from the server response; escape them so
      // markup characters in a post title cannot inject HTML into the menu.
      var escapeHtml = Handlebars.Utils.escapeExpression;
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + escapeHtml(data.title) + '</strong> – ' + escapeHtml(data.author) + '</p>';
    }
  },
  // Asynchronous source: results are delivered through the async callback;
  // the sync callback is unused.
  source: function(q, sync, async) {
    // Encode the query so characters such as "&", "#", "+" or "%" cannot
    // truncate or corrupt the search parameter.
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
    {
      // The response body is parsed here as JSON text.
      success: function(data, status){ async(JSON.parse(data)); }
    });
  }
});
$('#searchbar').bind('typeahead:select', function(obj, datum, name) {
window.location = '/post/'+encodeURIComponent(datum.path);
});
$('#searchbar').keypress(function(event){
var keycode = (event.keyCode ? event.keyCode : event.which);
if(keycode == '13'){
var path = document.location.pathname;
window.location = '/feed?filters=' + $('#searchbar').val()
}
});
var padding = $('.tt-menu').outerWidth()
$('.tt-menu').width($('#searchbar').width() + padding + "px")
</script>
<script src="/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// When the DOM is ready, initialise the post tooltips and point the
// rendered / raw / web-editor buttons at the matching URL for this post.
$(document).ready(function(){
var post_path = "etl/android-clients.kp"
// Builds a click handler that navigates to prefix + encoded post path + suffix.
function navigateTo(prefix, suffix){
return function(){
document.location.href = prefix + encodeURI(post_path) + suffix;
}
}
tooltipsJx.initializeTooltips(false, "36", "None", "");
$(".btn-rendered").on("click", navigateTo("/post/", ""));
$(".btn-raw").on("click", navigateTo("/post/", "?render=raw"));
$(".btn-webeditor").on("click", navigateTo("/edit/", ""));
});
</script>
<script src="/static/js/helpers.js"></script>
<script src="/static/js/tags.js" type="text/javascript"></script>
<script src="/static/js/icons.js" type="text/javascript"></script>
<script src="/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration: TeX input through the tex2jax preprocessor, HTML-CSS output.
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Delimiter pairs recognised as inline math and as display math.
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// Per the MathJax tex2jax documentation, this lets \$ stand for a literal dollar sign.
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in April 2017; load a pinned MathJax 2.x build from cdnjs instead. -->
<script type="text/javascript"
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js">
</script>
<script>
// Post-page setup run once the DOM is ready: marks the "rendered" button active,
// linkifies headers, and binds comment posting/deleting and tag-subscription clicks.
// Registered with .ready() instead of .on('ready', ...): the "ready" event form was
// deprecated in jQuery 1.8 and removed in 3.0, where such a handler never runs.
$(document).ready(function(){
// Make the Rendered Markdown Button active
$(".btn-rendered").addClass("btn-active");
// Initialize headers
helpersJx.linkifyHeaders();
// Initialize comments
var post_path = 'etl/android-clients.kp';
$("#post_comment_btn").on('click', function(){
// NOTE(review): these three are assigned without `var` and therefore become globals;
// left as-is in case other scripts read them — confirm before scoping them locally.
comment_author = 'knowledge_default';
post_author = 'Frank Bertsch';
post_title = 'Android Clients ETL';
commentsJx.postComment(comment_author, post_author, post_title, post_path);
// NOTE(review): reloads immediately; assumes postComment has finished by now — confirm.
location.reload();
});
// Delete buttons carry ids starting with "delete_comment"; the comment id is the part after "__".
all_comment_delete_buttons = $("[id^=delete_comment]")
$.each(all_comment_delete_buttons, function(i,v){
$(v).on("click", function(){
var id = v.id;
var comment_id = id.split("__")[1];
if(comment_id) {
commentsJx.deleteComment(post_path, comment_id)
location.reload();
}
});
});
// Delegated on <body> so buttons created later (e.g. inside popovers) are handled too.
$(document.body).on('click',"button[id^=tag-subscription]",function () {
tagsJx.addTagSubscriptionListener($(this)[0]);
});
})
// Wrap the content of every h2-h6 in an anchor whose fragment is derived from
// the heading itself (h1 is the post title and is deliberately left alone).
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function(levelIndex, headersAtLevel){
headersAtLevel.each(function(){
var original = this.innerHTML
// Keep only letters, hyphens and spaces, then lower-case and hyphenate.
inner_html_no_special = original.replace(/[^a-zA-Z\- ]/g, "")
var fragment = "#" + inner_html_no_special.toLowerCase().replace(/ /g, "-")
this.innerHTML = "<a href='" + fragment + "' class=link-reset>" + original + "</a>"
})
})
//turn all the tags into links, similar to what's done on the feed page
// Each tag becomes an <a> pointing at /tag_pages?tag=<tag>, carrying popover
// data-* attributes whose content is a Subscribe or Unsubscribe button.
// NOTE(review): `tags` is undefined when the page has no `.metadata .tags` element,
// in which case the appendChild calls below throw — confirm the element is always rendered.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['mobile', 'fennec', 'etl']
var subscriptions_list = []
$.each(tags_list, function(i,tag){
// NOTE(review): the names assigned below without `var` are implicit globals.
ahref = document.createElement("a")
e_tag = encodeURIComponent(tag)
// String.replace with a string pattern only replaces the first "/".
f_tag = tag.replace("/", "__")
tag_name = "#" + tag
tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
ahref.setAttribute("data-container", "body")
ahref.setAttribute("data-toggle", "popover")
ahref.setAttribute("data-placement", "bottom")
ahref.setAttribute("data-html", "true")
ahref.setAttribute("data-tag-name", f_tag)
// Subscribed tags get an Unsubscribe button in their popover, all others a Subscribe button.
if (subscriptions_list.indexOf(tag) >= 0) {
ahref.setAttribute("class", "label label-subscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-primary btn-unsubscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
" </button> " +
" </div>")
} else {
ahref.setAttribute("class", "label label-unsubscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-default btn-subscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
" </button> " +
" </div>")
}
ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
ahref.setAttribute("style", "font-weight:normal")
// The first tag is preceded by a bold "Tags:" label.
if (i == 0){
ahref.innerHTML = " "
colon = document.createElement("text")
colon.innerHTML = "<b>Tags</b>: "
tags.appendChild(colon)
}
ahref.innerHTML = ahref.innerHTML + tag_name
tags.appendChild(ahref)
// Comma separator after every tag except the last.
if (i != tags_list.length - 1){
comma = document.createElement("text")
comma.innerText = ", "
tags.appendChild(comma)
}
})
// Drop the node that directly follows the tags element, then append the edit-tags icon.
tags.nextSibling.remove()
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
// Popovers use a manual trigger so they can stay open while the pointer is over
// either the tag or the popover itself.
$(".pop").popover({ trigger: "manual" , html: true, animation:false, delay: 100})
.on("mouseenter", function () {
var _this = this;
$(this).popover("show");
// Hide once the pointer leaves the popover.
$(".popover").on("mouseleave", function () {
$(_this).popover('hide');
});
}).on("mouseleave", function () {
var _this = this;
// After leaving the tag, wait 300 ms and hide only if no popover is being hovered.
setTimeout(function () {
if (!$(".popover:hover").length) {
$(_this).popover("hide");
}
}, 300);
});
// Clicking the edit-tags icon swaps the tag list for an inline form holding a
// text input (pre-filled with the comma-separated tags) and a save icon.
// NOTE(review): this handler contains two implementations nested in each other —
// a hand-rolled one (down to the first save handler) and a second one using
// iconsJx/tagsJx that is registered from inside this handler. This looks like an
// unfinished merge; confirm which one is intended.
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// NOTE(review): the names assigned below without `var` are implicit globals.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Replaces everything inside the tags element (including the edit icon) before the form is added.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save: validate each comma-separated tag, then POST the raw string and reload.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
tags_list = tags_string.split(",")
// Allowed tag characters: letters, digits, "-", "_", ":" and "/".
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The length check runs on the untrimmed value, so a whitespace-only entry
// passes it and is rejected by the regex test below instead.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request, so the reload below happens after it has completed.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/android-clients.kp',
async: false
});
location.reload()
}
})
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// NOTE(review): registered inside the outer click handler, so each click on the
// edit icon adds another copy of this handler — confirm this is intended.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this sets a plain property on the wrapper
// and does not change the element; the .html() call above already set the text.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter in the input (key code 13) saves the tags and suppresses the default action.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-clients.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-clients.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Binds to every <form> on the page; returning false cancels the native submit.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-clients.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>
</body>
</html>

Просмотреть файл

@ -0,0 +1,14 @@
{
"title": "Android Clients ETL",
"authors": [
"Frank Bertsch"
],
"tags": [
"mobile",
"fennec",
"etl"
],
"publish_date": "2017-02-09",
"updated_at": "2017-02-09",
"tldr": "This notebook maps Fennec saved_session pings to some useful information about clients. This is a 1:1 mapping."
}

Просмотреть файл

@ -0,0 +1,565 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>Android Events ETL job</h1>
<span class="authors"><a href="/feed?authors=Frank+Bertsch">Frank Bertsch</a></span>
<span class="date_created">February 17, 2017</span>
<span class="date_updated">(Last Updated: February 27, 2017)</span>
<span class="tldr"><p>This job takes the Fennec saved session pings and transforms them, where there could be multiple events per ping.</p></span>
</div>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">datetime</span> <span class="kn">as</span> <span class="nn">dt</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">ujson</span> <span class="kn">as</span> <span class="nn">json</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings</span><span class="p">,</span> <span class="n">get_pings_properties</span><span class="p">,</span> <span class="n">get_one_ping_per_client</span>
<span class="o">%</span><span class="n">pylab</span> <span class="n">inline</span>
</pre></div>
<p>Take the set of pings, make sure we have actual clientIds and remove duplicate pings.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">dedupe_pings</span><span class="p">(</span><span class="n">rdd</span><span class="p">):</span>
<span class="k">return</span> <span class="n">rdd</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="n">p</span><span class="p">[</span><span class="s2">"meta/clientId"</span><span class="p">]</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="s2">"meta/documentId"</span><span class="p">],</span> <span class="n">p</span><span class="p">))</span>\
<span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
</pre></div>
<p>We're going to dump each event from the pings. Do a little empty data sanitization so we don't get NoneType errors during the dump. We create a JSON array of active experiments as part of the dump.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">safe_str</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span>
<span class="sd">""" return the byte string representation of obj """</span>
<span class="k">if</span> <span class="n">obj</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">unicode</span><span class="p">(</span><span class="s2">""</span><span class="p">)</span>
<span class="k">return</span> <span class="nb">unicode</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="n">output</span> <span class="o">=</span> <span class="p">[]</span>
<span class="c1"># These should not be None since we filter those out &amp; ingestion process adds the data</span>
<span class="n">clientId</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"meta/clientId"</span><span class="p">]</span>
<span class="n">submissionDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">ping</span><span class="p">[</span><span class="s2">"meta/submissionDate"</span><span class="p">],</span> <span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">events</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"payload/UIMeasurements"</span><span class="p">]</span>
<span class="k">if</span> <span class="n">events</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">events</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span>
<span class="k">for</span> <span class="n">event</span> <span class="ow">in</span> <span class="n">events</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">event</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="s2">"type"</span> <span class="ow">in</span> <span class="n">event</span> <span class="ow">and</span> <span class="n">event</span><span class="p">[</span><span class="s2">"type"</span><span class="p">]</span> <span class="o">==</span> <span class="s2">"event"</span><span class="p">:</span>
<span class="k">if</span> <span class="s2">"timestamp"</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">event</span> <span class="ow">or</span> <span class="s2">"action"</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">event</span> <span class="ow">or</span> <span class="s2">"method"</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">event</span> <span class="ow">or</span> <span class="s2">"sessions"</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">event</span><span class="p">:</span>
<span class="k">continue</span>
<span class="c1"># Verify timestamp is a long, otherwise ignore the event</span>
<span class="n">timestamp</span> <span class="o">=</span> <span class="bp">None</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">timestamp</span> <span class="o">=</span> <span class="nb">long</span><span class="p">(</span><span class="n">event</span><span class="p">[</span><span class="s2">"timestamp"</span><span class="p">])</span>
<span class="k">except</span><span class="p">:</span>
<span class="k">continue</span>
<span class="c1"># Force all fields to strings</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">safe_str</span><span class="p">(</span><span class="n">event</span><span class="p">[</span><span class="s2">"action"</span><span class="p">])</span>
<span class="n">method</span> <span class="o">=</span> <span class="n">safe_str</span><span class="p">(</span><span class="n">event</span><span class="p">[</span><span class="s2">"method"</span><span class="p">])</span>
<span class="c1"># The extras is an optional field</span>
<span class="n">extras</span> <span class="o">=</span> <span class="nb">unicode</span><span class="p">(</span><span class="s2">""</span><span class="p">)</span>
<span class="k">if</span> <span class="s2">"extras"</span> <span class="ow">in</span> <span class="n">event</span> <span class="ow">and</span> <span class="n">safe_str</span><span class="p">(</span><span class="n">event</span><span class="p">[</span><span class="s2">"extras"</span><span class="p">])</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">extras</span> <span class="o">=</span> <span class="n">safe_str</span><span class="p">(</span><span class="n">event</span><span class="p">[</span><span class="s2">"extras"</span><span class="p">])</span>
<span class="n">sessions</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="n">experiments</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">for</span> <span class="n">session</span> <span class="ow">in</span> <span class="n">event</span><span class="p">[</span><span class="s2">"sessions"</span><span class="p">]:</span>
<span class="k">if</span> <span class="s2">"experiment.1:"</span> <span class="ow">in</span> <span class="n">session</span><span class="p">:</span>
<span class="n">experiments</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">safe_str</span><span class="p">(</span><span class="n">session</span><span class="p">[</span><span class="mi">13</span><span class="p">:]))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sessions</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">safe_str</span><span class="p">(</span><span class="n">session</span><span class="p">))</span>
<span class="k">except</span> <span class="ne">TypeError</span><span class="p">:</span>
<span class="k">pass</span>
<span class="n">output</span><span class="o">.</span><span class="n">append</span><span class="p">([</span><span class="n">clientId</span><span class="p">,</span> <span class="n">submissionDate</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">,</span> <span class="n">action</span><span class="p">,</span> <span class="n">method</span><span class="p">,</span> <span class="n">extras</span><span class="p">,</span> <span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">sessions</span><span class="p">)),</span> <span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">experiments</span><span class="p">)])</span>
<span class="k">return</span> <span class="n">output</span>
</pre></div>
<p>The data can have duplicate events, due to a bug in the data collection that was fixed (bug 1246973). We still need to de-dupe the events. Because pings can be archived on device and submitted on later days, we can't assume dupes only happen on the same submission day. We don't use submission date when de-duping.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">dedupe_events</span><span class="p">(</span><span class="n">rdd</span><span class="p">):</span>
<span class="k">return</span> <span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span> <span class="n">safe_str</span><span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span> <span class="o">+</span> <span class="n">p</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span> <span class="o">+</span> <span class="n">p</span><span class="p">[</span><span class="mi">4</span><span class="p">],</span> <span class="n">p</span><span class="p">))</span>\
<span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
</pre></div>
<p>Create a set of events from “saved-session” UI telemetry. Output the data to CSV or Parquet.</p>
<p>This script is designed to loop over a range of days and output a single day for the given channels. Use explicit date ranges for backfilling, or now() - 1day for automated runs.</p>
<div class="codehilite"><pre><span></span><span class="n">channels</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"nightly"</span><span class="p">,</span> <span class="s2">"aurora"</span><span class="p">,</span> <span class="s2">"beta"</span><span class="p">,</span> <span class="s2">"release"</span><span class="p">]</span>
<span class="n">batch_date</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'date'</span><span class="p">)</span>
<span class="c1"># Process only the requested day, or yesterday when no date is given; both start and end must be set for the loop below.</span>
<span class="k">if</span> <span class="n">batch_date</span><span class="p">:</span>
    <span class="n">start</span> <span class="o">=</span> <span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">batch_date</span><span class="p">,</span> <span class="s1">'%Y%m</span><span class="si">%d</span><span class="s1">'</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
    <span class="n">start</span> <span class="o">=</span> <span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span> <span class="o">-</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="n">day</span> <span class="o">=</span> <span class="n">start</span>
<span class="k">while</span> <span class="n">day</span> <span class="o">&lt;=</span> <span class="n">end</span><span class="p">:</span>
<span class="k">for</span> <span class="n">channel</span> <span class="ow">in</span> <span class="n">channels</span><span class="p">:</span>
<span class="k">print</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">channel: "</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">", date: "</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">pings</span> <span class="o">=</span> <span class="n">get_pings</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">app</span><span class="o">=</span><span class="s2">"Fennec"</span><span class="p">,</span> <span class="n">channel</span><span class="o">=</span><span class="n">channel</span><span class="p">,</span>
<span class="n">submission_date</span><span class="o">=</span><span class="p">(</span><span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">),</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)),</span>
<span class="n">build_id</span><span class="o">=</span><span class="p">(</span><span class="s2">"20100101000000"</span><span class="p">,</span> <span class="s2">"99999999999999"</span><span class="p">),</span>
<span class="n">fraction</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="p">[</span><span class="s2">"meta/clientId"</span><span class="p">,</span>
<span class="s2">"meta/documentId"</span><span class="p">,</span>
<span class="s2">"meta/submissionDate"</span><span class="p">,</span>
<span class="s2">"payload/UIMeasurements"</span><span class="p">])</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">dedupe_pings</span><span class="p">(</span><span class="n">subset</span><span class="p">)</span>
<span class="k">print</span> <span class="n">subset</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">rawEvents</span> <span class="o">=</span> <span class="n">subset</span><span class="o">.</span><span class="n">flatMap</span><span class="p">(</span><span class="n">transform</span><span class="p">)</span>
<span class="k">print</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">Raw count: "</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">rawEvents</span><span class="o">.</span><span class="n">count</span><span class="p">())</span>
<span class="k">print</span> <span class="n">rawEvents</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">uniqueEvents</span> <span class="o">=</span> <span class="n">dedupe_events</span><span class="p">(</span><span class="n">rawEvents</span><span class="p">)</span>
<span class="k">print</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">Unique count: "</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">uniqueEvents</span><span class="o">.</span><span class="n">count</span><span class="p">())</span>
<span class="k">print</span> <span class="n">uniqueEvents</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">s3_output</span> <span class="o">=</span> <span class="s2">"s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mobile/android_events"</span>
<span class="n">s3_output</span> <span class="o">+=</span> <span class="s2">"/v1/channel="</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">"/submission="</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"clientid"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"submissiondate"</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"ts"</span><span class="p">,</span> <span class="n">LongType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"action"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"method"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"extras"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"sessions"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"experiments"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">)</span>
<span class="p">])</span>
<span class="n">grouped</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">uniqueEvents</span><span class="p">,</span> <span class="n">schema</span><span class="p">)</span>
<span class="n">grouped</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">s3_output</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"overwrite"</span><span class="p">)</span>
<span class="n">day</span> <span class="o">+=</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
</div></body></html>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
// Search box wiring (#searchbar): typeahead suggestions fetched from
// /ajax/index/typeahead, navigation to the selected post, and Enter-to-search
// on the /feed listing.
// Everything is guarded so that a page rendered without the search box does not
// throw on `$("#searchbar")[0]` and abort the rest of this script.
if ($('#searchbar').length) {
// Move the caret to the end of any pre-filled query text.
$("#searchbar")[0].setSelectionRange(1000, 1000);
$('#searchbar').typeahead({
hint: false,
highlight: true,
minLength: 1
},
{
name: 'knowledge_posts',
limit: 10,
display: function (item) {
return item.title + " - " + item.author;
},
templates: {
empty: Handlebars.compile(
'<div class="tt-not-found">' +
'Unable to find any posts that match the current query' +
'</div>'
),
// The returned string is inserted as markup, so escape the post fields first.
suggestion: function(data) {
return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + Handlebars.escapeExpression(data.title) + '</strong> – ' + Handlebars.escapeExpression(data.author) + '</p>';
}
},
source: function(q, sync, async) {
// Encode the query so characters such as & or # stay inside the parameter.
$.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
{
success: function(data,status){ async(JSON.parse(data)); }
})
}
});
// Selecting a suggestion opens that post.
$('#searchbar').on('typeahead:select', function(obj, datum, name) {
window.location = '/post/'+encodeURIComponent(datum.path);
});
// Pressing Enter (key code 13) runs a plain search on the feed page.
$('#searchbar').keypress(function(event){
var keycode = (event.keyCode ? event.keyCode : event.which);
if(keycode === 13){
window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val())
}
});
// Widen the suggestion menu to the search box width plus the menu's own outer width.
var padding = $('.tt-menu').outerWidth()
$('.tt-menu').width($('#searchbar').width() + padding + "px")
}
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// On DOM ready: initialise the tooltips and wire the view-mode buttons
// (rendered / raw / web editor) to their URLs for this post.
$("document").ready(function () {
    var post_path = "etl/android-events.kp";

    // Arguments handed to tooltipsJx.initializeTooltips, in call order.
    var tooltip_args = {
        is_webeditor: false,
        post_id: "4",
        id: "None",
        data_repo_github_root: ""
    };
    tooltipsJx.initializeTooltips(
        tooltip_args.is_webeditor,
        tooltip_args.post_id,
        tooltip_args.id,
        tooltip_args.data_repo_github_root
    );

    // Returns a click handler that navigates to prefix + encoded path + suffix.
    var goTo = function (prefix, suffix) {
        return function () {
            document.location.href = prefix + encodeURI(post_path) + suffix;
        };
    };

    $(".btn-rendered").on("click", goTo("/post/", ""));
    $(".btn-raw").on("click", goTo("/post/", "?render=raw"));
    $(".btn-webeditor").on("click", goTo("/edit/", ""));
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration for this page, passed to MathJax.Hub.Config.
MathJax.Hub.Config({
// Load the tex2jax preprocessor extension.
extensions: ["tex2jax.js"],
// TeX as the input format, HTML-CSS as the output renderer.
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math delimiters: $...$ and \(...\).
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Display math delimiters: $$...$$ and \[...\].
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// NOTE(review): per the MathJax tex2jax docs this lets "\$" produce a literal dollar sign - confirm.
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in April 2017; load a pinned MathJax 2.x build from cdnjs instead. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js" type="text/javascript">
</script>
<script>
// On DOM ready: mark the "rendered" view button active, linkify headers, and
// wire up comment posting/deletion and tag-subscription buttons.
//
// Uses $(document).ready(...) rather than $(document).on('ready', ...): the
// "ready" event binding was removed in jQuery 3, where the old form is never
// invoked. .ready() works in every jQuery version.
$(document).ready(function () {
    // Make the Rendered Markdown Button active
    $(".btn-rendered").addClass("btn-active");

    // Initialize headers
    helpersJx.linkifyHeaders();

    // Initialize comments
    var post_path = 'etl/android-events.kp';

    // Post a new comment, then reload the page.
    $("#post_comment_btn").on('click', function () {
        var comment_author = 'knowledge_default';
        var post_author = 'Frank Bertsch';
        var post_title = 'Android Events ETL job';
        commentsJx.postComment(comment_author, post_author, post_title, post_path);
        location.reload();
    });

    // Delete buttons carry ids that start with "delete_comment"; the comment
    // id is the part of the element id after the "__" separator.
    var all_comment_delete_buttons = $("[id^=delete_comment]");
    $.each(all_comment_delete_buttons, function (i, v) {
        $(v).on("click", function () {
            var comment_id = v.id.split("__")[1];
            if (comment_id) {
                commentsJx.deleteComment(post_path, comment_id);
                location.reload();
            }
        });
    });

    // Delegated handler: subscribe/unsubscribe buttons live inside popovers
    // that are created later, so the listener is attached to <body>.
    $(document.body).on('click', "button[id^=tag-subscription]", function () {
        tagsJx.addTagSubscriptionListener(this);
    });
});
// Turn every h2-h6 header into a link (h1 is the post title and is skipped).
// The link target is "#" plus a slug built from the header's inner HTML:
// everything except letters, "-" and spaces is dropped, the rest is
// lower-cased and spaces become "-".
$("h2, h3, h4, h5, h6").each(function () {
    var inner_html = this.innerHTML;
    // Declared with var so it no longer leaks onto window as an implicit global.
    var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "");
    var inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-");
    this.innerHTML = "<a href='" + inner_link + "' class=link-reset>" + inner_html + "</a>";
});
//turn all the tags into links, similar to what's done on the feed page
// `tags` is the first element matching the selector (the .tags node of the post metadata).
var tags = $("#renderedMarkdown .metadata .tags")[0]
// Tags of this post, and the tags the viewer is subscribed to. The subscription list is
// empty here, so every tag takes the "unsubscribed" branch below.
var tags_list = ['mobile', 'etl']
var subscriptions_list = []
// Build one <a> per tag, each carrying a Bootstrap popover with a (un)subscribe button.
// NOTE(review): ahref, e_tag, f_tag, tag_name, tag_subscription_button_id_name, colon and
// comma are assigned without `var`, so they become globals.
$.each(tags_list, function(i,tag){
ahref = document.createElement("a")
// e_tag: URL-encoded tag for the href query string.
e_tag = encodeURIComponent(tag)
// f_tag: tag with "/" replaced by "__", used in attribute and id values.
// NOTE(review): a string pattern replaces only the first "/" - confirm that is intended.
f_tag = tag.replace("/", "__")
tag_name = "#" + tag
// The id starts with "tag-subscription", which is what the delegated
// "button[id^=tag-subscription]" click handler in this script matches.
tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
ahref.setAttribute("data-container", "body")
ahref.setAttribute("data-toggle", "popover")
ahref.setAttribute("data-placement", "bottom")
ahref.setAttribute("data-html", "true")
ahref.setAttribute("data-tag-name", f_tag)
// Popover content differs by subscription state: "Unsubscribe" vs "Subscribe" button.
if (subscriptions_list.indexOf(tag) >= 0) {
ahref.setAttribute("class", "label label-subscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-primary btn-unsubscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
" </button> " +
" </div>")
} else {
ahref.setAttribute("class", "label label-unsubscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-default btn-subscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
" </button> " +
" </div>")
}
ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
ahref.setAttribute("style", "font-weight:normal")
// Before the first tag, emit the "Tags: " label.
if (i == 0){
ahref.innerHTML = " "
colon = document.createElement("text")
colon.innerHTML = "<b>Tags</b>: "
tags.appendChild(colon)
}
ahref.innerHTML = ahref.innerHTML + tag_name
tags.appendChild(ahref)
// Separate tags with ", " except after the last one.
if (i != tags_list.length - 1){
comma = document.createElement("text")
comma.innerText = ", "
tags.appendChild(comma)
}
})
// Remove the node that follows the tags element.
// NOTE(review): this throws if `tags` has no next sibling (or if `tags` itself was not found).
tags.nextSibling.remove()
// Append the "edit tags" icon; the click handler for #tooltip-edit_tags is bound further down.
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
// Hover behaviour for every ".pop" element: the popover is shown when the
// pointer enters the trigger and stays open while the pointer is over the
// popover itself, so the button inside it can be clicked.
(function () {
    var HIDE_DELAY_MS = 300;
    var popover_options = { trigger: "manual", html: true, animation: false, delay: 100 };

    var $triggers = $(".pop").popover(popover_options);

    // Show on enter; hide again once the pointer leaves the popover itself.
    $triggers.on("mouseenter", function () {
        var trigger = this;
        $(trigger).popover("show");
        $(".popover").on("mouseleave", function () {
            $(trigger).popover('hide');
        });
    });

    // On leaving the trigger, wait briefly and hide only if no popover is hovered.
    $triggers.on("mouseleave", function () {
        var trigger = this;
        setTimeout(function () {
            var popover_hovered = $(".popover:hover").length;
            if (!popover_hovered) {
                $(trigger).popover("hide");
            }
        }, HIDE_DELAY_MS);
    });
})();
// Click on the "edit tags" icon: replace the rendered tag links with an inline form
// (text input pre-filled with the current tags plus a save icon).
// NOTE(review): this handler appears to contain two generations of the same feature -
// a hand-built DOM version first, then a jQuery/iconsJx/tagsJx version nested inside it.
// NOTE(review): previousSibling, tags_string, form, input, tags_text, icon_class, tag and
// tag_name are assigned without `var` in the first part, so they become globals.
$('#tooltip-edit_tags').click(function(){
// Hide the edit icon while the form is shown.
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// NOTE(review): previousSibling is assigned but not read anywhere in this handler.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Setting textContent discards every existing child of `tags` before the form is added.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate the comma-separated tags, POST them, then reload the page.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// NOTE(review): this overwrites the outer tags_list with the untrimmed pieces.
tags_list = tags_string.split(",")
// Allowed characters: letters, digits, "-", "_", ":" and "/" (case-insensitive).
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// NOTE(review): the empty check runs before trim(), so a whitespace-only entry
// falls through to the regex test and reports the "special characters" message.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous POST (async: false) of {"tags": "<raw input string>"} as JSON.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/android-events.kp',
async: false
});
location.reload()
}
})
// Remove the node following the tags element.
// NOTE(review): throws if there is no next sibling.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// NOTE(review): this binds another click handler for #tooltip-edit_tags every time the
// enclosing handler runs; presumably the icon returned by iconsJx.createEditTagsIcon()
// carries that id - confirm against icons.js.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// NOTE(review): tags_text is a jQuery object here, so this only sets an expando
// property on the wrapper; the label text was already set via .html() above.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter in the input saves the tags; returning false stops the default handling.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-events.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves the tags.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-events.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Submitting any form on the page saves the tags; returning false cancels the native submit.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-events.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,685 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="/static/modules/jquery/jquery.min.js"></script>
<script src="/static/modules/tether/js/tether.min.js"></script>
<script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="/static/modules/handlebars/js/handlebars.js"></script>
<script src="/static/js/helpers.js"></script>
<script src="/static/modules/select2/js/select2.min.js"></script>
<script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link rel="stylesheet" href="/static/modules/bootstrap/css/bootstrap.min.css">
<link rel="stylesheet" href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css">
<link rel="stylesheet" type='text/css' href="/static/css/custom.css">
<link rel="stylesheet" type='text/css' href="/static/modules/select2/css/select2.min.css">
<link href='https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700' rel='stylesheet' type='text/css'>
<link rel="shortcut icon" href="/static/images/favicon.png">
<link rel="stylesheet" href="/static/css/codehilite-friendly.css">
<style>
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /*hight of the spinner gif +2px to fix IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<!-- Top navigation bar: logo, section links, "Write a Post!" button and the search box. -->
<div class="navbar navbar-knowledge" role="navigation">
  <div class="container page-container">
    <!-- The logo is the only content of this link, so its alt text names the link target. -->
    <a href="/" aria-selected="false" class="logo-image navbar-text" style='margin-top:16px; margin-left: -7px'>
      <img width="125" src='/static/images/logo-white.svg' alt="Knowledge home">
    </a>
    <a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Home
    </a>
    <a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Favorites
    </a>
    <a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      About
    </a>
    <a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Stats
    </a>
    <a id="webposts_tab" href="/create"
       class="btn btn-primary navbar-text pull-right"
       style='border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699'>
      Write a Post!
    </a>
    <div class="pull-right">
      <div class="form-group">
        <!-- A placeholder is not a label; aria-label gives the search box an accessible name. -->
        <input class="form-control" id="searchbar" placeholder="Search for Knowledge" aria-label="Search for Knowledge" style="text-align:right">
      </div>
    </div>
  </div>
</div>
<div class="container page-container">
<br>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
<div class="btn-group" role="group">
<button type="button" class="btn btn-default btn-rendered">
View Post
</button>
<button type="button" class="btn btn-default btn-raw">
View Raw Markdown
</button>
</div>
</div>
<div class="col-md-2">
</div>
<div class="col-md-4 text-right">
<i class="glyphicon glyphicon-eye-open" id="pageview_stats" style='color: #9CA299'></i>
<div id="pageview_stats" style="display: inline-block">
Viewed 1 times by 1 different users
</div>
<i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop" style="font-size:16pt" id="tooltip-like" data-placement="bottom"
data-trigger="#tooltip-like"
data-container="body"
data-toggle="popover"
data-content='<div>Like This Post</div>'></i>
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class='metadata'>
<h1>Android Events ETL job</h1>
<span class='authors'><a href='/feed?authors=Frank+Bertsch'>Frank Bertsch</a></span>
<span class='date_created'>February 17, 2017</span>
<span class='date_updated'>(Last Updated: February 27, 2017)</span>
<span class='tldr'><p>This job takes the Fennec saved session pings and transforms them, where there could be multiple events per ping.</p></span>
<span class='tags'></span>
</div>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">datetime</span> <span class="kn">as</span> <span class="nn">dt</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">ujson</span> <span class="kn">as</span> <span class="nn">json</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings</span><span class="p">,</span> <span class="n">get_pings_properties</span><span class="p">,</span> <span class="n">get_one_ping_per_client</span>
<span class="o">%</span><span class="n">pylab</span> <span class="n">inline</span>
</pre></div>
<p>Take the set of pings, make sure we have actual clientIds and remove duplicate pings.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">dedupe_pings</span><span class="p">(</span><span class="n">rdd</span><span class="p">):</span>
<span class="k">return</span> <span class="n">rdd</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="n">p</span><span class="p">[</span><span class="s2">&quot;meta/clientId&quot;</span><span class="p">]</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="s2">&quot;meta/documentId&quot;</span><span class="p">],</span> <span class="n">p</span><span class="p">))</span>\
<span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
</pre></div>
<p>We&rsquo;re going to dump each event from the pings. Do a little empty data sanitization so we don&rsquo;t get NoneType errors during the dump. We create a JSON array of active experiments as part of the dump.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">safe_str</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot; return the byte string representation of obj &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">obj</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">unicode</span><span class="p">(</span><span class="s2">&quot;&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="nb">unicode</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="n">output</span> <span class="o">=</span> <span class="p">[]</span>
<span class="c1"># These should not be None since we filter those out &amp; ingestion process adds the data</span>
<span class="n">clientId</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;meta/clientId&quot;</span><span class="p">]</span>
<span class="n">submissionDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">ping</span><span class="p">[</span><span class="s2">&quot;meta/submissionDate&quot;</span><span class="p">],</span> <span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">events</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;payload/UIMeasurements&quot;</span><span class="p">]</span>
<span class="k">if</span> <span class="n">events</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">events</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span>
<span class="k">for</span> <span class="n">event</span> <span class="ow">in</span> <span class="n">events</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">event</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="s2">&quot;type&quot;</span> <span class="ow">in</span> <span class="n">event</span> <span class="ow">and</span> <span class="n">event</span><span class="p">[</span><span class="s2">&quot;type&quot;</span><span class="p">]</span> <span class="o">==</span> <span class="s2">&quot;event&quot;</span><span class="p">:</span>
<span class="k">if</span> <span class="s2">&quot;timestamp&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">event</span> <span class="ow">or</span> <span class="s2">&quot;action&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">event</span> <span class="ow">or</span> <span class="s2">&quot;method&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">event</span> <span class="ow">or</span> <span class="s2">&quot;sessions&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">event</span><span class="p">:</span>
<span class="k">continue</span>
<span class="c1"># Verify timestamp is a long, otherwise ignore the event</span>
<span class="n">timestamp</span> <span class="o">=</span> <span class="bp">None</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">timestamp</span> <span class="o">=</span> <span class="nb">long</span><span class="p">(</span><span class="n">event</span><span class="p">[</span><span class="s2">&quot;timestamp&quot;</span><span class="p">])</span>
<span class="k">except</span><span class="p">:</span>
<span class="k">continue</span>
<span class="c1"># Force all fields to strings</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">safe_str</span><span class="p">(</span><span class="n">event</span><span class="p">[</span><span class="s2">&quot;action&quot;</span><span class="p">])</span>
<span class="n">method</span> <span class="o">=</span> <span class="n">safe_str</span><span class="p">(</span><span class="n">event</span><span class="p">[</span><span class="s2">&quot;method&quot;</span><span class="p">])</span>
<span class="c1"># The extras is an optional field</span>
<span class="n">extras</span> <span class="o">=</span> <span class="nb">unicode</span><span class="p">(</span><span class="s2">&quot;&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="s2">&quot;extras&quot;</span> <span class="ow">in</span> <span class="n">event</span> <span class="ow">and</span> <span class="n">safe_str</span><span class="p">(</span><span class="n">event</span><span class="p">[</span><span class="s2">&quot;extras&quot;</span><span class="p">])</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">extras</span> <span class="o">=</span> <span class="n">safe_str</span><span class="p">(</span><span class="n">event</span><span class="p">[</span><span class="s2">&quot;extras&quot;</span><span class="p">])</span>
<span class="n">sessions</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="n">experiments</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">for</span> <span class="n">session</span> <span class="ow">in</span> <span class="n">event</span><span class="p">[</span><span class="s2">&quot;sessions&quot;</span><span class="p">]:</span>
<span class="k">if</span> <span class="s2">&quot;experiment.1:&quot;</span> <span class="ow">in</span> <span class="n">session</span><span class="p">:</span>
<span class="n">experiments</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">safe_str</span><span class="p">(</span><span class="n">session</span><span class="p">[</span><span class="mi">13</span><span class="p">:]))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sessions</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">safe_str</span><span class="p">(</span><span class="n">session</span><span class="p">))</span>
<span class="k">except</span> <span class="ne">TypeError</span><span class="p">:</span>
<span class="k">pass</span>
<span class="n">output</span><span class="o">.</span><span class="n">append</span><span class="p">([</span><span class="n">clientId</span><span class="p">,</span> <span class="n">submissionDate</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">,</span> <span class="n">action</span><span class="p">,</span> <span class="n">method</span><span class="p">,</span> <span class="n">extras</span><span class="p">,</span> <span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">sessions</span><span class="p">)),</span> <span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">experiments</span><span class="p">)])</span>
<span class="k">return</span> <span class="n">output</span>
</pre></div>
<p>The data can have duplicate events, due to a bug in the data collection that was fixed (bug 1246973). We still need to de-dupe the events. Because pings can be archived on device and submitted on later days, we can&rsquo;t assume dupes only happen on the same submission day. We don&rsquo;t use submission date when de-duping.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">dedupe_events</span><span class="p">(</span><span class="n">rdd</span><span class="p">):</span>
<span class="k">return</span> <span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span> <span class="n">safe_str</span><span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span> <span class="o">+</span> <span class="n">p</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span> <span class="o">+</span> <span class="n">p</span><span class="p">[</span><span class="mi">4</span><span class="p">],</span> <span class="n">p</span><span class="p">))</span>\
<span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
</pre></div>
<p>Create a set of events from &ldquo;saved-session&rdquo; UI telemetry. Output the data to CSV or Parquet.</p>
<p>This script is designed to loop over a range of days and output a single day for the given channels. Use explicit date ranges for backfilling, or now() - &lsquo;1day&rsquo; for automated runs.</p>
<div class="codehilite"><pre><span></span><span class="n">channels</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;nightly&quot;</span><span class="p">,</span> <span class="s2">&quot;aurora&quot;</span><span class="p">,</span> <span class="s2">&quot;beta&quot;</span><span class="p">,</span> <span class="s2">&quot;release&quot;</span><span class="p">]</span>
<span class="n">batch_date</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;date&#39;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">batch_date</span><span class="p">:</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">batch_date</span><span class="p">,</span> <span class="s1">&#39;%Y%m</span><span class="si">%d</span><span class="s1">&#39;</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span> <span class="o">-</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="n">day</span> <span class="o">=</span> <span class="n">start</span>
<span class="k">while</span> <span class="n">day</span> <span class="o">&lt;=</span> <span class="n">end</span><span class="p">:</span>
<span class="k">for</span> <span class="n">channel</span> <span class="ow">in</span> <span class="n">channels</span><span class="p">:</span>
<span class="k">print</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">channel: &quot;</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">&quot;, date: &quot;</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">pings</span> <span class="o">=</span> <span class="n">get_pings</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">app</span><span class="o">=</span><span class="s2">&quot;Fennec&quot;</span><span class="p">,</span> <span class="n">channel</span><span class="o">=</span><span class="n">channel</span><span class="p">,</span>
<span class="n">submission_date</span><span class="o">=</span><span class="p">(</span><span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">),</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)),</span>
<span class="n">build_id</span><span class="o">=</span><span class="p">(</span><span class="s2">&quot;20100101000000&quot;</span><span class="p">,</span> <span class="s2">&quot;99999999999999&quot;</span><span class="p">),</span>
<span class="n">fraction</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;meta/clientId&quot;</span><span class="p">,</span>
<span class="s2">&quot;meta/documentId&quot;</span><span class="p">,</span>
<span class="s2">&quot;meta/submissionDate&quot;</span><span class="p">,</span>
<span class="s2">&quot;payload/UIMeasurements&quot;</span><span class="p">])</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">dedupe_pings</span><span class="p">(</span><span class="n">subset</span><span class="p">)</span>
<span class="k">print</span> <span class="n">subset</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">rawEvents</span> <span class="o">=</span> <span class="n">subset</span><span class="o">.</span><span class="n">flatMap</span><span class="p">(</span><span class="n">transform</span><span class="p">)</span>
<span class="k">print</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">Raw count: &quot;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">rawEvents</span><span class="o">.</span><span class="n">count</span><span class="p">())</span>
<span class="k">print</span> <span class="n">rawEvents</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">uniqueEvents</span> <span class="o">=</span> <span class="n">dedupe_events</span><span class="p">(</span><span class="n">rawEvents</span><span class="p">)</span>
<span class="k">print</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">Unique count: &quot;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">uniqueEvents</span><span class="o">.</span><span class="n">count</span><span class="p">())</span>
<span class="k">print</span> <span class="n">uniqueEvents</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">s3_output</span> <span class="o">=</span> <span class="s2">&quot;s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mobile/android_events&quot;</span>
<span class="n">s3_output</span> <span class="o">+=</span> <span class="s2">&quot;/v1/channel=&quot;</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">&quot;/submission=&quot;</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;clientid&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;submissiondate&quot;</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;ts&quot;</span><span class="p">,</span> <span class="n">LongType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;action&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;method&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;extras&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;sessions&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;experiments&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">)</span>
<span class="p">])</span>
<span class="n">grouped</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">uniqueEvents</span><span class="p">,</span> <span class="n">schema</span><span class="p">)</span>
<span class="n">grouped</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">s3_output</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">&quot;overwrite&quot;</span><span class="p">)</span>
<span class="n">day</span> <span class="o">+=</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
<!-- Comment area: comment count heading, the comment text box and the
     "Post Comment" button (wired up by the inline script via #post_comment_btn). -->
<div class="container-fluid">
  <div class="row">
    <div class="col-md-12">
      <h2> 0 Comments </h2>
      <!-- "</br>" is not a valid tag; browsers recover by treating it as <br>,
           so plain <br> keeps the same rendering while being valid markup. -->
      <br>
      <div>
        <div class="row">
          <div class="col-md-12">
            <!-- textarea has no "type" attribute; aria-label gives the control an
                 accessible name, since the placeholder alone is not a label. -->
            <textarea class="form-control" id="comment-text" style="height:87px;" placeholder="Leave a comment..." aria-label="Comment"></textarea>
          </div>
        </div>
      </div>
      <br>
      <div>
        <button class="btn btn-primary" id="post_comment_btn" type="button">Post Comment</button>
      </div>
    </div>
  </div>
  <br>
  <div class="row">
    <div class="col-md-12">
    </div>
    <br>
  </div>
</div>
</div>
</div>
<!-- Page footer: Knowledge Repo attribution with a link to the 0.7.4 release,
     followed by index-status text.
     NOTE(review): "7 seconds ago" and "14 hours ago" are literal text in this
     markup, so they will not change over time; confirm whether that is
     acceptable for this page. -->
<div class="footer">
Served with <span class="glyphicon glyphicon-heart"></span> by <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a> <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br />
<i title="Last checked for updates: 7 seconds ago">Last indexed: 14 hours ago</i>
</div>
<script type='text/javascript'>
// Search box wiring: typeahead suggestions fetched from the server, plus
// Enter-to-search. Everything is skipped when the page has no #searchbar,
// because indexing the empty jQuery result would otherwise throw a TypeError
// on the first statement.
if ($('#searchbar').length) {
  // Move the caret to the end of any pre-filled query text.
  $("#searchbar")[0].setSelectionRange(1000, 1000);
  $('#searchbar').typeahead({
    hint: false,
    highlight: true,
    minLength: 1
  },
  {
    name: 'knowledge_posts',
    limit: 10,
    // Text placed in the input when a suggestion is chosen.
    display: function (item) {
      return item.title + " - " + item.author;
    },
    templates: {
      empty: Handlebars.compile(
        '<div class="tt-not-found">' +
        'Unable to find any posts that match the current query' +
        '</div>'
      ),
      // The fields come from the server response and are inserted as markup,
      // so escape them to keep titles/authors from being parsed as HTML.
      suggestion: function(data) {
        return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + Handlebars.escapeExpression(data.title) + '</strong> – ' + Handlebars.escapeExpression(data.author) + '</p>';
      }
    },
    // Asynchronous source: the query is URL-encoded so characters such as
    // "&", "#" or "+" are sent as part of the search term.
    source: function(q, sync, async) {
      $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
      {
        success: function(data,status){ async(JSON.parse(data)); }
      });
    }
  });
  // Selecting a suggestion opens that post.
  $('#searchbar').bind('typeahead:select', function(obj, datum, name) {
    window.location = '/post/'+encodeURIComponent(datum.path);
  });
  // Pressing Enter (key code 13) runs a feed search with the typed text.
  $('#searchbar').keypress(function(event){
    var keycode = (event.keyCode ? event.keyCode : event.which);
    if(keycode == '13'){
      window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
    }
  });
  // Size the suggestion menu from the search box width plus the menu's own
  // outer width.
  var padding = $('.tt-menu').outerWidth();
  $('.tt-menu').width($('#searchbar').width() + padding + "px");
}
</script>
<script src="/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Once the DOM is ready: initialise tooltips and wire the rendered / raw /
// web-editor view buttons for this post.
// $(document) is used instead of the string selector $("document"), which
// looks for a <document> element that never exists.
$(document).ready(function(){
  var is_webeditor = false;
  var post_id = "4";
  var id = "None";
  var post_path = "etl/android-events.kp";
  var data_repo_github_root = "";
  tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);
  // Rendered view of the post.
  $(".btn-rendered").on("click", function(){
    document.location.href = "/post/" + encodeURI(post_path);
  });
  // Raw (unrendered) view of the post.
  $(".btn-raw").on("click", function(){
    document.location.href = "/post/" + encodeURI(post_path) + "?render=raw";
  });
  // Web editor for the post.
  $(".btn-webeditor").on("click", function(){
    document.location.href = "/edit/" + encodeURI(post_path);
  });
});
</script>
<script src="/static/js/helpers.js"></script>
<script src="/static/js/tags.js" type="text/javascript"></script>
<script src="/static/js/icons.js" type="text/javascript"></script>
<script src="/static/js/comments.js" type="text/javascript"></script>
<!-- MathJax setup: TeX input, HTML-CSS output, with $...$ / \(...\) as inline
     and $$...$$ / \[...\] as display delimiters. The config block must appear
     before the MathJax.js loader below. -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired by the MathJax project; load a pinned 2.x
     release from cdnjs instead of the old "latest" path. -->
<script type="text/javascript"
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js">
</script>
<script>
// Post page behaviour: comment posting/deleting, header anchors, tag links
// with subscribe/unsubscribe popovers, and inline tag editing.
// Depends on helpersJx, commentsJx, tagsJx and iconsJx, which are expected to
// come from the /static/js/*.js files included on this page.
// NOTE(review): jQuery 3.0 removed support for binding "ready" through
// .on(); if the jQuery build loaded here is 3.x this handler never runs.
// Confirm the jQuery version.
$(document).on('ready', function(){
// Make the Rendered Markdown Button active
$(".btn-rendered").addClass("btn-active");
// Initialize headers
helpersJx.linkifyHeaders();
// Initialize comments
var post_path = 'etl/android-events.kp';
$("#post_comment_btn").on('click', function(){
// comment_author, post_author and post_title are assigned without var, so
// they become implicit globals.
comment_author = 'knowledge_default';
post_author = 'Frank Bertsch';
post_title = 'Android Events ETL job';
commentsJx.postComment(comment_author, post_author, post_title, post_path);
// NOTE(review): the reload is issued immediately after the call above; if
// postComment sends its request asynchronously the reload could race with
// it. Verify against comments.js.
location.reload();
});
// Every element whose id starts with "delete_comment" is a delete button; the
// comment id is the part of the element id after "__".
all_comment_delete_buttons = $("[id^=delete_comment]")
$.each(all_comment_delete_buttons, function(i,v){
$(v).on("click", function(){
var id = v.id;
var comment_id = id.split("__")[1];
if(comment_id) {
commentsJx.deleteComment(post_path, comment_id)
location.reload();
}
});
});
// Delegated handler, so it also covers subscription buttons that are created
// later inside popovers.
$(document.body).on('click',"button[id^=tag-subscription]",function () {
tagsJx.addTagSubscriptionListener($(this)[0]);
});
})
//Turn all the headers to be links
//h1 = Title, don't want that
// The anchor is built from the header's HTML: every character other than
// letters, "-" and space is dropped, then the text is lower-cased and spaces
// become "-".
// NOTE(review): helpersJx.linkifyHeaders() is also called in the ready handler
// above; check whether headers end up wrapped twice.
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function(index, value){
$.each(value, function(i, v){
var inner_html = v.innerHTML
inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "")
var inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-")
v.innerHTML = "<a href='" + inner_link + "' class=link-reset>" + inner_html + "</a>"
})
})
//turn all the tags into links, similar to what's done on the feed page
// `tags` is the first element matching the selector, or undefined when nothing
// matches; the appendChild calls below would then throw.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['mobile', 'etl']
var subscriptions_list = []
$.each(tags_list, function(i,tag){
// ahref, e_tag, f_tag, tag_name and tag_subscription_button_id_name are
// assigned without var (implicit globals).
ahref = document.createElement("a")
e_tag = encodeURIComponent(tag)
// String.replace with a string pattern replaces only the first "/".
f_tag = tag.replace("/", "__")
tag_name = "#" + tag
tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
// data-* attributes read by the Bootstrap popover set up further down.
ahref.setAttribute("data-container", "body")
ahref.setAttribute("data-toggle", "popover")
ahref.setAttribute("data-placement", "bottom")
ahref.setAttribute("data-html", "true")
ahref.setAttribute("data-tag-name", f_tag)
// Popover content is an Unsubscribe button for tags found in
// subscriptions_list, otherwise a Subscribe button.
if (subscriptions_list.indexOf(tag) >= 0) {
ahref.setAttribute("class", "label label-subscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-primary btn-unsubscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
" </button> " +
" </div>")
} else {
ahref.setAttribute("class", "label label-unsubscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-default btn-subscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
" </button> " +
" </div>")
}
ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
ahref.setAttribute("style", "font-weight:normal")
// The first tag is preceded by a bold "Tags: " prefix.
if (i == 0){
ahref.innerHTML = " "
colon = document.createElement("text")
colon.innerHTML = "<b>Tags</b>: "
tags.appendChild(colon)
}
ahref.innerHTML = ahref.innerHTML + tag_name
tags.appendChild(ahref)
// Comma separator after every tag except the last.
if (i != tags_list.length - 1){
comma = document.createElement("text")
comma.innerText = ", "
tags.appendChild(comma)
}
})
// Removes the node that directly follows the tags element; throws if there is
// no next sibling.
tags.nextSibling.remove()
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
// Hover-driven popovers: shown on mouseenter of the tag, hidden when the
// pointer leaves the popover, or 300 ms after leaving the tag if the pointer
// is not over a popover by then.
$(".pop").popover({ trigger: "manual" , html: true, animation:false, delay: 100})
.on("mouseenter", function () {
var _this = this;
$(this).popover("show");
$(".popover").on("mouseleave", function () {
$(_this).popover('hide');
});
}).on("mouseleave", function () {
var _this = this;
setTimeout(function () {
if (!$(".popover:hover").length) {
$(_this).popover("hide");
}
}, 300);
});
// Clicking the edit icon replaces the tag list with an inline form.
// NOTE(review): by brace matching, every statement from here to the final
// "});" is inside this click handler, including the second
// "#tooltip-edit_tags" click registration further down, so each click
// registers additional handlers. This looks like two generations of the tag
// editor merged together; confirm which one is intended.
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Setting textContent discards the existing children of the tags element
// (the tag links and the edit icon) before the form is appended.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate the comma-separated tags, POST them, then reload.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
tags_list = tags_string.split(",")
// Accepts letters, digits, "-", "_", ":" and "/"; note that the alert text
// below mentions only alphanumeric characters.
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The length check runs before trim(), so a whitespace-only entry reaches
// the regex test instead and fails there.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so it completes before the reload
// below.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/android-events.kp',
async: false
});
location.reload()
}
})
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// jQuery-based version of the tag editor; saving is delegated to
// tagsJx.changeAndSaveTags.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this assignment only sets a property
// on the wrapper and does not change the DOM element (the .html() call
// above already set the text).
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter (key code 13) in the input saves the tags; returning false stops the
// default key handling.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-events.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves the tags.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-events.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Applies to every <form> on the page: save the tags and return false to
// cancel the native submission.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/android-events.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>
</body>
</html>

Просмотреть файл

@ -0,0 +1,13 @@
{
"title": "Android Events ETL job",
"authors": [
"Frank Bertsch"
],
"tags": [
"mobile",
"etl"
],
"publish_date": "2017-02-17",
"updated_at": "2017-02-17",
"tldr": "This job takes the Fennec saved session pings and transforms them, where there could be multiple events per ping."
}

Просмотреть файл

@ -0,0 +1,602 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
/* Fixed-position box centred in the viewport: top/left at 50% with negative
   margins of half the box size. */
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
/* Font size for elements with the .table class. */
.table {
font-size: 14px;
}
/* Upper bound on the width of .modal-content. */
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>Churn to CSV</h1>
<span class="authors"><a href="/feed?authors=amiyaguchi">amiyaguchi</a></span>
<span class="date_created">March 07, 2016</span>
<span class="date_updated">(Last Updated: April 18, 2017)</span>
<span class="tldr"><p>Convert telemetry-parquet/churn to csv</p></span>
</div>
<h1 id="churn-to-csv">Churn to CSV</h1>
<p><a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1345217">Bug 1345217</a></p>
<p>This script turns the parquet dataset generated by <a href="https://github.com/mozilla/mozilla-reports/blob/master/etl/churn.kp/knowledge.md">churn notebook</a> into csv files.</p>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">boto3</span>
<span class="kn">import</span> <span class="nn">botocore</span>
<span class="kn">import</span> <span class="nn">gzip</span>
<span class="kn">from</span> <span class="nn">boto3.s3.transfer</span> <span class="kn">import</span> <span class="n">S3Transfer</span>
<span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span><span class="p">,</span> <span class="n">timedelta</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span>
<span class="k">def</span> <span class="nf">csv</span><span class="p">(</span><span class="n">f</span><span class="p">):</span>
<span class="k">return</span> <span class="s2">","</span><span class="o">.</span><span class="n">join</span><span class="p">([</span><span class="nb">unicode</span><span class="p">(</span><span class="n">a</span><span class="p">)</span> <span class="k">for</span> <span class="n">a</span> <span class="ow">in</span> <span class="n">f</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">fmt</span><span class="p">(</span><span class="n">d</span><span class="p">,</span> <span class="n">date_format</span><span class="o">=</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">):</span>
<span class="k">return</span> <span class="n">datetime</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">d</span><span class="p">,</span> <span class="n">date_format</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">collect_and_upload_csv</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">filename</span><span class="p">,</span> <span class="n">upload_config</span><span class="p">):</span>
<span class="sd">""" Collect the dataframe into a csv file and upload to target locations. """</span>
<span class="n">client</span> <span class="o">=</span> <span class="n">boto3</span><span class="o">.</span><span class="n">client</span><span class="p">(</span><span class="s1">'s3'</span><span class="p">,</span> <span class="s1">'us-west-2'</span><span class="p">)</span>
<span class="n">transfer</span> <span class="o">=</span> <span class="n">S3Transfer</span><span class="p">(</span><span class="n">client</span><span class="p">)</span>
<span class="k">print</span><span class="p">(</span><span class="s2">"{}: Writing output to {}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">datetime</span><span class="o">.</span><span class="n">utcnow</span><span class="p">(),</span> <span class="n">filename</span><span class="p">))</span>
<span class="c1"># Write the file out as gzipped csv</span>
<span class="k">with</span> <span class="n">gzip</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span> <span class="s1">'wb'</span><span class="p">)</span> <span class="k">as</span> <span class="n">fout</span><span class="p">:</span>
<span class="n">fout</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s2">","</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span> <span class="o">+</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="p">)</span>
<span class="k">print</span><span class="p">(</span><span class="s2">"{}: Wrote header to {}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">datetime</span><span class="o">.</span><span class="n">utcnow</span><span class="p">(),</span> <span class="n">filename</span><span class="p">))</span>
<span class="n">records</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">records</span><span class="p">:</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">fout</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">csv</span><span class="p">(</span><span class="n">r</span><span class="p">))</span>
<span class="n">fout</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">UnicodeEncodeError</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="k">print</span><span class="p">(</span><span class="s2">"{}: Error writing line: {} // {}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">datetime</span><span class="o">.</span><span class="n">utcnow</span><span class="p">(),</span> <span class="n">e</span><span class="p">,</span> <span class="n">r</span><span class="p">))</span>
<span class="k">print</span><span class="p">(</span><span class="s2">"{}: finished writing lines"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">datetime</span><span class="o">.</span><span class="n">utcnow</span><span class="p">()))</span>
<span class="c1"># upload files to s3</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">for</span> <span class="n">config</span> <span class="ow">in</span> <span class="n">upload_config</span><span class="p">:</span>
<span class="k">print</span><span class="p">(</span><span class="s2">"{}: Uploading to {} at s3://{}/{}/{}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">datetime</span><span class="o">.</span><span class="n">utcnow</span><span class="p">(),</span> <span class="n">config</span><span class="p">[</span><span class="s2">"name"</span><span class="p">],</span> <span class="n">config</span><span class="p">[</span><span class="s2">"bucket"</span><span class="p">],</span>
<span class="n">config</span><span class="p">[</span><span class="s2">"prefix"</span><span class="p">],</span> <span class="n">filename</span><span class="p">))</span>
<span class="n">s3_path</span> <span class="o">=</span> <span class="s2">"{}/{}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">config</span><span class="p">[</span><span class="s2">"prefix"</span><span class="p">],</span> <span class="n">filename</span><span class="p">)</span>
<span class="n">transfer</span><span class="o">.</span><span class="n">upload_file</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span> <span class="n">config</span><span class="p">[</span><span class="s2">"bucket"</span><span class="p">],</span> <span class="n">s3_path</span><span class="p">,</span>
<span class="n">extra_args</span><span class="o">=</span><span class="p">{</span><span class="s1">'ACL'</span><span class="p">:</span> <span class="s1">'bucket-owner-full-control'</span><span class="p">})</span>
<span class="k">except</span> <span class="n">botocore</span><span class="o">.</span><span class="n">exceptions</span><span class="o">.</span><span class="n">ClientError</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="k">print</span><span class="p">(</span><span class="s2">"File for {} already exists, skipping upload: {}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span> <span class="n">e</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">marginalize_dataframe</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">attributes</span><span class="p">,</span> <span class="n">aggregates</span><span class="p">):</span>
<span class="sd">""" Reduce the granularity of the dataset to the original set of attributes.</span>
<span class="sd"> The original set of attributes can be found on commit 2de3ef1 of mozilla-reports. """</span>
<span class="k">return</span> <span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">attributes</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">x</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">aggregates</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">convert_week</span><span class="p">(</span><span class="n">config</span><span class="p">,</span> <span class="n">week_start</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">""" Convert a given retention period from parquet to csv. """</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">config</span><span class="p">[</span><span class="s2">"source"</span><span class="p">])</span>
<span class="c1"># find the latest start date based on the dataset if not provided</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">week_start</span><span class="p">:</span>
<span class="n">start_dates</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">"week_start"</span><span class="p">)</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="n">week_start</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">start_dates</span><span class="p">)[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">week_start</span>
<span class="c1"># find the week end for the filename</span>
<span class="n">week_end</span> <span class="o">=</span> <span class="n">fmt</span><span class="p">(</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">week_start</span><span class="p">,</span> <span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span> <span class="o">+</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">6</span><span class="p">))</span>
<span class="k">print</span><span class="p">(</span><span class="s2">"Running for the week of {} to {}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">week_start</span><span class="p">,</span> <span class="n">week_end</span><span class="p">))</span>
<span class="c1"># find the target subset of data</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">week_start</span> <span class="o">==</span> <span class="n">week_start</span><span class="p">)</span>
<span class="c1"># marginalize the dataframe to the original attributes and upload to s3</span>
<span class="n">initial_attributes</span> <span class="o">=</span> <span class="p">[</span><span class="s1">'channel'</span><span class="p">,</span> <span class="s1">'geo'</span><span class="p">,</span> <span class="s1">'is_funnelcake'</span><span class="p">,</span>
<span class="s1">'acquisition_period'</span><span class="p">,</span> <span class="s1">'start_version'</span><span class="p">,</span> <span class="s1">'sync_usage'</span><span class="p">,</span>
<span class="s1">'current_version'</span><span class="p">,</span> <span class="s1">'current_week'</span><span class="p">,</span> <span class="s1">'is_active'</span><span class="p">]</span>
<span class="n">initial_aggregates</span> <span class="o">=</span> <span class="p">[</span><span class="s1">'n_profiles'</span><span class="p">,</span> <span class="s1">'usage_hours'</span><span class="p">,</span> <span class="s1">'sum_squared_usage_hours'</span><span class="p">]</span>
<span class="n">upload_df</span> <span class="o">=</span> <span class="n">marginalize_dataframe</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">initial_attributes</span><span class="p">,</span> <span class="n">initial_aggregates</span><span class="p">)</span>
<span class="n">filename</span> <span class="o">=</span> <span class="s2">"churn-{}-{}.by_activity.csv.gz"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">week_start</span><span class="p">,</span> <span class="n">week_end</span><span class="p">)</span>
<span class="n">collect_and_upload_csv</span><span class="p">(</span><span class="n">upload_df</span><span class="p">,</span> <span class="n">filename</span><span class="p">,</span> <span class="n">config</span><span class="p">[</span><span class="s2">"uploads"</span><span class="p">])</span>
<span class="c1"># Bug 1355988</span>
<span class="c1"># The size of the data explodes significantly with extra dimensions and is too</span>
<span class="c1"># large to fit into the driver memory. We can write directly to s3 from a</span>
<span class="c1"># dataframe.</span>
<span class="n">bucket</span> <span class="o">=</span> <span class="n">config</span><span class="p">[</span><span class="s1">'search_cohort'</span><span class="p">][</span><span class="s1">'bucket'</span><span class="p">]</span>
<span class="n">prefix</span> <span class="o">=</span> <span class="n">config</span><span class="p">[</span><span class="s1">'search_cohort'</span><span class="p">][</span><span class="s1">'prefix'</span><span class="p">]</span>
<span class="n">location</span> <span class="o">=</span> <span class="s2">"s3://{}/{}/week_start={}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">bucket</span><span class="p">,</span> <span class="n">prefix</span><span class="p">,</span> <span class="n">week_start</span><span class="p">)</span>
<span class="k">print</span><span class="p">(</span><span class="s2">"Saving additional search cohort churn data to {}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">location</span><span class="p">))</span>
<span class="n">search_attributes</span> <span class="o">=</span> <span class="p">[</span>
<span class="s1">'source'</span><span class="p">,</span> <span class="s1">'medium'</span><span class="p">,</span> <span class="s1">'campaign'</span><span class="p">,</span> <span class="s1">'content'</span><span class="p">,</span>
<span class="s1">'distribution_id'</span><span class="p">,</span> <span class="s1">'default_search_engine'</span><span class="p">,</span> <span class="s1">'locale'</span>
<span class="p">]</span>
<span class="n">attributes</span> <span class="o">=</span> <span class="n">initial_attributes</span> <span class="o">+</span> <span class="n">search_attributes</span>
<span class="n">upload_df</span> <span class="o">=</span> <span class="n">marginalize_dataframe</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">attributes</span><span class="p">,</span> <span class="n">initial_aggregates</span><span class="p">)</span>
<span class="n">upload_df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="n">location</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s1">'overwrite'</span><span class="p">,</span> <span class="n">compression</span><span class="o">=</span><span class="s1">'gzip'</span><span class="p">)</span>
<span class="k">print</span><span class="p">(</span><span class="s2">"Sucessfully finished churn_to_csv"</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">assert_valid_config</span><span class="p">(</span><span class="n">config</span><span class="p">):</span>
<span class="sd">""" Assert that the configuration looks correct. """</span>
<span class="c1"># This could be replaced with python schema's</span>
<span class="k">assert</span> <span class="nb">set</span><span class="p">([</span><span class="s2">"source"</span><span class="p">,</span> <span class="s2">"uploads"</span><span class="p">,</span> <span class="s2">"search_cohort"</span><span class="p">])</span><span class="o">.</span><span class="n">issubset</span><span class="p">(</span><span class="n">config</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="k">assert</span> <span class="nb">set</span><span class="p">([</span><span class="s2">"bucket"</span><span class="p">,</span> <span class="s2">"prefix"</span><span class="p">])</span><span class="o">.</span><span class="n">issubset</span><span class="p">(</span><span class="n">config</span><span class="p">[</span><span class="s1">'search_cohort'</span><span class="p">]</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="k">for</span> <span class="n">entry</span> <span class="ow">in</span> <span class="n">config</span><span class="p">[</span><span class="s2">"uploads"</span><span class="p">]:</span>
<span class="k">assert</span> <span class="nb">set</span><span class="p">([</span><span class="s2">"name"</span><span class="p">,</span> <span class="s2">"bucket"</span><span class="p">,</span> <span class="s2">"prefix"</span><span class="p">])</span><span class="o">.</span><span class="n">issubset</span><span class="p">(</span><span class="n">entry</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">moztelemetry.standards</span> <span class="kn">import</span> <span class="n">snap_to_beginning_of_week</span>
<span class="kn">from</span> <span class="nn">os</span> <span class="kn">import</span> <span class="n">environ</span>
<span class="n">config</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">"source"</span><span class="p">:</span> <span class="s2">"s3://telemetry-parquet/churn/v2"</span><span class="p">,</span>
<span class="s2">"uploads"</span><span class="p">:</span> <span class="p">[</span>
<span class="p">{</span>
<span class="s2">"name"</span><span class="p">:</span> <span class="s2">"Pipeline-Analysis"</span><span class="p">,</span>
<span class="s2">"bucket"</span><span class="p">:</span> <span class="s2">"net-mozaws-prod-us-west-2-pipeline-analysis"</span><span class="p">,</span>
<span class="s2">"prefix"</span><span class="p">:</span> <span class="s2">"mreid/churn"</span>
<span class="p">},</span>
<span class="p">{</span>
<span class="s2">"name"</span><span class="p">:</span> <span class="s2">"Dashboard"</span><span class="p">,</span>
<span class="s2">"bucket"</span><span class="p">:</span> <span class="s2">"net-mozaws-prod-metrics-data"</span><span class="p">,</span>
<span class="s2">"prefix"</span><span class="p">:</span> <span class="s2">"telemetry-churn"</span>
<span class="p">}</span>
<span class="p">],</span>
<span class="s2">"search_cohort"</span><span class="p">:</span> <span class="p">{</span>
<span class="s2">"bucket"</span><span class="p">:</span> <span class="s2">"net-mozaws-prod-us-west-2-pipeline-analysis"</span><span class="p">,</span>
<span class="s2">"prefix"</span><span class="p">:</span> <span class="s2">"amiyaguchi/churn_csv"</span>
<span class="p">}</span>
<span class="p">}</span>
<span class="n">assert_valid_config</span><span class="p">(</span><span class="n">config</span><span class="p">)</span>
<span class="c1"># Set to True to overwrite the configuration with debugging route</span>
<span class="k">if</span> <span class="bp">False</span><span class="p">:</span>
<span class="n">config</span><span class="p">[</span><span class="s2">"uploads"</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span>
<span class="p">{</span>
<span class="s2">"name"</span><span class="p">:</span> <span class="s2">"Testing"</span><span class="p">,</span>
<span class="s2">"bucket"</span><span class="p">:</span> <span class="s2">"net-mozaws-prod-us-west-2-pipeline-analysis"</span><span class="p">,</span>
<span class="s2">"prefix"</span><span class="p">:</span> <span class="s2">"amiyaguchi/churn_csv_testing"</span>
<span class="p">}</span>
<span class="p">]</span>
<span class="n">config</span><span class="p">[</span><span class="s1">'search_cohort'</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">"bucket"</span><span class="p">:</span> <span class="s2">"net-mozaws-prod-us-west-2-pipeline-analysis"</span><span class="p">,</span>
<span class="s2">"prefix"</span><span class="p">:</span> <span class="s2">"amiyaguchi/churn_csv_testing"</span>
<span class="p">}</span>
<span class="n">assert_valid_config</span><span class="p">(</span><span class="n">config</span><span class="p">)</span>
<span class="c1"># check for a date, in the case of a backfill</span>
<span class="n">env_date</span> <span class="o">=</span> <span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'date'</span><span class="p">)</span>
<span class="n">week_start</span> <span class="o">=</span> <span class="bp">None</span>
<span class="k">if</span> <span class="n">env_date</span><span class="p">:</span>
<span class="c1"># Churn waits 10 days for pings to be sent from the client</span>
<span class="n">week_start_date</span> <span class="o">=</span> <span class="n">snap_to_beginning_of_week</span><span class="p">(</span>
<span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">env_date</span><span class="p">,</span> <span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span>
<span class="s2">"Sunday"</span><span class="p">)</span>
<span class="n">week_start</span> <span class="o">=</span> <span class="n">fmt</span><span class="p">(</span><span class="n">week_start_date</span><span class="p">)</span>
<span class="n">convert_week</span><span class="p">(</span><span class="n">config</span><span class="p">,</span> <span class="n">week_start</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
</div></body></html>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
  // Search bar wiring: typeahead suggestions for posts, Enter-to-search, and
  // sizing of the suggestion menu.

  // Move the caret to the end of the search text (1000 is presumably just
  // "past any realistic query length"). Skipped when the page has no
  // #searchbar so the rest of this script still runs instead of throwing.
  if ($("#searchbar").length) {
    $("#searchbar")[0].setSelectionRange(1000, 1000);
  }

  $('#searchbar').typeahead({
    hint: false,
    highlight: true,
    minLength: 1
  },
  {
    name: 'knowledge_posts',
    limit: 10,
    // Text placed in the input when a suggestion is picked.
    display: function (item) {
      return item.title + " - " + item.author;
    },
    templates: {
      empty: Handlebars.compile(
        '<div class="tt-not-found">' +
        'Unable to find any posts that match the current query' +
        '</div>'
      ),
      suggestion: function (data) {
        // Title and author are inserted as text, not markup, so characters such
        // as "<" or "&" in a post title are displayed literally rather than
        // being parsed as HTML. The result is still returned as an HTML string.
        var row = $('<p style="overflow-wrap:break-word">')
          .append($('<strong class="text-rausch">').text(String(data.title)))
          .append(document.createTextNode(' – ' + data.author));
        return $('<div>').append(row).html();
      }
    },
    // Remote source: results are delivered through the async callback.
    source: function (q, sync, async) {
      // encodeURIComponent keeps "&", "#", "+" etc. inside the search term
      // instead of letting them cut off or alter the query string.
      $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q), {
        success: function (data, status) { async(JSON.parse(data)); }
      });
    }
  });

  // Picking a suggestion opens that post.
  $('#searchbar').bind('typeahead:select', function (obj, datum, name) {
    window.location = '/post/' + encodeURIComponent(datum.path);
  });

  // Pressing Enter (key code 13) runs a feed search for the typed text.
  $('#searchbar').keypress(function (event) {
    var keycode = (event.keyCode ? event.keyCode : event.which);
    if (keycode == '13') {
      window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
    }
  });

  // Widen the suggestion menu to the search bar's width plus the menu's own
  // outer width, as measured when this script runs.
  var padding = $('.tt-menu').outerWidth();
  $('.tt-menu').width($('#searchbar').width() + padding + "px");
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
  // When the DOM is ready: pass this post's identifiers to the tooltip helper
  // and make the view-mode buttons navigate to the matching URL for the post.
  $(function () {
    var post_path = "etl/churn_to_csv.kp";
    var is_webeditor = false;
    var post_id = "25";
    var id = "None";
    var data_repo_github_root = "";

    // NOTE(review): what initializeTooltips does with these values is defined
    // in tooltips.js, which is not visible here.
    tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);

    // Returns a click handler that sends the browser to prefix + post + suffix.
    var navigateTo = function (prefix, suffix) {
      return function () {
        document.location.href = prefix + encodeURI(post_path) + suffix;
      };
    };

    $(".btn-rendered").on("click", navigateTo("/post/", ""));
    $(".btn-raw").on("click", navigateTo("/post/", "?render=raw"));
    $(".btn-webeditor").on("click", navigateTo("/edit/", ""));
  });
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
  // MathJax 2 configuration: TeX input rendered as HTML-CSS.
  // Inline math is delimited by $...$ or \(...\); display math by $$...$$ or
  // \[...\]. processEscapes lets \$ produce a literal dollar sign.
  MathJax.Hub.Config({
    extensions: ["tex2jax.js"],
    jax: ["input/TeX", "output/HTML-CSS"],
    tex2jax: {
      inlineMath: [ ['$','$'], ["\\(","\\)"] ],
      displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
      processEscapes: true
    },
    "HTML-CSS": { availableFonts: ["TeX"] }
  });
</script>
<!-- The cdn.mathjax.org host was retired in 2017. Load a pinned MathJax 2
     release from cdnjs instead; it must stay on the 2.x line because the
     configuration above uses the MathJax.Hub API. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js" type="text/javascript">
</script>
<script>
// Page setup that must wait for the DOM: active view button, header links,
// comment posting/deleting and tag subscription buttons.
// Uses .ready() rather than .on('ready', ...): the event form is deprecated
// since jQuery 1.8 and removed in 3.0, and it never fires when attached after
// the document is already ready.
$(document).ready(function () {
  // Mark the "rendered markdown" button as the active view.
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'etl/churn_to_csv.kp';

  $("#post_comment_btn").on('click', function () {
    // Kept local so nothing leaks onto window.
    var comment_author = 'knowledge_default';
    var post_author = 'amiyaguchi';
    var post_title = 'Churn to CSV';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    location.reload();
  });

  // Delete buttons have ids starting with "delete_comment"; the comment id is
  // the part after the "__" separator. Buttons without one are ignored.
  $("[id^=delete_comment]").on("click", function () {
    var comment_id = this.id.split("__")[1];
    if (comment_id) {
      commentsJx.deleteComment(post_path, comment_id);
      location.reload();
    }
  });

  // Delegated on <body> so subscription buttons created later (for example
  // inside popovers) are handled too.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener(this);
  });
});
// Turn every h2-h6 heading into a link to its own anchor. h1 is the post
// title and is deliberately left alone.
// The anchor is derived from the heading's HTML: everything except ASCII
// letters, "-" and spaces is dropped, the rest is lower-cased and spaces
// become "-".
// NOTE(review): helpersJx.linkifyHeaders() is also called from the ready
// handler in this script; confirm the two do not wrap headings twice.
$("h2, h3, h4, h5, h6").each(function () {
  var heading_html = this.innerHTML;
  var anchor = heading_html
    .replace(/[^a-zA-Z\- ]/g, "")
    .toLowerCase()
    .split(" ")
    .join("-");
  this.innerHTML = "<a href='#" + anchor + "' class='link-reset'>" + heading_html + "</a>";
});
// Turn all the tags into links, similar to what's done on the feed page.
// Each tag becomes an <a> to its tag page carrying a popover with a
// Subscribe / Unsubscribe button.
// `tags`, `tags_list` and `subscriptions_list` stay script-level because the
// tag-editing handlers later in this script read and reassign them.
var tags = $("#renderedMarkdown .metadata .tags")[0];
var tags_list = ['churn', 'etl', 'csv'];
var subscriptions_list = [];

$.each(tags_list, function (i, tag) {
  // All per-tag values are local; nothing leaks onto window.
  var ahref = document.createElement("a");
  var e_tag = encodeURIComponent(tag);
  // Only the first "/" is replaced, as in the original markup generation.
  var f_tag = tag.replace("/", "__");
  var tag_name = "#" + tag;
  var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag;

  // The two popover variants differ only in these classes and the label.
  var is_subscribed = subscriptions_list.indexOf(tag) >= 0;
  var link_class = is_subscribed ? "label label-subscribed pop"
                                 : "label label-unsubscribed pop";
  var button_class = is_subscribed ? "btn btn-small btn-primary btn-unsubscribe"
                                   : "btn btn-small btn-default btn-subscribe";
  var icon_class = is_subscribed ? "glyphicon glyphicon-remove-sign glyphicon-white"
                                 : "glyphicon glyphicon-ok-sign glyphicon-filled";
  var button_label = is_subscribed ? "Unsubscribe" : "Subscribe";

  ahref.setAttribute("data-container", "body");
  ahref.setAttribute("data-toggle", "popover");
  ahref.setAttribute("data-placement", "bottom");
  ahref.setAttribute("data-html", "true");
  ahref.setAttribute("data-tag-name", f_tag);
  ahref.setAttribute("class", link_class);
  ahref.setAttribute("data-content", "<div class='content'>" +
    " <button class='" + button_class + "'" +
    " title='' " +
    " id='" + tag_subscription_button_id_name + "'> " +
    " <i class='" + icon_class + "'></i>" + button_label + " " +
    " </button> " +
    " </div>");
  ahref.setAttribute("href", "/tag_pages?tag=" + e_tag);
  ahref.setAttribute("style", "font-weight:normal");

  // The first tag is preceded by the "Tags: " caption.
  if (i == 0) {
    ahref.innerHTML = " ";
    var colon = document.createElement("text");
    colon.innerHTML = "<b>Tags</b>: ";
    tags.appendChild(colon);
  }
  ahref.innerHTML = ahref.innerHTML + tag_name;
  tags.appendChild(ahref);

  // Comma separator after every tag except the last.
  if (i != tags_list.length - 1) {
    var comma = document.createElement("text");
    comma.innerText = ", ";
    tags.appendChild(comma);
  }
});

// Drop the node that follows the tags element, if there is one; without the
// check a missing sibling would throw and abort the rest of the script.
if (tags.nextSibling) {
  tags.nextSibling.remove();
}

// Edit icon that switches the tag list into editing mode.
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>";
// Hover-driven popovers for every ".pop" element. They are created with a
// manual trigger and shown/hidden from the mouse handlers below, so a popover
// stays open while the pointer is over the popover itself.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on({
    // Pointer enters the trigger: show its popover, and hide it again once
    // the pointer leaves any element with the "popover" class.
    mouseenter: function () {
      var $trigger = $(this);
      $trigger.popover("show");
      $(".popover").on("mouseleave", function () {
        $trigger.popover("hide");
      });
    },
    // Pointer leaves the trigger: wait 300 ms, then hide unless a popover is
    // currently being hovered.
    mouseleave: function () {
      var $trigger = $(this);
      setTimeout(function () {
        if ($(".popover:hover").length === 0) {
          $trigger.popover("hide");
        }
      }, 300);
    }
  });
// Clicking the edit icon replaces the rendered tag links with an inline form:
// a "Tags: " caption, a text input pre-filled with the current tags, and a
// save icon. Most variables in this handler are assigned without `var`, so
// they end up as globals.
$('#tooltip-edit_tags').click(function(){
// Hide the edit icon while editing.
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// Assigned but not read again anywhere in this handler.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
// Despite its name, icon_class is the save icon element itself.
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Replaces everything currently inside the tags element with a single space
// before the form is appended.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate each comma-separated tag, then POST the raw input
// string and reload the page.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
tags_list = tags_string.split(",")
// Accepted tag characters: letters, digits, "-", "_", ":" and "/".
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The emptiness check runs on the untrimmed piece; the pattern check below
// runs on the trimmed one.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so it has finished before the reload
// on the line after it.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/churn_to_csv.kp',
async: false
});
location.reload()
}
})
// NOTE(review): everything from here to the end of this handler runs on every
// click of the edit icon: it removes the node after the tags element, inserts
// another edit icon and binds a further click handler on #tooltip-edit_tags.
// It looks like a second implementation of the same feature that ended up
// nested inside the first - confirm the intent before changing it.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this only sets a property on the
// wrapper; the visible caption was already set by .html() above.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter in the input (key code 13) saves the tags; returning false stops the
// event's default action.
// NOTE(review): validation and persistence presumably happen inside
// tagsJx.changeAndSaveTags (tags.js) - not visible here.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/churn_to_csv.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/churn_to_csv.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// This selector matches every <form> on the page, not only the form created
// above; returning false cancels the native submission.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/churn_to_csv.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,718 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="/static/modules/jquery/jquery.min.js"></script>
<script src="/static/modules/tether/js/tether.min.js"></script>
<script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="/static/modules/handlebars/js/handlebars.js"></script>
<script src="/static/js/helpers.js"></script>
<script src="/static/modules/select2/js/select2.min.js"></script>
<script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link rel="stylesheet" href="/static/modules/bootstrap/css/bootstrap.min.css">
<link rel="stylesheet" href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css">
<link rel="stylesheet" type='text/css' href="/static/css/custom.css">
<link rel="stylesheet" type='text/css' href="/static/modules/select2/css/select2.min.css">
<link href='https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700' rel='stylesheet' type='text/css'>
<link rel="shortcut icon" href="/static/images/favicon.png">
<link rel="stylesheet" href="/static/css/codehilite-friendly.css">
<style>
  /* Loading spinner, pinned to the centre of the viewport: the box is anchored
     at 50% / 50% and the negative margins pull it back by half the gif size. */
  .spinner {
    position: fixed;
    left: 50%;
    top: 50%;
    width: 100px;        /* width of the spinner gif */
    height: 102px;       /* height of the spinner gif, +2px to fix an IE8 issue */
    margin-top: -50px;   /* half the height of the spinner gif */
    margin-left: -50px;  /* half the width of the spinner gif */
    overflow: auto;
    text-align: center;
    z-index: 1234;
  }

  /* Table text size. */
  .table {
    font-size: 14px;
  }

  /* Upper bound on modal width. */
  .modal-content {
    max-width: 1024px;
  }
</style>
</head>
<body>
<!-- Site navigation bar: logo, section links, "Write a Post!" button and the
     search box (#searchbar is the element the typeahead script attaches to). -->
<div class="navbar navbar-knowledge" role="navigation">
<div class="container page-container">
<a href="/" aria-selected="false" class="logo-image navbar-text" style='margin-top:16px; margin-left: -7px'>
<!-- The logo is the only content of this link, so its alt text names the
     destination. img is a void element and takes no closing tag. -->
<img width="125" src='/static/images/logo-white.svg' alt="Knowledge home">
</a>
<a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Home
</a>
<a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Favorites
</a>
<a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style='margin-top:18px'>
About
</a>
<a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Stats
</a>
<a id="webposts_tab" href="/create"
class="btn btn-primary navbar-text pull-right"
style='border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699'>
Write a Post!
</a>
<div class="pull-right">
<div class="form-group">
<!-- A placeholder is not an accessible name; aria-label supplies one without
     changing the visual design. -->
<input class="form-control" id="searchbar" placeholder="Search for Knowledge" aria-label="Search for Knowledge" style="text-align:right">
</div>
</div>
</div>
</div>
<div class="container page-container">
<br>
<!-- Post toolbar: view-mode buttons on the left, page-view statistics and the
     "like" icon on the right. -->
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
<!-- .btn-rendered and .btn-raw are the hooks the page scripts bind click
     handlers to. -->
<div class="btn-group" role="group">
<button type="button" class="btn btn-default btn-rendered">
View Post
</button>
<button type="button" class="btn btn-default btn-raw">
View Raw Markdown
</button>
</div>
</div>
<div class="col-md-2">
</div>
<div class="col-md-4 text-right">
<!-- NOTE(review): the icon and the div below share id="pageview_stats"; ids
     must be unique within a document. Which of the two any script or style
     targets is not visible here, so confirm before renaming either. -->
<i class="glyphicon glyphicon-eye-open" id="pageview_stats" style='color: #9CA299'></i>
<div id="pageview_stats" style="display: inline-block">
Viewed 1 times by 1 different users
</div>
<!-- "Like" icon; the "pop" class is the selector the popover script uses.
     NOTE(review): an <i> is not focusable, so this control is presumably
     mouse-only - confirm how the like action is bound. -->
<i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop" style="font-size:16pt" id="tooltip-like" data-placement="bottom"
data-trigger="#tooltip-like"
data-container="body"
data-toggle="popover"
data-content='<div>Like This Post</div>'></i>
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class='metadata'>
<h1>Churn to CSV</h1>
<span class='authors'><a href='/feed?authors=amiyaguchi'>amiyaguchi</a></span>
<span class='date_created'>March 07, 2016</span>
<span class='date_updated'>(Last Updated: April 18, 2017)</span>
<span class='tldr'><p>Convert telemetry-parquet/churn to csv</p></span>
<span class='tags'></span>
</div>
<h1 id="churn-to-csv">Churn to CSV</h1>
<p><a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1345217">Bug 1345217</a></p>
<p>This script converts the Parquet dataset generated by the <a href="https://github.com/mozilla/mozilla-reports/blob/master/etl/churn.kp/knowledge.md">churn notebook</a> into CSV files.</p>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">boto3</span>
<span class="kn">import</span> <span class="nn">botocore</span>
<span class="kn">import</span> <span class="nn">gzip</span>
<span class="kn">from</span> <span class="nn">boto3.s3.transfer</span> <span class="kn">import</span> <span class="n">S3Transfer</span>
<span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span><span class="p">,</span> <span class="n">timedelta</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span>
<span class="k">def</span> <span class="nf">csv</span><span class="p">(</span><span class="n">f</span><span class="p">):</span>
<span class="k">return</span> <span class="s2">&quot;,&quot;</span><span class="o">.</span><span class="n">join</span><span class="p">([</span><span class="nb">unicode</span><span class="p">(</span><span class="n">a</span><span class="p">)</span> <span class="k">for</span> <span class="n">a</span> <span class="ow">in</span> <span class="n">f</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">fmt</span><span class="p">(</span><span class="n">d</span><span class="p">,</span> <span class="n">date_format</span><span class="o">=</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">):</span>
<span class="k">return</span> <span class="n">datetime</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">d</span><span class="p">,</span> <span class="n">date_format</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">collect_and_upload_csv</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">filename</span><span class="p">,</span> <span class="n">upload_config</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot; Collect the dataframe into a csv file and upload to target locations. &quot;&quot;&quot;</span>
<span class="n">client</span> <span class="o">=</span> <span class="n">boto3</span><span class="o">.</span><span class="n">client</span><span class="p">(</span><span class="s1">&#39;s3&#39;</span><span class="p">,</span> <span class="s1">&#39;us-west-2&#39;</span><span class="p">)</span>
<span class="n">transfer</span> <span class="o">=</span> <span class="n">S3Transfer</span><span class="p">(</span><span class="n">client</span><span class="p">)</span>
<span class="k">print</span><span class="p">(</span><span class="s2">&quot;{}: Writing output to {}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">datetime</span><span class="o">.</span><span class="n">utcnow</span><span class="p">(),</span> <span class="n">filename</span><span class="p">))</span>
<span class="c1"># Write the file out as gzipped csv</span>
<span class="k">with</span> <span class="n">gzip</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span> <span class="s1">&#39;wb&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">fout</span><span class="p">:</span>
<span class="n">fout</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s2">&quot;,&quot;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span> <span class="o">+</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="k">print</span><span class="p">(</span><span class="s2">&quot;{}: Wrote header to {}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">datetime</span><span class="o">.</span><span class="n">utcnow</span><span class="p">(),</span> <span class="n">filename</span><span class="p">))</span>
<span class="n">records</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">records</span><span class="p">:</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">fout</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">csv</span><span class="p">(</span><span class="n">r</span><span class="p">))</span>
<span class="n">fout</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">UnicodeEncodeError</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="k">print</span><span class="p">(</span><span class="s2">&quot;{}: Error writing line: {} // {}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">datetime</span><span class="o">.</span><span class="n">utcnow</span><span class="p">(),</span> <span class="n">e</span><span class="p">,</span> <span class="n">r</span><span class="p">))</span>
<span class="k">print</span><span class="p">(</span><span class="s2">&quot;{}: finished writing lines&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">datetime</span><span class="o">.</span><span class="n">utcnow</span><span class="p">()))</span>
<span class="c1"># upload files to s3</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">for</span> <span class="n">config</span> <span class="ow">in</span> <span class="n">upload_config</span><span class="p">:</span>
<span class="k">print</span><span class="p">(</span><span class="s2">&quot;{}: Uploading to {} at s3://{}/{}/{}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">datetime</span><span class="o">.</span><span class="n">utcnow</span><span class="p">(),</span> <span class="n">config</span><span class="p">[</span><span class="s2">&quot;name&quot;</span><span class="p">],</span> <span class="n">config</span><span class="p">[</span><span class="s2">&quot;bucket&quot;</span><span class="p">],</span>
<span class="n">config</span><span class="p">[</span><span class="s2">&quot;prefix&quot;</span><span class="p">],</span> <span class="n">filename</span><span class="p">))</span>
<span class="n">s3_path</span> <span class="o">=</span> <span class="s2">&quot;{}/{}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">config</span><span class="p">[</span><span class="s2">&quot;prefix&quot;</span><span class="p">],</span> <span class="n">filename</span><span class="p">)</span>
<span class="n">transfer</span><span class="o">.</span><span class="n">upload_file</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span> <span class="n">config</span><span class="p">[</span><span class="s2">&quot;bucket&quot;</span><span class="p">],</span> <span class="n">s3_path</span><span class="p">,</span>
<span class="n">extra_args</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;ACL&#39;</span><span class="p">:</span> <span class="s1">&#39;bucket-owner-full-control&#39;</span><span class="p">})</span>
<span class="k">except</span> <span class="n">botocore</span><span class="o">.</span><span class="n">exceptions</span><span class="o">.</span><span class="n">ClientError</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="k">print</span><span class="p">(</span><span class="s2">&quot;File for {} already exists, skipping upload: {}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span> <span class="n">e</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">marginalize_dataframe</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">attributes</span><span class="p">,</span> <span class="n">aggregates</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot; Reduce the granularity of the dataset to the original set of attributes.</span>
<span class="sd"> The original set of attributes can be found on commit 2de3ef1 of mozilla-reports. &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">attributes</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">x</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">aggregates</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">convert_week</span><span class="p">(</span><span class="n">config</span><span class="p">,</span> <span class="n">week_start</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot; Convert a given retention period from parquet to csv. &quot;&quot;&quot;</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">config</span><span class="p">[</span><span class="s2">&quot;source&quot;</span><span class="p">])</span>
<span class="c1"># find the latest start date based on the dataset if not provided</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">week_start</span><span class="p">:</span>
<span class="n">start_dates</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;week_start&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="n">week_start</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">start_dates</span><span class="p">)[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">week_start</span>
<span class="c1"># find the week end for the filename</span>
<span class="n">week_end</span> <span class="o">=</span> <span class="n">fmt</span><span class="p">(</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">week_start</span><span class="p">,</span> <span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span> <span class="o">+</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">6</span><span class="p">))</span>
<span class="k">print</span><span class="p">(</span><span class="s2">&quot;Running for the week of {} to {}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">week_start</span><span class="p">,</span> <span class="n">week_end</span><span class="p">))</span>
<span class="c1"># find the target subset of data</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">week_start</span> <span class="o">==</span> <span class="n">week_start</span><span class="p">)</span>
<span class="c1"># marginalize the dataframe to the original attributes and upload to s3</span>
<span class="n">initial_attributes</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;channel&#39;</span><span class="p">,</span> <span class="s1">&#39;geo&#39;</span><span class="p">,</span> <span class="s1">&#39;is_funnelcake&#39;</span><span class="p">,</span>
<span class="s1">&#39;acquisition_period&#39;</span><span class="p">,</span> <span class="s1">&#39;start_version&#39;</span><span class="p">,</span> <span class="s1">&#39;sync_usage&#39;</span><span class="p">,</span>
<span class="s1">&#39;current_version&#39;</span><span class="p">,</span> <span class="s1">&#39;current_week&#39;</span><span class="p">,</span> <span class="s1">&#39;is_active&#39;</span><span class="p">]</span>
<span class="n">initial_aggregates</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;n_profiles&#39;</span><span class="p">,</span> <span class="s1">&#39;usage_hours&#39;</span><span class="p">,</span> <span class="s1">&#39;sum_squared_usage_hours&#39;</span><span class="p">]</span>
<span class="n">upload_df</span> <span class="o">=</span> <span class="n">marginalize_dataframe</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">initial_attributes</span><span class="p">,</span> <span class="n">initial_aggregates</span><span class="p">)</span>
<span class="n">filename</span> <span class="o">=</span> <span class="s2">&quot;churn-{}-{}.by_activity.csv.gz&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">week_start</span><span class="p">,</span> <span class="n">week_end</span><span class="p">)</span>
<span class="n">collect_and_upload_csv</span><span class="p">(</span><span class="n">upload_df</span><span class="p">,</span> <span class="n">filename</span><span class="p">,</span> <span class="n">config</span><span class="p">[</span><span class="s2">&quot;uploads&quot;</span><span class="p">])</span>
<span class="c1"># Bug 1355988</span>
<span class="c1"># The size of the data explodes significantly with extra dimensions and is too</span>
<span class="c1"># large to fit into the driver memory. We can write directly to s3 from a</span>
<span class="c1"># dataframe.</span>
<span class="n">bucket</span> <span class="o">=</span> <span class="n">config</span><span class="p">[</span><span class="s1">&#39;search_cohort&#39;</span><span class="p">][</span><span class="s1">&#39;bucket&#39;</span><span class="p">]</span>
<span class="n">prefix</span> <span class="o">=</span> <span class="n">config</span><span class="p">[</span><span class="s1">&#39;search_cohort&#39;</span><span class="p">][</span><span class="s1">&#39;prefix&#39;</span><span class="p">]</span>
<span class="n">location</span> <span class="o">=</span> <span class="s2">&quot;s3://{}/{}/week_start={}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">bucket</span><span class="p">,</span> <span class="n">prefix</span><span class="p">,</span> <span class="n">week_start</span><span class="p">)</span>
<span class="k">print</span><span class="p">(</span><span class="s2">&quot;Saving additional search cohort churn data to {}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">location</span><span class="p">))</span>
<span class="n">search_attributes</span> <span class="o">=</span> <span class="p">[</span>
<span class="s1">&#39;source&#39;</span><span class="p">,</span> <span class="s1">&#39;medium&#39;</span><span class="p">,</span> <span class="s1">&#39;campaign&#39;</span><span class="p">,</span> <span class="s1">&#39;content&#39;</span><span class="p">,</span>
<span class="s1">&#39;distribution_id&#39;</span><span class="p">,</span> <span class="s1">&#39;default_search_engine&#39;</span><span class="p">,</span> <span class="s1">&#39;locale&#39;</span>
<span class="p">]</span>
<span class="n">attributes</span> <span class="o">=</span> <span class="n">initial_attributes</span> <span class="o">+</span> <span class="n">search_attributes</span>
<span class="n">upload_df</span> <span class="o">=</span> <span class="n">marginalize_dataframe</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">attributes</span><span class="p">,</span> <span class="n">initial_aggregates</span><span class="p">)</span>
<span class="n">upload_df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="n">location</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s1">&#39;overwrite&#39;</span><span class="p">,</span> <span class="n">compression</span><span class="o">=</span><span class="s1">&#39;gzip&#39;</span><span class="p">)</span>
<span class="k">print</span><span class="p">(</span><span class="s2">&quot;Sucessfully finished churn_to_csv&quot;</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">assert_valid_config</span><span class="p">(</span><span class="n">config</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot; Assert that the configuration looks correct. &quot;&quot;&quot;</span>
<span class="c1"># This could be replaced with python schema&#39;s</span>
<span class="k">assert</span> <span class="nb">set</span><span class="p">([</span><span class="s2">&quot;source&quot;</span><span class="p">,</span> <span class="s2">&quot;uploads&quot;</span><span class="p">,</span> <span class="s2">&quot;search_cohort&quot;</span><span class="p">])</span><span class="o">.</span><span class="n">issubset</span><span class="p">(</span><span class="n">config</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="k">assert</span> <span class="nb">set</span><span class="p">([</span><span class="s2">&quot;bucket&quot;</span><span class="p">,</span> <span class="s2">&quot;prefix&quot;</span><span class="p">])</span><span class="o">.</span><span class="n">issubset</span><span class="p">(</span><span class="n">config</span><span class="p">[</span><span class="s1">&#39;search_cohort&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="k">for</span> <span class="n">entry</span> <span class="ow">in</span> <span class="n">config</span><span class="p">[</span><span class="s2">&quot;uploads&quot;</span><span class="p">]:</span>
<span class="k">assert</span> <span class="nb">set</span><span class="p">([</span><span class="s2">&quot;name&quot;</span><span class="p">,</span> <span class="s2">&quot;bucket&quot;</span><span class="p">,</span> <span class="s2">&quot;prefix&quot;</span><span class="p">])</span><span class="o">.</span><span class="n">issubset</span><span class="p">(</span><span class="n">entry</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">moztelemetry.standards</span> <span class="kn">import</span> <span class="n">snap_to_beginning_of_week</span>
<span class="kn">from</span> <span class="nn">os</span> <span class="kn">import</span> <span class="n">environ</span>
<span class="n">config</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">&quot;source&quot;</span><span class="p">:</span> <span class="s2">&quot;s3://telemetry-parquet/churn/v2&quot;</span><span class="p">,</span>
<span class="s2">&quot;uploads&quot;</span><span class="p">:</span> <span class="p">[</span>
<span class="p">{</span>
<span class="s2">&quot;name&quot;</span><span class="p">:</span> <span class="s2">&quot;Pipeline-Analysis&quot;</span><span class="p">,</span>
<span class="s2">&quot;bucket&quot;</span><span class="p">:</span> <span class="s2">&quot;net-mozaws-prod-us-west-2-pipeline-analysis&quot;</span><span class="p">,</span>
<span class="s2">&quot;prefix&quot;</span><span class="p">:</span> <span class="s2">&quot;mreid/churn&quot;</span>
<span class="p">},</span>
<span class="p">{</span>
<span class="s2">&quot;name&quot;</span><span class="p">:</span> <span class="s2">&quot;Dashboard&quot;</span><span class="p">,</span>
<span class="s2">&quot;bucket&quot;</span><span class="p">:</span> <span class="s2">&quot;net-mozaws-prod-metrics-data&quot;</span><span class="p">,</span>
<span class="s2">&quot;prefix&quot;</span><span class="p">:</span> <span class="s2">&quot;telemetry-churn&quot;</span>
<span class="p">}</span>
<span class="p">],</span>
<span class="s2">&quot;search_cohort&quot;</span><span class="p">:</span> <span class="p">{</span>
<span class="s2">&quot;bucket&quot;</span><span class="p">:</span> <span class="s2">&quot;net-mozaws-prod-us-west-2-pipeline-analysis&quot;</span><span class="p">,</span>
<span class="s2">&quot;prefix&quot;</span><span class="p">:</span> <span class="s2">&quot;amiyaguchi/churn_csv&quot;</span>
<span class="p">}</span>
<span class="p">}</span>
<span class="n">assert_valid_config</span><span class="p">(</span><span class="n">config</span><span class="p">)</span>
<span class="c1"># Set to True to overwrite the configuration with debugging route</span>
<span class="k">if</span> <span class="bp">False</span><span class="p">:</span>
<span class="n">config</span><span class="p">[</span><span class="s2">&quot;uploads&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span>
<span class="p">{</span>
<span class="s2">&quot;name&quot;</span><span class="p">:</span> <span class="s2">&quot;Testing&quot;</span><span class="p">,</span>
<span class="s2">&quot;bucket&quot;</span><span class="p">:</span> <span class="s2">&quot;net-mozaws-prod-us-west-2-pipeline-analysis&quot;</span><span class="p">,</span>
<span class="s2">&quot;prefix&quot;</span><span class="p">:</span> <span class="s2">&quot;amiyaguchi/churn_csv_testing&quot;</span>
<span class="p">}</span>
<span class="p">]</span>
<span class="n">config</span><span class="p">[</span><span class="s1">&#39;search_cohort&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">&quot;bucket&quot;</span><span class="p">:</span> <span class="s2">&quot;net-mozaws-prod-us-west-2-pipeline-analysis&quot;</span><span class="p">,</span>
<span class="s2">&quot;prefix&quot;</span><span class="p">:</span> <span class="s2">&quot;amiyaguchi/churn_csv_testing&quot;</span>
<span class="p">}</span>
<span class="n">assert_valid_config</span><span class="p">(</span><span class="n">config</span><span class="p">)</span>
<span class="c1"># check for a date, in the case of a backfill</span>
<span class="n">env_date</span> <span class="o">=</span> <span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;date&#39;</span><span class="p">)</span>
<span class="n">week_start</span> <span class="o">=</span> <span class="bp">None</span>
<span class="k">if</span> <span class="n">env_date</span><span class="p">:</span>
<span class="c1"># Churn waits 10 days for pings to be sent from the client</span>
<span class="n">week_start_date</span> <span class="o">=</span> <span class="n">snap_to_beginning_of_week</span><span class="p">(</span>
<span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">env_date</span><span class="p">,</span> <span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span>
<span class="s2">&quot;Sunday&quot;</span><span class="p">)</span>
<span class="n">week_start</span> <span class="o">=</span> <span class="n">fmt</span><span class="p">(</span><span class="n">week_start_date</span><span class="p">)</span>
<span class="n">convert_week</span><span class="p">(</span><span class="n">config</span><span class="p">,</span> <span class="n">week_start</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
<!-- Comment section: heading, comment box (#comment-text) and the
     "Post Comment" button (#post_comment_btn) that the inline script binds. -->
<div class="container-fluid">
  <div class="row">
    <div class="col-md-12">
      <h2> 0 Comments </h2>
      <br>
      <div>
        <div class="row">
          <div class="col-md-12">
            <!-- The placeholder is not a label, so an accessible name is supplied via aria-label. -->
            <textarea class="form-control" id="comment-text" style="height:87px;" placeholder="Leave a comment..." aria-label="Leave a comment"></textarea>
          </div>
        </div>
      </div>
      <br>
      <div>
        <!-- type="button": the click handler does the work; never act as an implicit submit. -->
        <button class="btn btn-primary" id="post_comment_btn" type="button">Post Comment</button>
      </div>
    </div>
  </div>
  <br>
  <div class="row">
    <div class="col-md-12">
    </div>
    <br>
  </div>
</div>
</div>
</div>
<!-- Page footer: credits Knowledge Repo and links to the 0.7.4 release tag.
     The "Last checked for updates" / "Last indexed" ages appear as literal text in this markup. -->
<div class="footer">
Served with <span class="glyphicon glyphicon-heart"></span> by <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a> <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br />
<i title="Last checked for updates: 4 minutes ago">Last indexed: 14 hours ago</i>
</div>
<script type='text/javascript'>
// Search box behaviour: typeahead suggestions for posts, navigation to the
// selected post, and Enter-to-search on the feed page.

// Move the caret to the end of any pre-filled query. Done through .each() so a
// page without #searchbar does not throw and abort the rest of this script.
$("#searchbar").each(function () {
  this.setSelectionRange(1000, 1000);
});

$('#searchbar').typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: 'knowledge_posts',
  limit: 10,
  // Text placed in the input when a suggestion is chosen.
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
      'Unable to find any posts that match the current query' +
      '</div>'
    ),
    // Title and author come from the server response; escape them before
    // building markup so they cannot inject HTML into the suggestion list.
    suggestion: function (data) {
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' +
        Handlebars.escapeExpression(data.title) + '</strong> – ' +
        Handlebars.escapeExpression(data.author) + '</p>';
    }
  },
  // Asynchronous source: results are delivered through the `async` callback;
  // `sync` is part of the typeahead source signature and is unused here.
  source: function (q, sync, async) {
    // Encode the query so characters such as &, # or + reach the server intact.
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
      {
        success: function (data, status) { async(JSON.parse(data)); }
      });
  }
});

// Selecting a suggestion opens that post.
$('#searchbar').bind('typeahead:select', function (obj, datum, name) {
  window.location = '/post/' + encodeURIComponent(datum.path);
});

// Pressing Enter (key code 13) runs a feed search with the typed text.
$('#searchbar').keypress(function (event) {
  var keycode = (event.keyCode ? event.keyCode : event.which);
  if (keycode == 13) {
    window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
  }
});

// Size the suggestion menu from the search box width plus the menu's own outer width.
var padding = $('.tt-menu').outerWidth()
$('.tt-menu').width($('#searchbar').width() + padding + "px")
</script>
<script src="/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// When the DOM is ready, initialise the tooltips and wire the three view-mode
// buttons (rendered / raw / web editor) to their routes for this post.
$(document).ready(function () {
  var postPath = "etl/churn_to_csv.kp";

  // Navigate the current document to the given URL.
  function navigateTo(url) {
    document.location.href = url;
  }

  // Arguments, in order: is_webeditor, post_id, id, data_repo_github_root.
  tooltipsJx.initializeTooltips(false, "25", "None", "");

  $(".btn-rendered").on("click", function () {
    navigateTo("/post/" + encodeURI(postPath));
  });
  $(".btn-raw").on("click", function () {
    navigateTo("/post/" + encodeURI(postPath) + "?render=raw");
  });
  $(".btn-webeditor").on("click", function () {
    navigateTo("/edit/" + encodeURI(postPath));
  });
});
</script>
<script src="/static/js/helpers.js"></script>
<script src="/static/js/tags.js" type="text/javascript"></script>
<script src="/static/js/icons.js" type="text/javascript"></script>
<script src="/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration: enable the tex2jax preprocessor with TeX input and
// HTML-CSS output.
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math delimiters: $...$ and \(...\)
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Display math delimiters: $$...$$ and \[...\]
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// Per the MathJax tex2jax docs, lets \$ stand for a literal dollar sign
// (needed because a bare $ is an inline delimiter above).
processEscapes: true
},
// Restrict the HTML-CSS output to the TeX font set.
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired on 2017-04-30; load a pinned MathJax 2.x release from cdnjs instead. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js"></script>
<script>
// Page initialisation: highlight the active view button, linkify headers and
// wire the comment and tag-subscription controls.
// Registered with .ready() rather than .on('ready', ...): the 'ready' event
// binding was removed in jQuery 3.0, and .ready() also runs the callback when
// the DOM has already finished loading.
$(document).ready(function () {
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'etl/churn_to_csv.kp';

  $("#post_comment_btn").on('click', function () {
    // Declared locally so they no longer leak onto window as implicit globals.
    var comment_author = 'knowledge_default';
    var post_author = 'amiyaguchi';
    var post_title = 'Churn to CSV';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): the page reloads right after postComment returns; if
    // postComment sends its request asynchronously the reload could cancel it.
    // Confirm against comments.js.
    location.reload();
  });

  // Each delete button carries its comment id after "__" in its element id.
  $("[id^=delete_comment]").each(function (i, v) {
    $(v).on("click", function () {
      var comment_id = v.id.split("__")[1];
      if (comment_id) {
        commentsJx.deleteComment(post_path, comment_id)
        location.reload();
      }
    });
  });

  // Delegated on <body> so it also covers subscription buttons that are
  // created later (they live inside popover content).
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener(this);
  });
})
// Turn every h2-h6 header into a link to "#<slug>".
// h1 is the post title and is deliberately left alone.
// The slug is the header's inner HTML with everything except ASCII letters,
// hyphens and spaces removed, lower-cased, with spaces replaced by hyphens.
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function (index, headers) {
  headers.each(function (i, header) {
    var inner_html = header.innerHTML
    // Declared with var so it no longer leaks as an implicit global.
    var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "")
    var inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-")
    header.innerHTML = "<a href='" + inner_link + "' class=link-reset>" + inner_html + "</a>"
  })
})
// Turn all the tags into links, similar to what's done on the feed page.
// Each tag becomes an <a> to its tag page carrying a popover with a
// Subscribe / Unsubscribe button.
// `tags` (the container element) and `tags_list` stay top-level vars because
// the tag-editing click handler later in this script reads them.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['churn', 'etl', 'csv']
var subscriptions_list = []
$.each(tags_list, function (i, tag) {
  // All per-tag names are declared locally; they used to leak as implicit globals.
  var ahref = document.createElement("a")
  var e_tag = encodeURIComponent(tag)
  // Replace every "/" - String.replace with a string pattern only replaces the first match.
  // NOTE(review): confirm tags.js maps "__" back to "/" for multi-segment tags.
  var f_tag = tag.replace(/\//g, "__")
  var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
  var is_subscribed = subscriptions_list.indexOf(tag) >= 0

  ahref.setAttribute("data-container", "body")
  ahref.setAttribute("data-toggle", "popover")
  ahref.setAttribute("data-placement", "bottom")
  ahref.setAttribute("data-html", "true")
  ahref.setAttribute("data-tag-name", f_tag)
  if (is_subscribed) {
    ahref.setAttribute("class", "label label-subscribed pop")
    ahref.setAttribute("data-content", "<div class='content'>" +
      " <button class='btn btn-small btn-primary btn-unsubscribe'" +
      " title='' " +
      " id='" + tag_subscription_button_id_name + "'> " +
      " <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
      " </button> " +
      " </div>")
  } else {
    ahref.setAttribute("class", "label label-unsubscribed pop")
    ahref.setAttribute("data-content", "<div class='content'>" +
      " <button class='btn btn-small btn-default btn-subscribe'" +
      " title='' " +
      " id='" + tag_subscription_button_id_name + "'> " +
      " <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
      " </button> " +
      " </div>")
  }
  ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
  ahref.setAttribute("style", "font-weight:normal")

  // The first tag is preceded by the "Tags: " label and starts with a space.
  if (i == 0) {
    var colon = document.createElement("text")
    colon.innerHTML = "<b>Tags</b>: "
    tags.appendChild(colon)
  }
  // Set as text, not markup, so the tag name can never be parsed as HTML.
  ahref.textContent = (i == 0 ? " " : "") + "#" + tag
  tags.appendChild(ahref)

  // Comma separator between tags, but not after the last one.
  if (i != tags_list.length - 1) {
    var comma = document.createElement("text")
    comma.innerText = ", "
    tags.appendChild(comma)
  }
})
// Drop the node that immediately follows the tag container.
tags.nextSibling.remove()
// Append the edit icon; the tag-editing click handler is bound to its id.
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
// Hover-driven popovers on every ".pop" element. The popover is shown and
// hidden manually so the pointer can travel from the trigger into the popover
// without it closing.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on("mouseenter", function () {
    var $trigger = $(this);
    $trigger.popover("show");
    // Once the popover exists, hide it when the pointer leaves the popover itself.
    $(".popover").on("mouseleave", function () {
      $trigger.popover('hide');
    });
  })
  .on("mouseleave", function () {
    var $trigger = $(this);
    // Wait 300 ms, then hide only if the pointer is not over a popover.
    setTimeout(function () {
      if ($(".popover:hover").length === 0) {
        $trigger.popover("hide");
      }
    }, 300);
  });
// Click handler for the edit-tags icon: replaces the tag links with an inline
// form (text input + save icon) so the tag list can be edited and saved.
// NOTE(review): this handler contains two editor implementations - a raw-DOM
// one at the top and, further down, a jQuery/iconsJx/tagsJx one that is
// registered from inside this handler. It looks like two template versions
// were merged; confirm which one is intended.
$('#tooltip-edit_tags').click(function(){
// Hide the edit icon while editing.
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// The names assigned below without var become implicit globals.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
// Save icon; its id is what the save click handler below is bound to.
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
// Text input pre-filled with the current comma-separated tags.
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Clear everything currently inside the tag container before inserting the form.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save: validate every comma-separated entry, then POST the raw input string.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
tags_list = tags_string.split(",")
// Allowed characters: letters, digits, "-", "_", ":" and "/" (case-insensitive).
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The length check runs before trimming, so a whitespace-only entry is
// reported by the special-characters branch instead.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
// The untrimmed input string is what gets sent; trimming above is for validation only.
postContent['tags'] = tags_string
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/churn_to_csv.kp',
// Synchronous request, so the reload below runs only after it has completed.
async: false
});
location.reload()
}
})
// Remove the node that follows the tag container.
tags.nextSibling.remove()
// Allow user to edit tags
// NOTE(review): everything from here to the end runs on each click of the
// edit icon, so the handlers below are registered again every time.
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Presumably binds to the icon just inserted (the earlier icon was removed
// when the container was cleared); this depends on the id iconsJx assigns - confirm.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this assignment only sets a property
// on the wrapper; the label text was already set by .html() above.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter (key code 13) in the input saves the tags; returning false stops the
// key event's default action.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/churn_to_csv.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves the tags.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/churn_to_csv.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// The selector matches every <form> on the page, not only the tag form;
// returning false cancels the native submission.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/churn_to_csv.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>
</body>
</html>

Просмотреть файл

@ -0,0 +1,14 @@
{
"title": "Churn to CSV",
"authors": [
"amiyaguchi"
],
"tags": [
"churn",
"etl",
"csv"
],
"publish_date": "2016-03-07",
"updated_at": "2016-03-07",
"tldr": "Convert telemetry-parquet/churn to csv"
}

Просмотреть файл

@ -0,0 +1,561 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
/* Loading spinner: pinned to the centre of the viewport. */
.spinner {
  /* box size matches the spinner gif; the extra 2px of height works around an IE8 issue */
  width: 100px;
  height: 102px;
  /* anchor the top-left corner at the viewport centre ... */
  position: fixed;
  left: 50%;
  top: 50%;
  /* ... then pull the box back by half the gif's width and height */
  margin-top: -50px;
  margin-left: -50px;
  overflow: auto;
  z-index:1234;
  text-align:center;
}
/* Font size for elements carrying the "table" class. */
.table {
  font-size: 14px;
}
/* Upper bound on the width of modal content. */
.modal-content {
  max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>Containers Testpilot Pipeline</h1>
<span class="authors"><a href="/feed?authors=Ryan+Harter+%28%3Aharter%29">Ryan Harter (:harter)</a></span>
<span class="date_created">March 08, 2017</span>
<span class="date_updated">(Last Updated: March 08, 2017)</span>
<span class="tldr"><p>Populates containers_testpilottest</p></span>
</div>
<div class="codehilite"><pre><span></span><span class="c1"># %load ~/cliqz_ping_pipeline/transform.py</span>
<span class="kn">import</span> <span class="nn">ujson</span> <span class="kn">as</span> <span class="nn">json</span>
<span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">split</span>
<span class="kn">import</span> <span class="nn">base64</span>
<span class="kn">from</span> <span class="nn">Crypto.Cipher</span> <span class="kn">import</span> <span class="n">AES</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings_properties</span>
<span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="k">class</span> <span class="nc">ColumnConfig</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">path</span><span class="p">,</span> <span class="n">cleaning_func</span><span class="p">,</span> <span class="n">struct_type</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span>
<span class="bp">self</span><span class="o">.</span><span class="n">path</span> <span class="o">=</span> <span class="n">path</span>
<span class="bp">self</span><span class="o">.</span><span class="n">cleaning_func</span> <span class="o">=</span> <span class="n">cleaning_func</span>
<span class="bp">self</span><span class="o">.</span><span class="n">struct_type</span> <span class="o">=</span> <span class="n">struct_type</span>
<span class="k">class</span> <span class="nc">DataFrameConfig</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col_configs</span><span class="p">,</span> <span class="n">ping_filter</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">ColumnConfig</span><span class="p">(</span><span class="o">*</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">col_configs</span><span class="p">]</span>
<span class="bp">self</span><span class="o">.</span><span class="n">ping_filter</span> <span class="o">=</span> <span class="n">ping_filter</span>
<span class="k">def</span> <span class="nf">toStructType</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="n">StructType</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">StructField</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">col</span><span class="o">.</span><span class="n">struct_type</span><span class="p">,</span> <span class="bp">True</span><span class="p">),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">get_names</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">col</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">get_paths</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">col</span><span class="o">.</span><span class="n">path</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">pings_to_df</span><span class="p">(</span><span class="n">sqlContext</span><span class="p">,</span> <span class="n">pings</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="p">):</span>
<span class="sd">"""Performs simple data pipelining on raw pings</span>
<span class="sd"> Arguments:</span>
<span class="sd"> data_frame_config: a list of tuples of the form:</span>
<span class="sd"> (name, path, cleaning_func, column_type)</span>
<span class="sd"> """</span>
<span class="n">filtered_pings</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">get_paths</span><span class="p">())</span>\
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">data_frame_config</span><span class="o">.</span><span class="n">ping_filter</span><span class="p">)</span>
<span class="k">return</span> <span class="n">config_to_df</span><span class="p">(</span><span class="n">sqlContext</span><span class="p">,</span> <span class="n">filtered_pings</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">config_to_df</span><span class="p">(</span><span class="n">sqlContext</span><span class="p">,</span> <span class="n">raw_data</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="p">):</span>
<span class="sd">"""Performs simple data pipelining on raw pings</span>
<span class="sd"> Arguments:</span>
<span class="sd"> data_frame_config: a list of tuples of the form:</span>
<span class="sd"> (name, path, cleaning_func, column_type)</span>
<span class="sd"> """</span>
<span class="k">def</span> <span class="nf">build_cell</span><span class="p">(</span><span class="n">ping</span><span class="p">,</span> <span class="n">column_config</span><span class="p">):</span>
<span class="sd">"""Takes a json ping and a column config and returns a cleaned cell"""</span>
<span class="n">raw_value</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="n">column_config</span><span class="o">.</span><span class="n">path</span><span class="p">]</span>
<span class="n">func</span> <span class="o">=</span> <span class="n">column_config</span><span class="o">.</span><span class="n">cleaning_func</span>
<span class="k">if</span> <span class="n">func</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">func</span><span class="p">(</span><span class="n">raw_value</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">raw_value</span>
<span class="k">def</span> <span class="nf">ping_to_row</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="k">return</span> <span class="p">[</span><span class="n">build_cell</span><span class="p">(</span><span class="n">ping</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">columns</span><span class="p">]</span>
<span class="k">return</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="n">raw_data</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">ping_to_row</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">(),</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">toStructType</span><span class="p">())</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">save_df</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">date_partition</span><span class="p">,</span> <span class="n">partitions</span><span class="o">=</span><span class="mi">1</span><span class="p">):</span>
<span class="k">if</span> <span class="n">date_partition</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">partition_str</span> <span class="o">=</span> <span class="s2">"/submission_date={day}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">day</span><span class="o">=</span><span class="n">date_partition</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">partition_str</span><span class="o">=</span><span class="s2">""</span>
<span class="c1"># TODO: this name should include the experiment name</span>
<span class="n">path_fmt</span> <span class="o">=</span> <span class="s2">"s3n://telemetry-parquet/harter/containers_{name}/v1{partition_str}"</span>
<span class="n">path</span> <span class="o">=</span> <span class="n">path_fmt</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span> <span class="n">partition_str</span><span class="o">=</span><span class="n">partition_str</span><span class="p">)</span>
<span class="n">df</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="n">partitions</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="s2">"overwrite"</span><span class="p">)</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">__main__</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sqlContext</span><span class="p">,</span> <span class="n">day</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">save</span><span class="o">=</span><span class="bp">True</span><span class="p">):</span>
<span class="k">if</span> <span class="n">day</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="c1"># Set day to yesterday</span>
<span class="n">day</span> <span class="o">=</span> <span class="p">(</span><span class="n">date</span><span class="o">.</span><span class="n">today</span><span class="p">()</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">get_doctype_pings</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">docType</span><span class="p">:</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">"telemetry"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="n">docType</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">submissionDate</span><span class="o">=</span><span class="n">day</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">appName</span><span class="o">=</span><span class="s2">"Firefox"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
<span class="n">testpilottest_df</span> <span class="o">=</span> <span class="n">pings_to_df</span><span class="p">(</span>
<span class="n">sqlContext</span><span class="p">,</span>
<span class="n">get_doctype_pings</span><span class="p">(</span><span class="s2">"testpilottest"</span><span class="p">),</span>
<span class="n">DataFrameConfig</span><span class="p">(</span>
<span class="p">[</span>
<span class="p">(</span><span class="s2">"uuid"</span><span class="p">,</span> <span class="s2">"payload/payload/uuid"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"userContextId"</span><span class="p">,</span> <span class="s2">"payload/payload/userContextId"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"clickedContainerTabCount"</span><span class="p">,</span> <span class="s2">"payload/payload/clickedContainerTabCount"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"eventSource"</span><span class="p">,</span> <span class="s2">"payload/payload/eventSource"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"event"</span><span class="p">,</span> <span class="s2">"payload/payload/event"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"hiddenContainersCount"</span><span class="p">,</span> <span class="s2">"payload/payload/hiddenContainersCount"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"shownContainersCount"</span><span class="p">,</span> <span class="s2">"payload/payload/shownContainersCount"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"totalContainersCount"</span><span class="p">,</span> <span class="s2">"payload/payload/totalContainersCount"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"totalContainerTabsCount"</span><span class="p">,</span> <span class="s2">"payload/payload/totalContainerTabsCount"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"totalNonContainerTabsCount"</span><span class="p">,</span> <span class="s2">"payload/payload/totalNonContainerTabsCount"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"test"</span><span class="p">,</span> <span class="s2">"payload/test"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">],</span>
<span class="k">lambda</span> <span class="n">ping</span><span class="p">:</span> <span class="n">ping</span><span class="p">[</span><span class="s1">'payload/test'</span><span class="p">]</span> <span class="o">==</span> <span class="s2">"@testpilot-containers"</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">save</span><span class="p">:</span>
<span class="n">save_df</span><span class="p">(</span><span class="n">testpilottest_df</span><span class="p">,</span> <span class="s2">"testpilottest"</span><span class="p">,</span> <span class="n">day</span><span class="p">,</span> <span class="n">partitions</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="k">return</span> <span class="n">testpilottest_df</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">tpt</span> <span class="o">=</span> <span class="n">__main__</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sqlContext</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">tpt</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>
</pre></div>
</div>
</div>
</div>
</div>
</div></body></html>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
// Search box wiring: typeahead suggestions, jump-to-post on selection, and a
// feed search when Enter is pressed.
//
// Everything is guarded on the presence of #searchbar. When the page has no
// search box, `$("#searchbar")[0]` is undefined and the unguarded
// setSelectionRange call threw, aborting this whole script.
if ($("#searchbar").length) {
  // Collapse the selection at the end of any pre-filled query text.
  $("#searchbar")[0].setSelectionRange(1000, 1000);
  $('#searchbar').typeahead({
    hint: false,
    highlight: true,
    minLength: 1
  },
  {
    name: 'knowledge_posts',
    limit: 10,
    // Text placed in the input when a suggestion is chosen.
    display: function (item) {
      return item.title + " - " + item.author;
    },
    templates: {
      empty: Handlebars.compile(
        '<div class="tt-not-found">' +
        'Unable to find any posts that match the current query' +
        '</div>'
      ),
      // NOTE(review): title and author are concatenated into markup as-is;
      // confirm the endpoint returns them already HTML-escaped.
      suggestion: function(data) {
        return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + data.title + '</strong> – ' + data.author + '</p>';
      }
    },
    // Suggestions are fetched from the server; only the async callback is used.
    source: function(q, sync, async) {
      // Encode the query so characters such as "&", "#" and "+" stay part of
      // the search term instead of altering the URL.
      $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
      {
        success: function(data,status){ async(JSON.parse(data)); }
      })
    }
  });
  // Choosing a suggestion navigates straight to that post.
  $('#searchbar').bind('typeahead:select', function(obj, datum, name) {
    window.location = '/post/'+encodeURIComponent(datum.path);
  });
  // Enter (key code 13) runs the typed text as a feed filter.
  $('#searchbar').keypress(function(event){
    var keycode = (event.keyCode ? event.keyCode : event.which);
    if(keycode == '13'){
      window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val())
    }
  });
  // Widen the suggestion menu to the input's width plus the menu's own outer width.
  var padding = $('.tt-menu').outerWidth()
  $('.tt-menu').width($('#searchbar').width() + padding + "px")
}
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Once the DOM is ready, initialise tooltips and wire the rendered / raw /
// web-editor buttons to the matching URLs for this post.
// Uses $(handler) rather than $("document").ready(handler): the latter only
// works because .ready() ignores the selection, and jQuery 3 deprecates it.
$(function(){
  var is_webeditor = false;
  var post_id = "7";
  var id = "None";
  var post_path = "etl/container_etl.kp";
  var data_repo_github_root = "";
  tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);
  // Rendered view of the post.
  $(".btn-rendered").on("click", function(){
    document.location.href = "/post/" + encodeURI(post_path);
  });
  // Same post URL with ?render=raw appended.
  $(".btn-raw").on("click", function(){
    document.location.href = "/post/" + encodeURI(post_path) + "?render=raw";
  });
  // Edit URL for the post.
  $(".btn-webeditor").on("click", function(){
    document.location.href = "/edit/" + encodeURI(post_path);
  });
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration for this page: TeX input via the tex2jax
// preprocessor, HTML-CSS output.
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math is delimited by $...$ or \(...\); display math by $$...$$ or \[...\].
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// NOTE(review): presumably enabled so "\$" can be written for a literal dollar
// sign, since single "$" is an inline delimiter here; confirm against MathJax docs.
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org has been retired; load a pinned MathJax 2.x release from cdnjs instead. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" type="text/javascript">
</script>
<script>
// On DOM ready: mark the "rendered" button active, linkify headers, and wire
// up comment posting, comment deletion and tag-subscription buttons.
//
// Registered with $(handler) instead of $(document).on('ready', ...): the
// "ready" pseudo-event was removed in jQuery 3 and, in older versions, never
// fires for handlers bound after the document is already ready.
$(function(){
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");
  // Initialize headers
  helpersJx.linkifyHeaders();
  // Initialize comments
  var post_path = 'etl/container_etl.kp';
  $("#post_comment_btn").on('click', function(){
    // These assignments have no `var`, so they create globals; left as-is in
    // case other scripts read them.
    comment_author = 'knowledge_default';
    post_author = 'Ryan Harter (:harter)';
    post_title = 'Containers Testpilot Pipeline';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): reload runs right after postComment returns; confirm the
    // request has completed by then (i.e. that postComment is synchronous).
    location.reload();
  });
  // Delete buttons carry ids starting with "delete_comment"; the comment id
  // is the part after the "__" separator.
  all_comment_delete_buttons = $("[id^=delete_comment]")
  $.each(all_comment_delete_buttons, function(i,v){
    $(v).on("click", function(){
      var id = v.id;
      var comment_id = id.split("__")[1];
      if(comment_id) {
        commentsJx.deleteComment(post_path, comment_id)
        location.reload();
      }
    });
  });
  // Delegated on <body>, so buttons created later (these ids are generated
  // into the tag popover markup) are handled too.
  $(document.body).on('click',"button[id^=tag-subscription]",function () {
    tagsJx.addTagSubscriptionListener($(this)[0]);
  });
});
// Wrap the content of every h2-h6 in an anchor pointing at a slug derived
// from the heading itself. h1 is the post title and is left alone.
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")];
all_headers.forEach(function (headerSet) {
  headerSet.each(function (_, header) {
    var original = header.innerHTML;
    // Slug: keep only letters, hyphens and spaces, lower-case, spaces -> hyphens.
    inner_html_no_special = original.replace(/[^a-zA-Z\- ]/g, "");
    var slug = inner_html_no_special.toLowerCase().split(" ").join("-");
    header.innerHTML = "<a href='" + "#" + slug + "' class=link-reset>" + original + "</a>";
  });
});
// Render each entry of tags_list inside the post's ".tags" metadata element as
// a link to its tag page, with a hover popover holding a Subscribe or
// Unsubscribe button (similar to what's done on the feed page).
// NOTE(review): the .metadata block rendered in this page contains no element
// with class "tags"; if the selector matches nothing, `tags` is undefined and
// the appendChild calls below throw — confirm.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['Spark', 'ATMO', 'ETL']
var subscriptions_list = []
$.each(tags_list, function(i,tag){
// None of the names assigned below are declared with `var`, so they are globals.
ahref = document.createElement("a")
// e_tag: URL-encoded tag for the link target.
e_tag = encodeURIComponent(tag)
// f_tag: tag with "/" swapped for "__", used in attribute values and ids.
// NOTE(review): a string pattern replaces only the first "/" — confirm that tags
// never contain more than one.
f_tag = tag.replace("/", "__")
tag_name = "#" + tag
tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
// data-* attributes that configure the popover.
ahref.setAttribute("data-container", "body")
ahref.setAttribute("data-toggle", "popover")
ahref.setAttribute("data-placement", "bottom")
ahref.setAttribute("data-html", "true")
ahref.setAttribute("data-tag-name", f_tag)
// Subscribed tags get an Unsubscribe button in the popover, all others a Subscribe button.
if (subscriptions_list.indexOf(tag) >= 0) {
ahref.setAttribute("class", "label label-subscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-primary btn-unsubscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
" </button> " +
" </div>")
} else {
ahref.setAttribute("class", "label label-unsubscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-default btn-subscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
" </button> " +
" </div>")
}
ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
ahref.setAttribute("style", "font-weight:normal")
// The first tag is preceded by a bold "Tags: " label.
if (i == 0){
ahref.innerHTML = " "
colon = document.createElement("text")
colon.innerHTML = "<b>Tags</b>: "
tags.appendChild(colon)
}
ahref.innerHTML = ahref.innerHTML + tag_name
tags.appendChild(ahref)
// Comma separator after every tag except the last.
if (i != tags_list.length - 1){
comma = document.createElement("text")
comma.innerText = ", "
tags.appendChild(comma)
}
})
// Drop the node that directly follows the tags element, then append the
// "edit tags" icon (id tooltip-edit_tags) inside the tags element.
tags.nextSibling.remove()
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
// Popovers on ".pop" elements use a manual trigger: they are shown on
// mouseenter and hidden once the pointer has left both the tag and the popover.
$(".pop").popover({ trigger: "manual" , html: true, animation:false, delay: 100})
.on("mouseenter", function () {
var _this = this;
$(this).popover("show");
// Hide when the pointer leaves the popover itself.
$(".popover").on("mouseleave", function () {
$(_this).popover('hide');
});
}).on("mouseleave", function () {
var _this = this;
// Wait 300 ms so the pointer can travel from the tag onto the popover;
// hide only if no popover is being hovered by then.
setTimeout(function () {
if (!$(".popover:hover").length) {
$(_this).popover("hide");
}
}, 300);
});
// Clicking the edit icon replaces the rendered tag links with an inline form:
// a "Tags: " label, a text input pre-filled with the current tags, and a save icon.
// NOTE(review): this handler appears to contain two generations of the same
// feature (an inline validation + $.ajax save path, followed by an
// iconsJx/tagsJx based one). Both bind handlers to #tooltip-save_tags, so a
// save click may trigger both — confirm which path is intended.
$('#tooltip-edit_tags').click(function(){
// Hide the edit icon while editing.
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// Names below are assigned without `var` and therefore become globals.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Assigning textContent removes every existing child of the tags element
// (tag links and the edit icon) before the form is inserted.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save path 1: validate each comma-separated tag, POST the raw string, reload.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// Overwrites the outer tags_list with the edited values.
tags_list = tags_string.split(",")
// Allowed characters: letters, digits, "-", "_", ":" and "/".
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The emptiness check runs before trimming, so a whitespace-only entry
// falls through to the regex test below and is reported from there.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so the reload below runs only after
// the POST has returned.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/container_etl.kp',
async: false
});
location.reload()
}
})
// NOTE(review): from here on the code repeats the edit-icon setup using the
// iconsJx/tagsJx helpers, but it executes inside the click handler above —
// looks like a template merge artifact; confirm.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Registers an additional click handler on #tooltip-edit_tags each time the
// outer handler runs.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this sets a plain property on the
// wrapper rather than on the element; the label text comes from .html() above.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Save path 2: Enter key, save icon and form submit all delegate to
// tagsJx.changeAndSaveTags with the current input value.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/container_etl.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
// Returning false stops the key press from also submitting the form.
return false;
};
});
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/container_etl.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Bound to every <form> on the page, not only the tag form.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/container_etl.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,681 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="/static/modules/jquery/jquery.min.js"></script>
<script src="/static/modules/tether/js/tether.min.js"></script>
<script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="/static/modules/handlebars/js/handlebars.js"></script>
<script src="/static/js/helpers.js"></script>
<script src="/static/modules/select2/js/select2.min.js"></script>
<script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link rel="stylesheet" href="/static/modules/bootstrap/css/bootstrap.min.css">
<link rel="stylesheet" href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css">
<link rel="stylesheet" type='text/css' href="/static/css/custom.css">
<link rel="stylesheet" type='text/css' href="/static/modules/select2/css/select2.min.css">
<link href='https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700' rel='stylesheet' type='text/css'>
<link rel="shortcut icon" href="/static/images/favicon.png">
<link rel="stylesheet" href="/static/css/codehilite-friendly.css">
<style>
  /* Loading spinner: fixed at the centre of the viewport. The 50% offsets put
     its top-left corner at the centre and the negative margins pull it back
     by half of its own size. */
  .spinner {
    position: fixed;
    z-index: 1234;
    top: 50%;
    left: 50%;
    width: 100px;        /* width of the spinner gif */
    height: 102px;       /* height of the spinner gif, +2px to fix an IE8 issue */
    margin-top: -50px;   /* half the height of the spinner gif */
    margin-left: -50px;  /* half the width of the spinner gif */
    overflow: auto;
    text-align: center;
  }

  /* Page-level overrides for the stylesheets linked above. */
  .table {
    font-size: 14px;
  }

  .modal-content {
    max-width: 1024px;
  }
</style>
</head>
<body>
<!-- Site navigation bar: logo, section links, the "Write a Post!" button and the search box. -->
<div class="navbar navbar-knowledge" role="navigation">
  <div class="container page-container">
    <a href="/" aria-selected="false" class="logo-image navbar-text" style='margin-top:16px; margin-left: -7px'>
      <!-- The logo is this link's only content, so the alt text names the link destination.
           img is a void element and takes no end tag. -->
      <img width="125" src='/static/images/logo-white.svg' alt="Knowledge Repo home">
    </a>
    <a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Home
    </a>
    <a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Favorites
    </a>
    <a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      About
    </a>
    <a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Stats
    </a>
    <a id="webposts_tab" href="/create"
       class="btn btn-primary navbar-text pull-right"
       style='border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699'>
      Write a Post!
    </a>
    <div class="pull-right">
      <div class="form-group">
        <!-- A placeholder is not an accessible name; aria-label provides one without changing the layout. -->
        <input class="form-control" id="searchbar" placeholder="Search for Knowledge" aria-label="Search for Knowledge" style="text-align:right">
      </div>
    </div>
  </div>
</div>
<div class="container page-container">
<br>
<!-- Post toolbar: the "View Post" / "View Raw Markdown" switch on the left, and on
     the right the page-view counter and the "like" icon (a popover trigger). -->
<!-- NOTE(review): the id "pageview_stats" is used twice below, on the eye icon and on
     the counter text. Ids must be unique within a document. Confirm which element the
     external scripts look up before renaming either one. -->
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
<div class="btn-group" role="group">
<button type="button" class="btn btn-default btn-rendered">
View Post
</button>
<button type="button" class="btn btn-default btn-raw">
View Raw Markdown
</button>
</div>
</div>
<div class="col-md-2">
</div>
<div class="col-md-4 text-right">
<i class="glyphicon glyphicon-eye-open" id="pageview_stats" style='color: #9CA299'></i>
<div id="pageview_stats" style="display: inline-block">
Viewed 2 times by 1 different users
</div>
<i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop" style="font-size:16pt" id="tooltip-like" data-placement="bottom"
data-trigger="#tooltip-like"
data-container="body"
data-toggle="popover"
data-content='<div>Like This Post</div>'></i>
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class='metadata'>
<h1>Containers Testpilot Pipeline</h1>
<span class='authors'><a href='/feed?authors=Ryan+Harter+%28%3Aharter%29'>Ryan Harter (:harter)</a></span>
<span class='date_created'>March 08, 2017</span>
<span class='date_updated'>(Last Updated: March 08, 2017)</span>
<span class='tldr'><p>Populates containers_testpilottest</p></span>
<span class='tags'></span>
</div>
<div class="codehilite"><pre><span></span><span class="c1"># %load ~/cliqz_ping_pipeline/transform.py</span>
<span class="kn">import</span> <span class="nn">ujson</span> <span class="kn">as</span> <span class="nn">json</span>
<span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">split</span>
<span class="kn">import</span> <span class="nn">base64</span>
<span class="kn">from</span> <span class="nn">Crypto.Cipher</span> <span class="kn">import</span> <span class="n">AES</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings_properties</span>
<span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="k">class</span> <span class="nc">ColumnConfig</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">path</span><span class="p">,</span> <span class="n">cleaning_func</span><span class="p">,</span> <span class="n">struct_type</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span>
<span class="bp">self</span><span class="o">.</span><span class="n">path</span> <span class="o">=</span> <span class="n">path</span>
<span class="bp">self</span><span class="o">.</span><span class="n">cleaning_func</span> <span class="o">=</span> <span class="n">cleaning_func</span>
<span class="bp">self</span><span class="o">.</span><span class="n">struct_type</span> <span class="o">=</span> <span class="n">struct_type</span>
<span class="k">class</span> <span class="nc">DataFrameConfig</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col_configs</span><span class="p">,</span> <span class="n">ping_filter</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">ColumnConfig</span><span class="p">(</span><span class="o">*</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">col_configs</span><span class="p">]</span>
<span class="bp">self</span><span class="o">.</span><span class="n">ping_filter</span> <span class="o">=</span> <span class="n">ping_filter</span>
<span class="k">def</span> <span class="nf">toStructType</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="n">StructType</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">StructField</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">col</span><span class="o">.</span><span class="n">struct_type</span><span class="p">,</span> <span class="bp">True</span><span class="p">),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">get_names</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">col</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">get_paths</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">col</span><span class="o">.</span><span class="n">path</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">pings_to_df</span><span class="p">(</span><span class="n">sqlContext</span><span class="p">,</span> <span class="n">pings</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Performs simple data pipelining on raw pings</span>
<span class="sd"> Arguments:</span>
<span class="sd"> data_frame_config: a list of tuples of the form:</span>
<span class="sd"> (name, path, cleaning_func, column_type)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">filtered_pings</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">get_paths</span><span class="p">())</span>\
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">data_frame_config</span><span class="o">.</span><span class="n">ping_filter</span><span class="p">)</span>
<span class="k">return</span> <span class="n">config_to_df</span><span class="p">(</span><span class="n">sqlContext</span><span class="p">,</span> <span class="n">filtered_pings</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">config_to_df</span><span class="p">(</span><span class="n">sqlContext</span><span class="p">,</span> <span class="n">raw_data</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Performs simple data pipelining on raw pings</span>
<span class="sd"> Arguments:</span>
<span class="sd"> data_frame_config: a list of tuples of the form:</span>
<span class="sd"> (name, path, cleaning_func, column_type)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">build_cell</span><span class="p">(</span><span class="n">ping</span><span class="p">,</span> <span class="n">column_config</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Takes a json ping and a column config and returns a cleaned cell&quot;&quot;&quot;</span>
<span class="n">raw_value</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="n">column_config</span><span class="o">.</span><span class="n">path</span><span class="p">]</span>
<span class="n">func</span> <span class="o">=</span> <span class="n">column_config</span><span class="o">.</span><span class="n">cleaning_func</span>
<span class="k">if</span> <span class="n">func</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">func</span><span class="p">(</span><span class="n">raw_value</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">raw_value</span>
<span class="k">def</span> <span class="nf">ping_to_row</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="k">return</span> <span class="p">[</span><span class="n">build_cell</span><span class="p">(</span><span class="n">ping</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">columns</span><span class="p">]</span>
<span class="k">return</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="n">raw_data</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">ping_to_row</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">(),</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">toStructType</span><span class="p">())</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">save_df</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">date_partition</span><span class="p">,</span> <span class="n">partitions</span><span class="o">=</span><span class="mi">1</span><span class="p">):</span>
<span class="k">if</span> <span class="n">date_partition</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">partition_str</span> <span class="o">=</span> <span class="s2">&quot;/submission_date={day}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">day</span><span class="o">=</span><span class="n">date_partition</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">partition_str</span><span class="o">=</span><span class="s2">&quot;&quot;</span>
<span class="c1"># TODO: this name should include the experiment name</span>
<span class="n">path_fmt</span> <span class="o">=</span> <span class="s2">&quot;s3n://telemetry-parquet/harter/containers_{name}/v1{partition_str}&quot;</span>
<span class="n">path</span> <span class="o">=</span> <span class="n">path_fmt</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span> <span class="n">partition_str</span><span class="o">=</span><span class="n">partition_str</span><span class="p">)</span>
<span class="n">df</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="n">partitions</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="s2">&quot;overwrite&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">__main__</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sqlContext</span><span class="p">,</span> <span class="n">day</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">save</span><span class="o">=</span><span class="bp">True</span><span class="p">):</span>
<span class="k">if</span> <span class="n">day</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="c1"># Set day to yesterday</span>
<span class="n">day</span> <span class="o">=</span> <span class="p">(</span><span class="n">date</span><span class="o">.</span><span class="n">today</span><span class="p">()</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">get_doctype_pings</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">docType</span><span class="p">:</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">&quot;telemetry&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="n">docType</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">submissionDate</span><span class="o">=</span><span class="n">day</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">appName</span><span class="o">=</span><span class="s2">&quot;Firefox&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
<span class="n">testpilottest_df</span> <span class="o">=</span> <span class="n">pings_to_df</span><span class="p">(</span>
<span class="n">sqlContext</span><span class="p">,</span>
<span class="n">get_doctype_pings</span><span class="p">(</span><span class="s2">&quot;testpilottest&quot;</span><span class="p">),</span>
<span class="n">DataFrameConfig</span><span class="p">(</span>
<span class="p">[</span>
<span class="p">(</span><span class="s2">&quot;uuid&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/uuid&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;userContextId&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/userContextId&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;clickedContainerTabCount&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/clickedContainerTabCount&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;eventSource&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/eventSource&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;event&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/event&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;hiddenContainersCount&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/hiddenContainersCount&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;shownContainersCount&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/shownContainersCount&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;totalContainersCount&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/totalContainersCount&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;totalContainerTabsCount&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/totalContainerTabsCount&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;totalNonContainerTabsCount&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/totalNonContainerTabsCount&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;test&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/test&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">],</span>
<span class="k">lambda</span> <span class="n">ping</span><span class="p">:</span> <span class="n">ping</span><span class="p">[</span><span class="s1">&#39;payload/test&#39;</span><span class="p">]</span> <span class="o">==</span> <span class="s2">&quot;@testpilot-containers&quot;</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">save</span><span class="p">:</span>
<span class="n">save_df</span><span class="p">(</span><span class="n">testpilottest_df</span><span class="p">,</span> <span class="s2">&quot;testpilottest&quot;</span><span class="p">,</span> <span class="n">day</span><span class="p">,</span> <span class="n">partitions</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="k">return</span> <span class="n">testpilottest_df</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">tpt</span> <span class="o">=</span> <span class="n">__main__</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sqlContext</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">tpt</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>
</pre></div>
</div>
</div>
</div>
</div>
<!-- Comment thread: heading with the comment count, the new-comment box and its submit button. -->
<div class="container-fluid">
  <div class="row">
    <div class="col-md-12">
      <h2> 0 Comments </h2>
      <!-- br is a void element; the invalid "</br>" end tag is parsed as a line break anyway,
           so spelling it as a start tag keeps the rendering and removes the parse error. -->
      <br>
      <div>
        <div class="row">
          <div class="col-md-12">
            <!-- aria-label gives the box an accessible name; the placeholder alone does not. -->
            <textarea class="form-control" type="text" id="comment-text" style="height:87px;" placeholder="Leave a comment..." aria-label="Leave a comment"></textarea>
          </div>
        </div>
      </div>
      <br>
      <div>
        <button type="button" class="btn btn-primary" id="post_comment_btn">Post Comment</button>
      </div>
    </div>
  </div>
  <br>
  <div class="row">
    <div class="col-md-12">
    </div>
    <br>
  </div>
</div>
</div>
<!-- Page footer: names the Knowledge Repo release that served the page and shows index freshness. -->
<div class="footer">
  Served with <span class="glyphicon glyphicon-heart"></span> by <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a> <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br>
  <i title="Last checked for updates: 11 seconds ago">Last indexed: 17 hours ago</i>
</div>
<script type='text/javascript'>
// Search box wiring: typeahead suggestions fetched from the server, navigation
// to the chosen post, and a full search when Enter is pressed.

// Escapes a value for insertion into an HTML string. Suggestion markup is
// assembled by concatenation, so values returned by the typeahead endpoint
// must not be interpreted as markup.
function searchbarEscapeHtml(value) {
  return $('<div>').text(value == null ? '' : String(value)).html();
}

// Place the caret after any text already in the search box.
$("#searchbar")[0].setSelectionRange(1000, 1000);

$('#searchbar').typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: 'knowledge_posts',
  limit: 10,
  // Text placed in the input when a suggestion is chosen.
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
      'Unable to find any posts that match the current query' +
      '</div>'
    ),
    suggestion: function (data) {
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' +
        searchbarEscapeHtml(data.title) + '</strong> – ' +
        searchbarEscapeHtml(data.author) + '</p>';
    }
  },
  source: function (q, sync, async) {
    // The query is user input: encode it so characters such as & or # cannot
    // truncate or alter the request.
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
      {
        success: function (data, status) {
          // jQuery hands back an already-parsed object when the response is
          // served with a JSON content type; only parse raw text.
          async(typeof data === 'string' ? JSON.parse(data) : data);
        }
      })
  }
});

// Choosing a suggestion opens that post.
$('#searchbar').bind('typeahead:select', function (obj, datum, name) {
  window.location = '/post/' + encodeURIComponent(datum.path);
});

// Enter runs a full search on the feed page with the typed text as the filter.
$('#searchbar').keypress(function (event) {
  var keycode = (event.keyCode ? event.keyCode : event.which);
  if (keycode == '13') {
    window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
  }
});

// Widen the suggestion menu to the search box width plus the menu's own outer width.
var padding = $('.tt-menu').outerWidth()
$('.tt-menu').width($('#searchbar').width() + padding + "px")
</script>
<script src="/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Once the DOM is ready, initialise the tooltips and wire the toolbar buttons
// that switch between views of this post.
// The handler is attached to the document object itself: the string "document"
// is a selector for a <document> element, which does not exist.
$(document).ready(function () {
  var is_webeditor = false;
  var post_id = "7";
  var id = "None";
  var post_path = "etl/container_etl.kp";
  var data_repo_github_root = "";

  tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);

  // Each button navigates to a different view of the same post path.
  $(".btn-rendered").on("click", function () {
    document.location.href = "/post/" + encodeURI(post_path);
  });
  $(".btn-raw").on("click", function () {
    document.location.href = "/post/" + encodeURI(post_path) + "?render=raw";
  });
  $(".btn-webeditor").on("click", function () {
    document.location.href = "/edit/" + encodeURI(post_path);
  });
});
</script>
<script src="/static/js/helpers.js"></script>
<script src="/static/js/tags.js" type="text/javascript"></script>
<script src="/static/js/icons.js" type="text/javascript"></script>
<script src="/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration for math written inside posts.
MathJax.Hub.Config({
// Load the tex2jax preprocessor, which finds TeX delimiters in the page text.
extensions: ["tex2jax.js"],
// TeX input, rendered with the HTML-CSS output processor.
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math is delimited by $...$ or \(...\).
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Display math is delimited by $$...$$ or \[...\].
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// With single-dollar inline math enabled, this lets authors write \$ for a literal dollar sign.
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- MathJax is loaded from cdnjs at a pinned version: the project's own cdn.mathjax.org
     host was retired in 2017. Configuration comes from the inline x-mathjax-config
     block, so no config query parameter is needed. -->
<script type="text/javascript"
        src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js">
</script>
<script>
// Page initialisation: highlight the active view button, linkify headers and
// wire the comment and tag-subscription controls.
// .ready() is used instead of .on('ready', ...) because the "ready" event
// binding was removed in jQuery 3, where such a handler would never run.
$(document).ready(function () {
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'etl/container_etl.kp';

  $("#post_comment_btn").on('click', function () {
    var comment_author = 'knowledge_default';
    var post_author = 'Ryan Harter (:harter)';
    var post_title = 'Containers Testpilot Pipeline';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): the page reloads as soon as postComment returns. If
    // postComment sends its request asynchronously the reload could race
    // it - confirm in comments.js.
    location.reload();
  });

  // Delete buttons have ids of the form "delete_comment...__<comment id>".
  var all_comment_delete_buttons = $("[id^=delete_comment]");
  $.each(all_comment_delete_buttons, function (i, v) {
    $(v).on("click", function () {
      var comment_id = v.id.split("__")[1];
      if (comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });

  // Delegated on body so the handler also covers subscription buttons that
  // are added to the page after this code runs.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener($(this)[0]);
  });
})
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function (index, value) {
  $.each(value, function (i, v) {
    var inner_html = v.innerHTML;
    // Build the anchor name from the header markup: drop everything except
    // letters, hyphens and spaces, lower-case it, and join words with "-".
    // Declared with var so it no longer leaks onto the global object.
    var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "");
    var inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-");
    // Wrap the original header content in a link to that anchor.
    v.innerHTML = "<a href='" + inner_link + "' class=link-reset>" + inner_html + "</a>";
  })
})
//turn all the tags into links, similar to what's done on the feed page
// `tags`, `tags_list` and `subscriptions_list` stay top-level variables because
// the tag editor further down in this script reads them.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['Spark', 'ATMO', 'ETL']
var subscriptions_list = []

$.each(tags_list, function (i, tag) {
  // All per-tag values are locals; previously they leaked as implicit globals.
  var ahref = document.createElement("a")
  var e_tag = encodeURIComponent(tag)
  // NOTE(review): a string pattern replaces only the first "/" - confirm
  // whether tags with several slashes are expected by the subscription code.
  var f_tag = tag.replace("/", "__")
  var tag_name = "#" + tag
  var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag

  // Popover configuration read by the .popover() call that follows this block.
  ahref.setAttribute("data-container", "body")
  ahref.setAttribute("data-toggle", "popover")
  ahref.setAttribute("data-placement", "bottom")
  ahref.setAttribute("data-html", "true")
  ahref.setAttribute("data-tag-name", f_tag)

  // The popover body is a single button: Unsubscribe for tags the viewer
  // already follows, Subscribe otherwise.
  if (subscriptions_list.indexOf(tag) >= 0) {
    ahref.setAttribute("class", "label label-subscribed pop")
    ahref.setAttribute("data-content", "<div class='content'>" +
      " <button class='btn btn-small btn-primary btn-unsubscribe'" +
      " title='' " +
      " id='" + tag_subscription_button_id_name + "'> " +
      " <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
      " </button> " +
      " </div>")
  } else {
    ahref.setAttribute("class", "label label-unsubscribed pop")
    ahref.setAttribute("data-content", "<div class='content'>" +
      " <button class='btn btn-small btn-default btn-subscribe'" +
      " title='' " +
      " id='" + tag_subscription_button_id_name + "'> " +
      " <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
      " </button> " +
      " </div>")
  }
  ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
  ahref.setAttribute("style", "font-weight:normal")

  if (i == 0) {
    // The first tag is preceded by the "Tags: " caption and a leading space.
    ahref.innerHTML = " "
    var colon = document.createElement("text")
    colon.innerHTML = "<b>Tags</b>: "
    tags.appendChild(colon)
  }
  ahref.innerHTML = ahref.innerHTML + tag_name
  tags.appendChild(ahref)

  if (i != tags_list.length - 1) {
    var comma = document.createElement("text")
    comma.innerText = ", "
    tags.appendChild(comma)
  }
})

// Drop the node that follows the tags element, if there is one. The guard keeps
// the rest of the script running when nothing follows it.
if (tags.nextSibling) {
  tags.nextSibling.remove()
}

// Append the edit icon without re-parsing the links that were just built.
tags.insertAdjacentHTML("beforeend", "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>")
// Hover-driven popovers: opened and closed by hand ("manual" trigger) so that a
// popover stays open while the pointer is over either its trigger or the
// popover itself.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on({
    mouseenter: function () {
      var trigger = this;
      $(trigger).popover("show");
      // Leaving the popover closes it.
      $(".popover").on("mouseleave", function () {
        $(trigger).popover('hide');
      });
    },
    mouseleave: function () {
      var trigger = this;
      // Wait 300 ms so the pointer can travel from the trigger into the
      // popover; hide only if no popover is hovered by then.
      setTimeout(function () {
        if ($(".popover:hover").length === 0) {
          $(trigger).popover("hide");
        }
      }, 300);
    }
  });
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save handler for the upload icon: validate the comma-separated tags typed into
// the "change_tags" text box, POST them to the server as JSON, then reload the page.
$("#tooltip-save_tags").click(function(){
    // Assigned without `var`, as before, so that any variables of the same name in
    // the surrounding script keep being updated.
    tags_string = $("#change_tags")[0].value
    tags_list = tags_string.split(",")
    // A tag may contain letters, digits, "-", "_", ":" and "/".
    var re = /^[a-z0-9\-\_\:\/]+$/i
    var good = true
    for (var i = 0; i < tags_list.length; i++){
        // Trim before the emptiness check so that a whitespace-only entry
        // (e.g. "a, , b" or "a, ") is reported as an empty tag rather than as a
        // tag containing special characters.
        var tag_name = tags_list[i].trim()
        if (tag_name.length == 0){
            alert("There is a tag with length 0 - possible a trailing comma?")
            good = false
            break
        }
        if (!re.test(tag_name)){
            alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
            good = false
            break
        }
    }
    if (good) {
        $.ajax({
            type: "POST",
            dataType: "json",
            data: JSON.stringify({tags: tags_string}),
            contentType: "application/json",
            url: '/tag_list?post_path=etl/container_etl.kp',
            // Reload when the request has finished, whether it succeeded or failed,
            // instead of blocking the page with a synchronous request.
            complete: function(){
                location.reload()
            }
        });
    }
})
// Remove the DOM node that immediately follows `tags`.
tags.nextSibling.remove()
// Allow user to edit tags
// The icon is built by the `iconsJx` helper (defined outside this excerpt,
// presumably in helpers.js - confirm) and inserted directly after `tags`.
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Clicking the edit icon swaps the rendered tag list for an inline form
// (label + text input + save icon). The edited tags are saved when the user
// presses Enter in the input, clicks the save icon, or submits the form.
$("#tooltip-edit_tags").on("click", function(){
    // Path of the post whose tags are being edited.
    var post_path = "etl/container_etl.kp";

    // Single save routine shared by all three triggers below; it reads the
    // current value of the text box at the moment it is called.
    var saveTags = function(){
        var tags_string = $("#change_tags")[0].value;
        tagsJx.changeAndSaveTags(post_path, tags_string);
    };

    // Hide the edit icon while the form is shown.
    var edit_tooltip = $("#tooltip-edit_tags");
    edit_tooltip.attr("style", "display:none");

    var tags_string = tags_list.join(", ");
    var form = $("<form>");
    var input = $("<input>");
    var tags_text = $("<text>");
    tags_text.html("Tags: ");
    var icon = iconsJx.createSaveTagsIcon();
    input.attr("type", "text");
    input.attr("name", "tags_list");
    input.attr("style", "width:75%; display: inline-block");
    input.attr("id", "change_tags");
    form.append(tags_text);
    // Replace the rendered tag list with a single space before attaching the form.
    tags.textContent = " ";
    form.append(input);
    form.append(icon);
    tags.appendChild(form[0]);
    // Pre-fill the text box with the current tags.
    $("#change_tags")[0].value = tags_string;

    // Enter key (code 13) inside the text box saves; returning false cancels the
    // browser's default handling of the key press.
    $("#change_tags").on("keypress", function(e){
        if (e.which == 13){
            saveTags();
            return false;
        }
    });
    $("#tooltip-save_tags").on("click", function(){
        saveTags();
    });
    // Bind only to the form created above so that other forms on the page keep
    // their normal submit behaviour; returning false cancels the native submit.
    form.on("submit", function(){
        saveTags();
        return false;
    });
});
});
</script>
</body>
</html>

Просмотреть файл

@ -0,0 +1,14 @@
{
"title": "Containers Testpilot Pipeline",
"authors": [
"Ryan Harter (:harter)"
],
"tags": [
"Spark",
"ATMO",
"ETL"
],
"publish_date": "2017-03-08",
"updated_at": "2017-03-08",
"tldr": "Populates containers_testpilottest"
}

Просмотреть файл

@ -0,0 +1,681 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
/* Loading spinner: a fixed-position box centered in the viewport by offsetting
   its top-left corner from the 50%/50% point by half of its own size. */
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half the width of the spinner gif */
margin-top: -50px; /* half the height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif, plus 2px to fix an IE8 issue */
}
/* Font size for elements with the "table" class. */
.table {
font-size: 14px;
}
/* Cap the width of modal dialog content. */
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>Experiment Job</h1>
<span class="authors"><a href="/feed?authors=Frank+Bertsch">Frank Bertsch</a></span>
<span class="date_created">February 01, 2017</span>
<span class="date_updated">(Last Updated: February 09, 2017)</span>
<span class="tldr"><p>We take all the pings from yesterday and extract the information about any experiments: those that started, those still running, and those that ended. The results are aggregated by channel and written to files in S3.</p></span>
</div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span> <span class="k">as</span> <span class="n">dt</span><span class="p">,</span> <span class="n">timedelta</span><span class="p">,</span> <span class="n">date</span>
<span class="kn">import</span> <span class="nn">moztelemetry</span>
<span class="kn">from</span> <span class="nn">os</span> <span class="kn">import</span> <span class="n">environ</span>
<span class="c1"># get the desired target date from the environment, or run</span>
<span class="c1"># on 'yesterday' by default.</span>
<span class="n">yesterday</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">dt</span><span class="o">.</span><span class="n">utcnow</span><span class="p">()</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">),</span> <span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">target_date</span> <span class="o">=</span> <span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'date'</span><span class="p">,</span> <span class="n">yesterday</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="n">sample_rate</span> <span class="o">=</span> <span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'sample'</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
<span class="n">pings</span> <span class="o">=</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">"telemetry-experiments"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">submissionDate</span><span class="o">=</span><span class="n">target_date</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="s2">"main"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sample</span><span class="o">=</span><span class="n">sample_rate</span><span class="p">)</span> \
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"environment"</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"build"</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"applicationName"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"Firefox"</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings_properties</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="p">{</span>
<span class="s2">"appUpdateChannel"</span><span class="p">:</span> <span class="s2">"meta/appUpdateChannel"</span><span class="p">,</span>
<span class="s2">"log"</span><span class="p">:</span> <span class="s2">"payload/log"</span><span class="p">,</span>
<span class="s2">"activeExperiment"</span><span class="p">:</span> <span class="s2">"environment/addons/activeExperiment/id"</span><span class="p">,</span>
<span class="s2">"activeExperimentBranch"</span><span class="p">:</span> <span class="s2">"environment/addons/activeExperiment/branch"</span>
<span class="p">})</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">collections</span> <span class="kn">import</span> <span class="n">defaultdict</span>
<span class="kn">from</span> <span class="nn">copy</span> <span class="kn">import</span> <span class="n">deepcopy</span>
<span class="c1">### Setup data structures and constants ###</span>
<span class="n">ALLOWED_ENTRY_TYPES</span> <span class="o">=</span> <span class="p">(</span><span class="s1">'EXPERIMENT_ACTIVATION'</span><span class="p">,</span> <span class="s1">'EXPERIMENT_TERMINATION'</span><span class="p">)</span>
<span class="n">experiment</span> <span class="o">=</span> <span class="p">{</span>
<span class="s1">'EXPERIMENT_ACTIVATION'</span><span class="p">:</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">int</span><span class="p">),</span>
<span class="s1">'active'</span><span class="p">:</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">int</span><span class="p">),</span>
<span class="s1">'EXPERIMENT_TERMINATION'</span><span class="p">:</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">int</span><span class="p">)</span>
<span class="p">}</span>
<span class="n">channel</span> <span class="o">=</span> <span class="p">{</span>
<span class="s1">'errors'</span><span class="p">:</span> <span class="p">[],</span>
<span class="s1">'experiments'</span><span class="p">:</span> <span class="p">{}</span>
<span class="p">}</span>
<span class="k">def</span> <span class="nf">get_empty_channel</span><span class="p">():</span>
<span class="k">return</span> <span class="n">deepcopy</span><span class="p">(</span><span class="n">channel</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">gzip</span>
<span class="kn">import</span> <span class="nn">ujson</span>
<span class="kn">import</span> <span class="nn">requests</span>
<span class="c1"># This is a json object with {Date =&gt; {channel: count}}. It is created</span>
<span class="c1"># by the main_channel_counts plugin, and may be inaccurate if the ec2</span>
<span class="c1"># box crashed, but only for the day of the crash. If it crashes, the</span>
<span class="c1"># previous data will be lost.</span>
<span class="n">COUNTS_JSON_URI</span> <span class="o">=</span> <span class="s2">"https://pipeline-cep.prod.mozaws.net/dashboard_output/analysis.frank.main_channel_counts.counts.json"</span>
<span class="c1">### Aggregation functions, Spark job, output file creation ###</span>
<span class="k">def</span> <span class="nf">channel_ping_agg</span><span class="p">(</span><span class="n">channel_agg</span><span class="p">,</span> <span class="n">ping</span><span class="p">):</span>
<span class="sd">"""Aggregate a channel with a ping"""</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="p">(</span><span class="n">ping</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"log"</span><span class="p">)</span> <span class="ow">or</span> <span class="p">[]):</span>
<span class="k">if</span> <span class="n">item</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="ow">in</span> <span class="n">ALLOWED_ENTRY_TYPES</span><span class="p">:</span>
<span class="n">entry</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">reason</span><span class="p">,</span> <span class="n">exp_id</span> <span class="o">=</span> <span class="n">item</span><span class="p">[:</span><span class="mi">4</span><span class="p">]</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">item</span><span class="p">[</span><span class="mi">4</span><span class="p">:]</span>
<span class="k">if</span> <span class="n">exp_id</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">channel_agg</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">]:</span>
<span class="n">channel_agg</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">][</span><span class="n">exp_id</span><span class="p">]</span> <span class="o">=</span> <span class="n">deepcopy</span><span class="p">(</span><span class="n">experiment</span><span class="p">)</span>
<span class="n">channel_agg</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="n">entry</span><span class="p">][</span><span class="nb">tuple</span><span class="p">([</span><span class="n">reason</span><span class="p">]</span> <span class="o">+</span> <span class="n">data</span><span class="p">)]</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="n">exp_id</span> <span class="o">=</span> <span class="n">ping</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"activeExperiment"</span><span class="p">)</span>
<span class="n">branch</span> <span class="o">=</span> <span class="n">ping</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"activeExperimentBranch"</span><span class="p">)</span>
<span class="k">if</span> <span class="n">exp_id</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span> <span class="ow">and</span> <span class="n">branch</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">if</span> <span class="n">exp_id</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">channel_agg</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">]:</span>
<span class="n">channel_agg</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">][</span><span class="n">exp_id</span><span class="p">]</span> <span class="o">=</span> <span class="n">deepcopy</span><span class="p">(</span><span class="n">experiment</span><span class="p">)</span>
<span class="n">channel_agg</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="s1">'active'</span><span class="p">][</span><span class="n">branch</span><span class="p">]</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="n">channel_agg</span><span class="p">[</span><span class="s1">'errors'</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'{}: {}'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">e</span><span class="o">.</span><span class="vm">__class__</span><span class="p">,</span> <span class="nb">str</span><span class="p">(</span><span class="n">e</span><span class="p">)))</span>
<span class="k">return</span> <span class="n">channel_agg</span>
<span class="k">def</span> <span class="nf">channel_channel_agg</span><span class="p">(</span><span class="n">channel_agg_1</span><span class="p">,</span> <span class="n">channel_agg_2</span><span class="p">):</span>
<span class="sd">"""Aggregate a channel with a channel"""</span>
<span class="n">channel_agg_1</span><span class="p">[</span><span class="s1">'errors'</span><span class="p">]</span> <span class="o">+=</span> <span class="n">channel_agg_2</span><span class="p">[</span><span class="s1">'errors'</span><span class="p">]</span>
<span class="k">for</span> <span class="n">exp_id</span><span class="p">,</span> <span class="n">exp</span> <span class="ow">in</span> <span class="n">channel_agg_2</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">]</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
<span class="k">if</span> <span class="n">exp_id</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">channel_agg_1</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">]:</span>
<span class="n">channel_agg_1</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">][</span><span class="n">exp_id</span><span class="p">]</span> <span class="o">=</span> <span class="n">deepcopy</span><span class="p">(</span><span class="n">experiment</span><span class="p">)</span>
<span class="k">for</span> <span class="n">entry</span><span class="p">,</span> <span class="n">exp_activities</span> <span class="ow">in</span> <span class="n">exp</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
<span class="k">for</span> <span class="n">exp_activity</span><span class="p">,</span> <span class="n">counts</span> <span class="ow">in</span> <span class="n">exp_activities</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
<span class="n">channel_agg_1</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="n">entry</span><span class="p">][</span><span class="n">exp_activity</span><span class="p">]</span> <span class="o">+=</span> <span class="n">counts</span>
<span class="k">return</span> <span class="n">channel_agg_1</span>
<span class="k">def</span> <span class="nf">get_channel_or_other</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="n">channel</span> <span class="o">=</span> <span class="n">ping</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"appUpdateChannel"</span><span class="p">)</span>
<span class="k">if</span> <span class="n">channel</span> <span class="ow">in</span> <span class="p">(</span><span class="s2">"release"</span><span class="p">,</span> <span class="s2">"nightly"</span><span class="p">,</span> <span class="s2">"beta"</span><span class="p">,</span> <span class="s2">"aurora"</span><span class="p">):</span>
<span class="k">return</span> <span class="n">channel</span>
<span class="k">return</span> <span class="s2">"OTHER"</span>
<span class="k">def</span> <span class="nf">aggregate_pings</span><span class="p">(</span><span class="n">pings</span><span class="p">):</span>
<span class="sd">"""Get the channel experiments from an rdd of pings"""</span>
<span class="k">return</span> <span class="n">pings</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">(</span><span class="n">get_channel_or_other</span><span class="p">(</span><span class="n">x</span><span class="p">),</span> <span class="n">x</span><span class="p">))</span>\
<span class="o">.</span><span class="n">aggregateByKey</span><span class="p">(</span><span class="n">get_empty_channel</span><span class="p">(),</span> <span class="n">channel_ping_agg</span><span class="p">,</span> <span class="n">channel_channel_agg</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">add_counts</span><span class="p">(</span><span class="n">result</span><span class="p">):</span>
<span class="sd">"""Add counts from a running CEP"""</span>
<span class="n">counts</span> <span class="o">=</span> <span class="n">requests</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">COUNTS_JSON_URI</span><span class="p">)</span><span class="o">.</span><span class="n">json</span><span class="p">()</span>
<span class="k">for</span> <span class="n">cname</span><span class="p">,</span> <span class="n">channel</span> <span class="ow">in</span> <span class="n">result</span><span class="p">:</span>
<span class="n">channel</span><span class="p">[</span><span class="s1">'total'</span><span class="p">]</span> <span class="o">=</span> <span class="n">counts</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">target_date</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">cname</span><span class="p">,</span> <span class="bp">None</span><span class="p">)</span>
<span class="k">return</span> <span class="n">result</span>
<span class="k">def</span> <span class="nf">write_aggregate</span><span class="p">(</span><span class="n">agg</span><span class="p">,</span> <span class="n">date</span><span class="p">,</span> <span class="n">filename_prefix</span><span class="o">=</span><span class="s1">'experiments'</span><span class="p">):</span>
<span class="n">filenames</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">cname</span><span class="p">,</span> <span class="n">channel</span> <span class="ow">in</span> <span class="n">agg</span><span class="p">:</span>
<span class="n">d</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">"total"</span><span class="p">:</span> <span class="n">channel</span><span class="p">[</span><span class="s1">'total'</span><span class="p">],</span>
<span class="s2">"experiments"</span><span class="p">:</span> <span class="p">{}</span>
<span class="p">}</span>
<span class="k">for</span> <span class="n">exp_id</span><span class="p">,</span> <span class="n">experiment</span> <span class="ow">in</span> <span class="n">channel</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">]</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
<span class="n">d</span><span class="p">[</span><span class="s2">"experiments"</span><span class="p">][</span><span class="n">exp_id</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">"active"</span><span class="p">:</span> <span class="n">experiment</span><span class="p">[</span><span class="s1">'active'</span><span class="p">],</span>
<span class="s2">"activations"</span><span class="p">:</span> <span class="n">experiment</span><span class="p">[</span><span class="s1">'EXPERIMENT_ACTIVATION'</span><span class="p">]</span><span class="o">.</span><span class="n">items</span><span class="p">(),</span>
<span class="s2">"terminations"</span><span class="p">:</span> <span class="n">experiment</span><span class="p">[</span><span class="s1">'EXPERIMENT_TERMINATION'</span><span class="p">]</span><span class="o">.</span><span class="n">items</span><span class="p">()</span>
<span class="p">}</span>
<span class="n">filename</span> <span class="o">=</span> <span class="s2">"{}{}-{}.json.gz"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">filename_prefix</span><span class="p">,</span> <span class="n">date</span><span class="p">,</span> <span class="n">cname</span><span class="p">)</span>
<span class="n">filenames</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">filename</span><span class="p">)</span>
<span class="k">with</span> <span class="n">gzip</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span> <span class="s2">"wb"</span><span class="p">)</span> <span class="k">as</span> <span class="n">fd</span><span class="p">:</span>
<span class="n">ujson</span><span class="o">.</span><span class="n">dump</span><span class="p">(</span><span class="n">d</span><span class="p">,</span> <span class="n">fd</span><span class="p">)</span>
<span class="k">return</span> <span class="n">filenames</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1">### Setup Test Pings ###</span>
<span class="k">def</span> <span class="nf">make_ping</span><span class="p">(</span><span class="n">ae</span><span class="p">,</span> <span class="n">aeb</span><span class="p">,</span> <span class="n">chan</span><span class="p">,</span> <span class="n">log</span><span class="p">):</span>
<span class="k">return</span> <span class="p">{</span><span class="s1">'activeExperiment'</span><span class="p">:</span> <span class="n">ae</span><span class="p">,</span>
<span class="s1">'activeExperimentBranch'</span><span class="p">:</span> <span class="n">aeb</span><span class="p">,</span>
<span class="s1">'appUpdateChannel'</span><span class="p">:</span> <span class="n">chan</span><span class="p">,</span>
<span class="s1">'log'</span><span class="p">:</span> <span class="n">log</span><span class="p">}</span>
<span class="n">NUM_ACTIVATIONS</span> <span class="o">=</span> <span class="mi">5</span>
<span class="n">NUM_ACTIVES</span> <span class="o">=</span> <span class="mi">7</span>
<span class="n">NUM_TERMINATIONS</span> <span class="o">=</span> <span class="mi">3</span>
<span class="n">TOTAL</span> <span class="o">=</span> <span class="n">NUM_ACTIVATIONS</span> <span class="o">+</span> <span class="n">NUM_ACTIVES</span> <span class="o">+</span> <span class="n">NUM_TERMINATIONS</span>
<span class="n">_channel</span><span class="p">,</span> <span class="n">exp_id</span><span class="p">,</span> <span class="n">the_date</span> <span class="o">=</span> <span class="s1">'release'</span><span class="p">,</span> <span class="s1">'tls13-compat-ff51@experiments.mozilla.org'</span><span class="p">,</span> <span class="s1">'20140101'</span>
<span class="n">branch</span><span class="p">,</span> <span class="n">reason</span><span class="p">,</span> <span class="n">data</span> <span class="o">=</span> <span class="s1">'branch'</span><span class="p">,</span> <span class="s1">'REJECTED'</span><span class="p">,</span> <span class="p">[</span><span class="s1">'minBuildId'</span><span class="p">]</span>
<span class="n">log</span> <span class="o">=</span> <span class="p">[</span><span class="mi">17786</span><span class="p">,</span> <span class="n">reason</span><span class="p">,</span> <span class="n">exp_id</span><span class="p">]</span> <span class="o">+</span> <span class="n">data</span>
<span class="n">pings</span> <span class="o">=</span> <span class="p">[</span><span class="n">make_ping</span><span class="p">(</span><span class="n">exp_id</span><span class="p">,</span> <span class="n">branch</span><span class="p">,</span> <span class="n">_channel</span><span class="p">,</span> <span class="p">[])</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">xrange</span><span class="p">(</span><span class="n">NUM_ACTIVES</span><span class="p">)]</span> <span class="o">+</span>\
<span class="p">[</span><span class="n">make_ping</span><span class="p">(</span><span class="n">exp_id</span><span class="p">,</span> <span class="n">branch</span><span class="p">,</span> <span class="n">_channel</span><span class="p">,</span> <span class="p">[[</span><span class="s1">'EXPERIMENT_ACTIVATION'</span><span class="p">]</span> <span class="o">+</span> <span class="n">log</span><span class="p">])</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">xrange</span><span class="p">(</span><span class="n">NUM_ACTIVATIONS</span><span class="p">)]</span> <span class="o">+</span>\
<span class="p">[</span><span class="n">make_ping</span><span class="p">(</span><span class="n">exp_id</span><span class="p">,</span> <span class="n">branch</span><span class="p">,</span> <span class="n">_channel</span><span class="p">,</span> <span class="p">[[</span><span class="s1">'EXPERIMENT_TERMINATION'</span><span class="p">]</span> <span class="o">+</span> <span class="n">log</span><span class="p">])</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">xrange</span><span class="p">(</span><span class="n">NUM_TERMINATIONS</span><span class="p">)]</span>
<span class="c1">### Setup expected result aggregate ###</span>
<span class="k">def</span> <span class="nf">channels_agg_assert</span><span class="p">(</span><span class="n">channels</span><span class="p">,</span> <span class="n">counts</span><span class="o">=</span><span class="mi">1</span><span class="p">):</span>
<span class="c1">#Should just be the channel we provided</span>
<span class="k">assert</span> <span class="n">channels</span><span class="o">.</span><span class="n">viewkeys</span><span class="p">()</span> <span class="o">==</span> <span class="nb">set</span><span class="p">([</span><span class="n">_channel</span><span class="p">]),</span> <span class="s1">'Incorrect channels: '</span> <span class="o">+</span> <span class="s1">','</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">channels</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="c1">#just check this one channel now</span>
<span class="n">release</span> <span class="o">=</span> <span class="n">channels</span><span class="p">[</span><span class="n">_channel</span><span class="p">]</span>
<span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">release</span><span class="p">[</span><span class="s1">'errors'</span><span class="p">])</span> <span class="o">==</span> <span class="mi">0</span><span class="p">,</span> <span class="s1">'Had Errors: '</span> <span class="o">+</span> <span class="s1">','</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">release</span><span class="p">[</span><span class="s1">'errors'</span><span class="p">])</span>
<span class="c1">#now check experiment totals</span>
<span class="k">assert</span> <span class="n">release</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="s1">'EXPERIMENT_ACTIVATION'</span><span class="p">][</span><span class="nb">tuple</span><span class="p">([</span><span class="n">reason</span><span class="p">]</span> <span class="o">+</span> <span class="n">data</span><span class="p">)]</span> <span class="o">==</span> <span class="n">NUM_ACTIVATIONS</span> <span class="o">*</span> <span class="n">counts</span><span class="p">,</span>\
<span class="s1">'Expected '</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">NUM_ACTIVATIONS</span> <span class="o">*</span> <span class="n">counts</span><span class="p">)</span> <span class="o">+</span> \
<span class="s1">', Got '</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">release</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="s1">'EXPERIMENT_ACTIVATION'</span><span class="p">][</span><span class="nb">tuple</span><span class="p">([</span><span class="n">reason</span><span class="p">]</span> <span class="o">+</span> <span class="n">data</span><span class="p">)])</span>
<span class="k">assert</span> <span class="n">release</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="s1">'EXPERIMENT_TERMINATION'</span><span class="p">][</span><span class="nb">tuple</span><span class="p">([</span><span class="n">reason</span><span class="p">]</span> <span class="o">+</span> <span class="n">data</span><span class="p">)]</span> <span class="o">==</span> <span class="n">NUM_TERMINATIONS</span> <span class="o">*</span> <span class="n">counts</span><span class="p">,</span>\
<span class="s1">'Expected '</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">NUM_TERMINATIONS</span> <span class="o">*</span> <span class="n">counts</span><span class="p">)</span> <span class="o">+</span> \
<span class="s1">', Got '</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">release</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="s1">'EXPERIMENT_TERMINATION'</span><span class="p">][</span><span class="nb">tuple</span><span class="p">([</span><span class="n">reason</span><span class="p">]</span> <span class="o">+</span> <span class="n">data</span><span class="p">)])</span>
<span class="c1">#`active` is counted for both just active, and for activations and terminations above</span>
<span class="k">assert</span> <span class="n">release</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="s1">'active'</span><span class="p">][</span><span class="n">branch</span><span class="p">]</span> <span class="o">==</span> <span class="n">TOTAL</span> <span class="o">*</span> <span class="n">counts</span><span class="p">,</span>\
<span class="s1">'Expected '</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">TOTAL</span> <span class="o">*</span> <span class="n">counts</span><span class="p">)</span> <span class="o">+</span>\
<span class="s1">'Got '</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">release</span><span class="p">[</span><span class="s1">'experiments'</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="s1">'active'</span><span class="p">][</span><span class="n">branch</span><span class="p">])</span>
<span class="c1">### Test non-spark - easier debugging ###</span>
<span class="n">channel_1</span><span class="p">,</span> <span class="n">channel_2</span> <span class="o">=</span> <span class="n">get_empty_channel</span><span class="p">(),</span> <span class="n">get_empty_channel</span><span class="p">()</span>
<span class="k">for</span> <span class="n">ping</span> <span class="ow">in</span> <span class="n">pings</span><span class="p">:</span>
<span class="n">channel_1</span> <span class="o">=</span> <span class="n">channel_ping_agg</span><span class="p">(</span><span class="n">channel_1</span><span class="p">,</span> <span class="n">ping</span><span class="p">)</span>
<span class="n">channel_2</span> <span class="o">=</span> <span class="n">channel_ping_agg</span><span class="p">(</span><span class="n">channel_2</span><span class="p">,</span> <span class="n">ping</span><span class="p">)</span>
<span class="c1"># no actual key-value reduce, so just have to add the channel as key</span>
<span class="n">res_chan</span> <span class="o">=</span> <span class="p">((</span><span class="n">_channel</span><span class="p">,</span> <span class="n">channel_channel_agg</span><span class="p">(</span><span class="n">channel_1</span><span class="p">,</span> <span class="n">channel_2</span><span class="p">)),)</span>
<span class="n">res_chan</span> <span class="o">=</span> <span class="n">add_counts</span><span class="p">(</span><span class="n">res_chan</span><span class="p">)</span>
<span class="c1"># we've aggregated over the pings twice, so counts=2</span>
<span class="n">channels_agg_assert</span><span class="p">({</span><span class="n">channel</span><span class="p">:</span> <span class="n">agg</span> <span class="k">for</span> <span class="n">channel</span><span class="p">,</span> <span class="n">agg</span> <span class="ow">in</span> <span class="n">res_chan</span><span class="p">},</span> <span class="n">counts</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="n">write_aggregate</span><span class="p">(</span><span class="n">res_chan</span><span class="p">,</span> <span class="n">the_date</span><span class="p">,</span> <span class="n">filename_prefix</span><span class="o">=</span><span class="s2">"nonspark_test"</span><span class="p">)</span>
<span class="c1">#### Test Spark ###</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">aggregate_pings</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">pings</span><span class="p">))</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">add_counts</span><span class="p">(</span><span class="n">res</span><span class="p">)</span>
<span class="n">channels</span> <span class="o">=</span> <span class="p">{</span><span class="n">channel</span><span class="p">:</span> <span class="n">agg</span> <span class="k">for</span> <span class="n">channel</span><span class="p">,</span> <span class="n">agg</span> <span class="ow">in</span> <span class="n">res</span><span class="p">}</span>
<span class="n">channels_agg_assert</span><span class="p">(</span><span class="n">channels</span><span class="p">,</span> <span class="n">counts</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">write_aggregate</span><span class="p">(</span><span class="n">res</span><span class="p">,</span> <span class="n">the_date</span><span class="p">,</span> <span class="n">filename_prefix</span><span class="o">=</span><span class="s2">"spark_test"</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>['spark_test20140101-release.json.gz']
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1">### Run on actual data - use CEP to get counts ###</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">aggregate_pings</span><span class="p">(</span><span class="n">subset</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">add_counts</span><span class="p">(</span><span class="n">result</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1">### Upload target day's data files ###</span>
<span class="kn">import</span> <span class="nn">boto3</span>
<span class="kn">import</span> <span class="nn">botocore</span>
<span class="kn">from</span> <span class="nn">boto3.s3.transfer</span> <span class="kn">import</span> <span class="n">S3Transfer</span>
<span class="n">output_files</span> <span class="o">=</span> <span class="n">write_aggregate</span><span class="p">(</span><span class="n">result</span><span class="p">,</span> <span class="n">target_date</span><span class="p">)</span>
<span class="n">data_bucket</span> <span class="o">=</span> <span class="s2">"telemetry-public-analysis-2"</span>
<span class="n">s3path</span> <span class="o">=</span> <span class="s2">"experiments/data"</span>
<span class="n">gz_csv_args</span> <span class="o">=</span> <span class="p">{</span><span class="s1">'ContentEncoding'</span><span class="p">:</span> <span class="s1">'gzip'</span><span class="p">,</span> <span class="s1">'ContentType'</span><span class="p">:</span> <span class="s1">'text/csv'</span><span class="p">}</span>
<span class="n">client</span> <span class="o">=</span> <span class="n">boto3</span><span class="o">.</span><span class="n">client</span><span class="p">(</span><span class="s1">'s3'</span><span class="p">,</span> <span class="s1">'us-west-2'</span><span class="p">)</span>
<span class="n">transfer</span> <span class="o">=</span> <span class="n">S3Transfer</span><span class="p">(</span><span class="n">client</span><span class="p">)</span>
<span class="k">for</span> <span class="n">output_file</span> <span class="ow">in</span> <span class="n">output_files</span><span class="p">:</span>
<span class="n">transfer</span><span class="o">.</span><span class="n">upload_file</span><span class="p">(</span>
<span class="n">output_file</span><span class="p">,</span>
<span class="n">data_bucket</span><span class="p">,</span>
<span class="s2">"{}/{}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">s3path</span><span class="p">,</span> <span class="n">output_file</span><span class="p">),</span>
<span class="n">extra_args</span><span class="o">=</span><span class="n">gz_csv_args</span>
<span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
</div></body></html>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
// Navbar search box: typeahead suggestions while typing, full search on Enter.
// Put the caret at the end of any pre-filled query; skipped when the page
// has no search box so the rest of this script still runs.
if ($('#searchbar').length) {
  $('#searchbar')[0].setSelectionRange(1000, 1000);
}
$('#searchbar').typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: 'knowledge_posts',
  limit: 10,
  // Text placed in the input when a suggestion is chosen.
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
      'Unable to find any posts that match the current query' +
      '</div>'
    ),
    suggestion: function (data) {
      // Title and author come back from the server as plain text; escape them
      // before splicing into markup so characters such as "<" or "&" are shown
      // literally instead of being parsed as HTML.
      var escapeHtml = function (value) {
        return $('<div>').text(String(value)).html();
      };
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + escapeHtml(data.title) + '</strong> – ' + escapeHtml(data.author) + '</p>';
    }
  },
  source: function (q, sync, asyncResults) {
    // Encode the query so characters like "&", "#" and "+" reach the server intact.
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q), {
      success: function (data, status) { asyncResults(JSON.parse(data)); }
    });
  }
});
// Selecting a suggestion opens that post.
$('#searchbar').on('typeahead:select', function (obj, datum, name) {
  window.location = '/post/' + encodeURIComponent(datum.path);
});
// Pressing Enter runs a full search on the feed page.
$('#searchbar').keypress(function (event) {
  var keycode = (event.keyCode ? event.keyCode : event.which);
  if (keycode === 13) {
    window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
  }
});
// Size the suggestion menu from the search box width plus the menu's own outer width.
var padding = $('.tt-menu').outerWidth();
$('.tt-menu').width($('#searchbar').width() + padding + "px");
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Wire up tooltips and the post toolbar buttons once the DOM is ready.
// $(fn) replaces $("document").ready(fn): the string form runs a selector for a
// non-existent <document> element and only works because .ready() ignores its selection.
$(function () {
  var is_webeditor = false;
  var post_id = "15";
  // NOTE(review): "None" looks like a server-side null rendered into the page — confirm.
  var id = "None";
  var post_path = "etl/experiments.kp";
  var data_repo_github_root = "";
  tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);
  // Toolbar buttons switch between the rendered post, its raw markdown and the web editor.
  $(".btn-rendered").on("click", function () {
    document.location.href = "/post/" + encodeURI(post_path);
  });
  $(".btn-raw").on("click", function () {
    document.location.href = "/post/" + encodeURI(post_path) + "?render=raw";
  });
  $(".btn-webeditor").on("click", function () {
    document.location.href = "/edit/" + encodeURI(post_path);
  });
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration: enables the tex2jax extension with TeX input and HTML-CSS output.
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math is delimited by $...$ or \(...\)
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Display math is delimited by $$...$$ or \[...\]
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
processEscapes: true
},
// Restrict the HTML-CSS output to the TeX font set.
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in 2017; load a pinned MathJax 2.x build from cdnjs instead -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js" type="text/javascript">
</script>
<script>
// Post-page initialization: active toolbar button, header links, comment
// buttons and tag-subscription clicks.
// Registered with $(fn) because binding the "ready" event through
// .on('ready', ...) was removed in jQuery 3.0, where such a handler never fires.
$(function () {
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");
  // Initialize headers
  helpersJx.linkifyHeaders();
  // Initialize comments
  var post_path = 'etl/experiments.kp';
  $("#post_comment_btn").on('click', function () {
    // Locals rather than implicit globals; the values are only passed to postComment.
    var comment_author = 'knowledge_default';
    var post_author = 'Frank Bertsch';
    var post_title = 'Experiment Job';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    location.reload();
  });
  // Every element whose id starts with "delete_comment" deletes the comment
  // whose id follows the "__" separator in the element id.
  var all_comment_delete_buttons = $("[id^=delete_comment]");
  $.each(all_comment_delete_buttons, function (i, v) {
    $(v).on("click", function () {
      var id = v.id;
      var comment_id = id.split("__")[1];
      if (comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });
  // Delegated on <body> because the subscribe buttons live inside popover
  // markup that is supplied through data-content attributes.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener($(this)[0]);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")];
$.each(all_headers, function (index, value) {
  $.each(value, function (i, v) {
    var inner_html = v.innerHTML;
    // Build the anchor from the header markup: drop everything except letters,
    // hyphens and spaces, lower-case it, and turn spaces into hyphens.
    var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "");
    var inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-");
    v.innerHTML = "<a href='" + inner_link + "' class='link-reset'>" + inner_html + "</a>";
  });
});
//turn all the tags into links, similar to what's done on the feed page
// `tags` and `tags_list` stay top-level variables: the edit-tags handler
// later in this script reads and reassigns them.
var tags = $("#renderedMarkdown .metadata .tags")[0];
var tags_list = ['experiment', 'firefox'];
var subscriptions_list = [];
$.each(tags_list, function (i, tag) {
  // Per-tag values are declared locally so they do not leak into the global scope.
  var ahref = document.createElement("a");
  var e_tag = encodeURIComponent(tag);
  // NOTE(review): a string pattern replaces only the first "/"; confirm whether
  // tags with several slashes should have all of them rewritten.
  var f_tag = tag.replace("/", "__");
  var tag_name = "#" + tag;
  var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag;
  ahref.setAttribute("data-container", "body");
  ahref.setAttribute("data-toggle", "popover");
  ahref.setAttribute("data-placement", "bottom");
  ahref.setAttribute("data-html", "true");
  ahref.setAttribute("data-tag-name", f_tag);
  // The popover body holds an Unsubscribe or Subscribe button depending on
  // whether the tag is in subscriptions_list.
  if (subscriptions_list.indexOf(tag) >= 0) {
    ahref.setAttribute("class", "label label-subscribed pop");
    ahref.setAttribute("data-content", "<div class='content'>" +
      " <button class='btn btn-small btn-primary btn-unsubscribe'" +
      " title='' " +
      " id='" + tag_subscription_button_id_name + "'> " +
      " <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
      " </button> " +
      " </div>");
  } else {
    ahref.setAttribute("class", "label label-unsubscribed pop");
    ahref.setAttribute("data-content", "<div class='content'>" +
      " <button class='btn btn-small btn-default btn-subscribe'" +
      " title='' " +
      " id='" + tag_subscription_button_id_name + "'> " +
      " <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
      " </button> " +
      " </div>");
  }
  ahref.setAttribute("href", "/tag_pages?tag=" + e_tag);
  ahref.setAttribute("style", "font-weight:normal");
  // The first tag is preceded by the "Tags: " caption.
  if (i === 0) {
    ahref.innerHTML = " ";
    var colon = document.createElement("text");
    colon.innerHTML = "<b>Tags</b>: ";
    tags.appendChild(colon);
  }
  ahref.innerHTML = ahref.innerHTML + tag_name;
  tags.appendChild(ahref);
  // Separate tags with ", " except after the last one.
  if (i !== tags_list.length - 1) {
    var comma = document.createElement("text");
    comma.innerText = ", ";
    tags.appendChild(comma);
  }
});
// Drop the node that follows the tags element, if there is one; without the
// guard a missing sibling would throw and abort the rest of this script.
if (tags.nextSibling) {
  tags.nextSibling.remove();
}
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>";
// Hover-driven popovers for every ".pop" element. Popovers are created with a
// manual trigger and shown/hidden from mouse events so the pointer can move
// from the trigger into the popover without it closing.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on("mouseenter", function () {
    var trigger = $(this);
    trigger.popover("show");
    // Leaving the popover itself closes it.
    $(".popover").on("mouseleave", function () {
      trigger.popover("hide");
    });
  })
  .on("mouseleave", function () {
    var trigger = $(this);
    // Wait 300 ms, then hide unless the pointer is now over a popover.
    setTimeout(function () {
      if ($(".popover:hover").length === 0) {
        trigger.popover("hide");
      }
    }, 300);
  });
// Click handler for the edit-tags icon: hides the icon and replaces the tag
// list with an inline form (text input pre-filled with the current tags plus
// a save icon).
// NOTE(review): this handler contains two implementations of the edit form.
// The first builds the form with raw DOM calls and saves with a synchronous
// POST; the second, registered further down on the same icon from inside this
// handler, builds it with jQuery and saves through tagsJx. This looks like an
// older and a newer version merged together — confirm which one is intended.
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// NOTE(review): the names assigned below without `var` become implicit
// globals; previousSibling is assigned but not read again in this handler.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Clear the current contents of the tags element before inserting the form.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate the comma-separated tags, POST them, then reload the page.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
tags_list = tags_string.split(",")
// Accepts letters, digits, "-", "_", ":" and "/" (case-insensitive).
// NOTE(review): the alert below says "only alphanumeric", which is stricter than this pattern.
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The empty check runs before trimming, so a whitespace-only entry reaches
// the pattern test and is reported as containing special characters.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so the POST finishes before the reload below.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/experiments.kp',
async: false
});
location.reload()
}
})
// Removes whatever node currently follows the tags element.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Second implementation: registers another click handler on the edit icon.
// Because this registration sits inside the outer click handler, it is added
// each time the outer handler runs.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// NOTE(review): tags_text is a jQuery object here, so this sets a plain
// property on the wrapper rather than on the element; the .html() call above
// already set the caption.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter in the input saves the tags; returning false suppresses the default key handling.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/experiments.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves the tags.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/experiments.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Submitting any <form> on the page saves the tags and cancels the native submit.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/experiments.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,809 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="/static/modules/jquery/jquery.min.js"></script>
<script src="/static/modules/tether/js/tether.min.js"></script>
<script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="/static/modules/handlebars/js/handlebars.js"></script>
<script src="/static/js/helpers.js"></script>
<script src="/static/modules/select2/js/select2.min.js"></script>
<script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link rel="stylesheet" href="/static/modules/bootstrap/css/bootstrap.min.css">
<link rel="stylesheet" href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css">
<link rel="stylesheet" type='text/css' href="/static/css/custom.css">
<link rel="stylesheet" type='text/css' href="/static/modules/select2/css/select2.min.css">
<link href='https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700' rel='stylesheet' type='text/css'>
<link rel="shortcut icon" href="/static/images/favicon.png">
<link rel="stylesheet" href="/static/css/codehilite-friendly.css">
<style>
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<!-- Site navigation bar: logo, section links, "Write a Post!" button and the search box -->
<div class="navbar navbar-knowledge" role="navigation">
  <div class="container page-container">
    <a href="/" aria-selected="false" class="logo-image navbar-text" style='margin-top:16px; margin-left: -7px'>
      <!-- img is a void element (no closing tag); alt names the logo for assistive technology -->
      <img width="125" src='/static/images/logo-white.svg' alt="Knowledge">
    </a>
    <a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Home
    </a>
    <a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Favorites
    </a>
    <a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      About
    </a>
    <a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Stats
    </a>
    <a id="webposts_tab" href="/create"
       class="btn btn-primary navbar-text pull-right"
       style='border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699'>
      Write a Post!
    </a>
    <div class="pull-right">
      <div class="form-group">
        <!-- The placeholder is not an accessible name, so aria-label supplies one -->
        <input class="form-control" id="searchbar" placeholder="Search for Knowledge" style="text-align:right" aria-label="Search for Knowledge">
      </div>
    </div>
  </div>
</div>
<div class="container page-container">
<br>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
<div class="btn-group" role="group">
<button type="button" class="btn btn-default btn-rendered">
View Post
</button>
<button type="button" class="btn btn-default btn-raw">
View Raw Markdown
</button>
</div>
</div>
<div class="col-md-2">
</div>
<div class="col-md-4 text-right">
<i class="glyphicon glyphicon-eye-open" id="pageview_stats" style='color: #9CA299'></i>
<div id="pageview_stats" style="display: inline-block">
Viewed 1 times by 1 different users
</div>
<i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop" style="font-size:16pt" id="tooltip-like" data-placement="bottom"
data-trigger="#tooltip-like"
data-container="body"
data-toggle="popover"
data-content='<div>Like This Post</div>'></i>
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class='metadata'>
<h1>Experiment Job</h1>
<span class='authors'><a href='/feed?authors=Frank+Bertsch'>Frank Bertsch</a></span>
<span class='date_created'>February 01, 2017</span>
<span class='date_updated'>(Last Updated: February 09, 2017)</span>
<span class='tldr'><p>We take all the pings from yesterday, get the information about any experiments: those that started, those running, and those that ended. These are aggregated by channel and outputted to files in s3.</p></span>
<span class='tags'></span>
</div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span> <span class="k">as</span> <span class="n">dt</span><span class="p">,</span> <span class="n">timedelta</span><span class="p">,</span> <span class="n">date</span>
<span class="kn">import</span> <span class="nn">moztelemetry</span>
<span class="kn">from</span> <span class="nn">os</span> <span class="kn">import</span> <span class="n">environ</span>
<span class="c1"># get the desired target date from the environment, or run</span>
<span class="c1"># on &#39;yesterday&#39; by default.</span>
<span class="n">yesterday</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">dt</span><span class="o">.</span><span class="n">utcnow</span><span class="p">()</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">),</span> <span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">target_date</span> <span class="o">=</span> <span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;date&#39;</span><span class="p">,</span> <span class="n">yesterday</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="n">sample_rate</span> <span class="o">=</span> <span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;sample&#39;</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
<span class="n">pings</span> <span class="o">=</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">&quot;telemetry-experiments&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">submissionDate</span><span class="o">=</span><span class="n">target_date</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="s2">&quot;main&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sample</span><span class="o">=</span><span class="n">sample_rate</span><span class="p">)</span> \
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;environment&quot;</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;build&quot;</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;applicationName&quot;</span><span class="p">)</span> <span class="o">==</span> <span class="s2">&quot;Firefox&quot;</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings_properties</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="p">{</span>
<span class="s2">&quot;appUpdateChannel&quot;</span><span class="p">:</span> <span class="s2">&quot;meta/appUpdateChannel&quot;</span><span class="p">,</span>
<span class="s2">&quot;log&quot;</span><span class="p">:</span> <span class="s2">&quot;payload/log&quot;</span><span class="p">,</span>
<span class="s2">&quot;activeExperiment&quot;</span><span class="p">:</span> <span class="s2">&quot;environment/addons/activeExperiment/id&quot;</span><span class="p">,</span>
<span class="s2">&quot;activeExperimentBranch&quot;</span><span class="p">:</span> <span class="s2">&quot;environment/addons/activeExperiment/branch&quot;</span>
<span class="p">})</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">collections</span> <span class="kn">import</span> <span class="n">defaultdict</span>
<span class="kn">from</span> <span class="nn">copy</span> <span class="kn">import</span> <span class="n">deepcopy</span>
<span class="c1">### Setup data structures and constants ###</span>
<span class="n">ALLOWED_ENTRY_TYPES</span> <span class="o">=</span> <span class="p">(</span><span class="s1">&#39;EXPERIMENT_ACTIVATION&#39;</span><span class="p">,</span> <span class="s1">&#39;EXPERIMENT_TERMINATION&#39;</span><span class="p">)</span>
<span class="n">experiment</span> <span class="o">=</span> <span class="p">{</span>
<span class="s1">&#39;EXPERIMENT_ACTIVATION&#39;</span><span class="p">:</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">int</span><span class="p">),</span>
<span class="s1">&#39;active&#39;</span><span class="p">:</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">int</span><span class="p">),</span>
<span class="s1">&#39;EXPERIMENT_TERMINATION&#39;</span><span class="p">:</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">int</span><span class="p">)</span>
<span class="p">}</span>
<span class="n">channel</span> <span class="o">=</span> <span class="p">{</span>
<span class="s1">&#39;errors&#39;</span><span class="p">:</span> <span class="p">[],</span>
<span class="s1">&#39;experiments&#39;</span><span class="p">:</span> <span class="p">{}</span>
<span class="p">}</span>
<span class="k">def</span> <span class="nf">get_empty_channel</span><span class="p">():</span>
<span class="k">return</span> <span class="n">deepcopy</span><span class="p">(</span><span class="n">channel</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">gzip</span>
<span class="kn">import</span> <span class="nn">ujson</span>
<span class="kn">import</span> <span class="nn">requests</span>
<span class="c1"># This is a json object with {Date =&gt; {channel: count}}. It is created</span>
<span class="c1"># by the main_channel_counts plugin, and may be inaccurate if the ec2</span>
<span class="c1"># box crashed, but only for the day of the crash. If it crashes, the</span>
<span class="c1"># previous data will be lost.</span>
<span class="n">COUNTS_JSON_URI</span> <span class="o">=</span> <span class="s2">&quot;https://pipeline-cep.prod.mozaws.net/dashboard_output/analysis.frank.main_channel_counts.counts.json&quot;</span>
<span class="c1">### Aggregation functions, Spark job, output file creation ###</span>
<span class="k">def</span> <span class="nf">channel_ping_agg</span><span class="p">(</span><span class="n">channel_agg</span><span class="p">,</span> <span class="n">ping</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Aggregate a channel with a ping&quot;&quot;&quot;</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="p">(</span><span class="n">ping</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;log&quot;</span><span class="p">)</span> <span class="ow">or</span> <span class="p">[]):</span>
<span class="k">if</span> <span class="n">item</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="ow">in</span> <span class="n">ALLOWED_ENTRY_TYPES</span><span class="p">:</span>
<span class="n">entry</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">reason</span><span class="p">,</span> <span class="n">exp_id</span> <span class="o">=</span> <span class="n">item</span><span class="p">[:</span><span class="mi">4</span><span class="p">]</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">item</span><span class="p">[</span><span class="mi">4</span><span class="p">:]</span>
<span class="k">if</span> <span class="n">exp_id</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">channel_agg</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">]:</span>
<span class="n">channel_agg</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">][</span><span class="n">exp_id</span><span class="p">]</span> <span class="o">=</span> <span class="n">deepcopy</span><span class="p">(</span><span class="n">experiment</span><span class="p">)</span>
<span class="n">channel_agg</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="n">entry</span><span class="p">][</span><span class="nb">tuple</span><span class="p">([</span><span class="n">reason</span><span class="p">]</span> <span class="o">+</span> <span class="n">data</span><span class="p">)]</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="n">exp_id</span> <span class="o">=</span> <span class="n">ping</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;activeExperiment&quot;</span><span class="p">)</span>
<span class="n">branch</span> <span class="o">=</span> <span class="n">ping</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;activeExperimentBranch&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">exp_id</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span> <span class="ow">and</span> <span class="n">branch</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">if</span> <span class="n">exp_id</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">channel_agg</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">]:</span>
<span class="n">channel_agg</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">][</span><span class="n">exp_id</span><span class="p">]</span> <span class="o">=</span> <span class="n">deepcopy</span><span class="p">(</span><span class="n">experiment</span><span class="p">)</span>
<span class="n">channel_agg</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="s1">&#39;active&#39;</span><span class="p">][</span><span class="n">branch</span><span class="p">]</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="n">channel_agg</span><span class="p">[</span><span class="s1">&#39;errors&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">&#39;{}: {}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">e</span><span class="o">.</span><span class="vm">__class__</span><span class="p">,</span> <span class="nb">str</span><span class="p">(</span><span class="n">e</span><span class="p">)))</span>
<span class="k">return</span> <span class="n">channel_agg</span>
<span class="k">def</span> <span class="nf">channel_channel_agg</span><span class="p">(</span><span class="n">channel_agg_1</span><span class="p">,</span> <span class="n">channel_agg_2</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Aggregate a channel with a channel&quot;&quot;&quot;</span>
<span class="n">channel_agg_1</span><span class="p">[</span><span class="s1">&#39;errors&#39;</span><span class="p">]</span> <span class="o">+=</span> <span class="n">channel_agg_2</span><span class="p">[</span><span class="s1">&#39;errors&#39;</span><span class="p">]</span>
<span class="k">for</span> <span class="n">exp_id</span><span class="p">,</span> <span class="n">exp</span> <span class="ow">in</span> <span class="n">channel_agg_2</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
<span class="k">if</span> <span class="n">exp_id</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">channel_agg_1</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">]:</span>
<span class="n">channel_agg_1</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">][</span><span class="n">exp_id</span><span class="p">]</span> <span class="o">=</span> <span class="n">deepcopy</span><span class="p">(</span><span class="n">experiment</span><span class="p">)</span>
<span class="k">for</span> <span class="n">entry</span><span class="p">,</span> <span class="n">exp_activities</span> <span class="ow">in</span> <span class="n">exp</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
<span class="k">for</span> <span class="n">exp_activity</span><span class="p">,</span> <span class="n">counts</span> <span class="ow">in</span> <span class="n">exp_activities</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
<span class="n">channel_agg_1</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="n">entry</span><span class="p">][</span><span class="n">exp_activity</span><span class="p">]</span> <span class="o">+=</span> <span class="n">counts</span>
<span class="k">return</span> <span class="n">channel_agg_1</span>
<span class="k">def</span> <span class="nf">get_channel_or_other</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="n">channel</span> <span class="o">=</span> <span class="n">ping</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;appUpdateChannel&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">channel</span> <span class="ow">in</span> <span class="p">(</span><span class="s2">&quot;release&quot;</span><span class="p">,</span> <span class="s2">&quot;nightly&quot;</span><span class="p">,</span> <span class="s2">&quot;beta&quot;</span><span class="p">,</span> <span class="s2">&quot;aurora&quot;</span><span class="p">):</span>
<span class="k">return</span> <span class="n">channel</span>
<span class="k">return</span> <span class="s2">&quot;OTHER&quot;</span>
<span class="k">def</span> <span class="nf">aggregate_pings</span><span class="p">(</span><span class="n">pings</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Get the channel experiments from an rdd of pings&quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">pings</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">(</span><span class="n">get_channel_or_other</span><span class="p">(</span><span class="n">x</span><span class="p">),</span> <span class="n">x</span><span class="p">))</span>\
<span class="o">.</span><span class="n">aggregateByKey</span><span class="p">(</span><span class="n">get_empty_channel</span><span class="p">(),</span> <span class="n">channel_ping_agg</span><span class="p">,</span> <span class="n">channel_channel_agg</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">add_counts</span><span class="p">(</span><span class="n">result</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Add counts from a running CEP&quot;&quot;&quot;</span>
<span class="n">counts</span> <span class="o">=</span> <span class="n">requests</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">COUNTS_JSON_URI</span><span class="p">)</span><span class="o">.</span><span class="n">json</span><span class="p">()</span>
<span class="k">for</span> <span class="n">cname</span><span class="p">,</span> <span class="n">channel</span> <span class="ow">in</span> <span class="n">result</span><span class="p">:</span>
<span class="n">channel</span><span class="p">[</span><span class="s1">&#39;total&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">counts</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">target_date</span><span class="p">,</span> <span class="p">{})</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">cname</span><span class="p">,</span> <span class="bp">None</span><span class="p">)</span>
<span class="k">return</span> <span class="n">result</span>
<span class="k">def</span> <span class="nf">write_aggregate</span><span class="p">(</span><span class="n">agg</span><span class="p">,</span> <span class="n">date</span><span class="p">,</span> <span class="n">filename_prefix</span><span class="o">=</span><span class="s1">&#39;experiments&#39;</span><span class="p">):</span>
<span class="n">filenames</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">cname</span><span class="p">,</span> <span class="n">channel</span> <span class="ow">in</span> <span class="n">agg</span><span class="p">:</span>
<span class="n">d</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">&quot;total&quot;</span><span class="p">:</span> <span class="n">channel</span><span class="p">[</span><span class="s1">&#39;total&#39;</span><span class="p">],</span>
<span class="s2">&quot;experiments&quot;</span><span class="p">:</span> <span class="p">{}</span>
<span class="p">}</span>
<span class="k">for</span> <span class="n">exp_id</span><span class="p">,</span> <span class="n">experiment</span> <span class="ow">in</span> <span class="n">channel</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
<span class="n">d</span><span class="p">[</span><span class="s2">&quot;experiments&quot;</span><span class="p">][</span><span class="n">exp_id</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">&quot;active&quot;</span><span class="p">:</span> <span class="n">experiment</span><span class="p">[</span><span class="s1">&#39;active&#39;</span><span class="p">],</span>
<span class="s2">&quot;activations&quot;</span><span class="p">:</span> <span class="n">experiment</span><span class="p">[</span><span class="s1">&#39;EXPERIMENT_ACTIVATION&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">items</span><span class="p">(),</span>
<span class="s2">&quot;terminations&quot;</span><span class="p">:</span> <span class="n">experiment</span><span class="p">[</span><span class="s1">&#39;EXPERIMENT_TERMINATION&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">items</span><span class="p">()</span>
<span class="p">}</span>
<span class="n">filename</span> <span class="o">=</span> <span class="s2">&quot;{}{}-{}.json.gz&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">filename_prefix</span><span class="p">,</span> <span class="n">date</span><span class="p">,</span> <span class="n">cname</span><span class="p">)</span>
<span class="n">filenames</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">filename</span><span class="p">)</span>
<span class="k">with</span> <span class="n">gzip</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span> <span class="s2">&quot;wb&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">fd</span><span class="p">:</span>
<span class="n">ujson</span><span class="o">.</span><span class="n">dump</span><span class="p">(</span><span class="n">d</span><span class="p">,</span> <span class="n">fd</span><span class="p">)</span>
<span class="k">return</span> <span class="n">filenames</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1">### Setup Test Pings ###</span>
<span class="k">def</span> <span class="nf">make_ping</span><span class="p">(</span><span class="n">ae</span><span class="p">,</span> <span class="n">aeb</span><span class="p">,</span> <span class="n">chan</span><span class="p">,</span> <span class="n">log</span><span class="p">):</span>
<span class="k">return</span> <span class="p">{</span><span class="s1">&#39;activeExperiment&#39;</span><span class="p">:</span> <span class="n">ae</span><span class="p">,</span>
<span class="s1">&#39;activeExperimentBranch&#39;</span><span class="p">:</span> <span class="n">aeb</span><span class="p">,</span>
<span class="s1">&#39;appUpdateChannel&#39;</span><span class="p">:</span> <span class="n">chan</span><span class="p">,</span>
<span class="s1">&#39;log&#39;</span><span class="p">:</span> <span class="n">log</span><span class="p">}</span>
<span class="n">NUM_ACTIVATIONS</span> <span class="o">=</span> <span class="mi">5</span>
<span class="n">NUM_ACTIVES</span> <span class="o">=</span> <span class="mi">7</span>
<span class="n">NUM_TERMINATIONS</span> <span class="o">=</span> <span class="mi">3</span>
<span class="n">TOTAL</span> <span class="o">=</span> <span class="n">NUM_ACTIVATIONS</span> <span class="o">+</span> <span class="n">NUM_ACTIVES</span> <span class="o">+</span> <span class="n">NUM_TERMINATIONS</span>
<span class="n">_channel</span><span class="p">,</span> <span class="n">exp_id</span><span class="p">,</span> <span class="n">the_date</span> <span class="o">=</span> <span class="s1">&#39;release&#39;</span><span class="p">,</span> <span class="s1">&#39;tls13-compat-ff51@experiments.mozilla.org&#39;</span><span class="p">,</span> <span class="s1">&#39;20140101&#39;</span>
<span class="n">branch</span><span class="p">,</span> <span class="n">reason</span><span class="p">,</span> <span class="n">data</span> <span class="o">=</span> <span class="s1">&#39;branch&#39;</span><span class="p">,</span> <span class="s1">&#39;REJECTED&#39;</span><span class="p">,</span> <span class="p">[</span><span class="s1">&#39;minBuildId&#39;</span><span class="p">]</span>
<span class="n">log</span> <span class="o">=</span> <span class="p">[</span><span class="mi">17786</span><span class="p">,</span> <span class="n">reason</span><span class="p">,</span> <span class="n">exp_id</span><span class="p">]</span> <span class="o">+</span> <span class="n">data</span>
<span class="n">pings</span> <span class="o">=</span> <span class="p">[</span><span class="n">make_ping</span><span class="p">(</span><span class="n">exp_id</span><span class="p">,</span> <span class="n">branch</span><span class="p">,</span> <span class="n">_channel</span><span class="p">,</span> <span class="p">[])</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">xrange</span><span class="p">(</span><span class="n">NUM_ACTIVES</span><span class="p">)]</span> <span class="o">+</span>\
<span class="p">[</span><span class="n">make_ping</span><span class="p">(</span><span class="n">exp_id</span><span class="p">,</span> <span class="n">branch</span><span class="p">,</span> <span class="n">_channel</span><span class="p">,</span> <span class="p">[[</span><span class="s1">&#39;EXPERIMENT_ACTIVATION&#39;</span><span class="p">]</span> <span class="o">+</span> <span class="n">log</span><span class="p">])</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">xrange</span><span class="p">(</span><span class="n">NUM_ACTIVATIONS</span><span class="p">)]</span> <span class="o">+</span>\
<span class="p">[</span><span class="n">make_ping</span><span class="p">(</span><span class="n">exp_id</span><span class="p">,</span> <span class="n">branch</span><span class="p">,</span> <span class="n">_channel</span><span class="p">,</span> <span class="p">[[</span><span class="s1">&#39;EXPERIMENT_TERMINATION&#39;</span><span class="p">]</span> <span class="o">+</span> <span class="n">log</span><span class="p">])</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">xrange</span><span class="p">(</span><span class="n">NUM_TERMINATIONS</span><span class="p">)]</span>
<span class="c1">### Setup expected result aggregate ###</span>
<span class="k">def</span> <span class="nf">channels_agg_assert</span><span class="p">(</span><span class="n">channels</span><span class="p">,</span> <span class="n">counts</span><span class="o">=</span><span class="mi">1</span><span class="p">):</span>
<span class="c1">#Should just be the channel we provided</span>
<span class="k">assert</span> <span class="n">channels</span><span class="o">.</span><span class="n">viewkeys</span><span class="p">()</span> <span class="o">==</span> <span class="nb">set</span><span class="p">([</span><span class="n">_channel</span><span class="p">]),</span> <span class="s1">&#39;Incorrect channels: &#39;</span> <span class="o">+</span> <span class="s1">&#39;,&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">channels</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="c1">#just check this one channel now</span>
<span class="n">release</span> <span class="o">=</span> <span class="n">channels</span><span class="p">[</span><span class="n">_channel</span><span class="p">]</span>
<span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">release</span><span class="p">[</span><span class="s1">&#39;errors&#39;</span><span class="p">])</span> <span class="o">==</span> <span class="mi">0</span><span class="p">,</span> <span class="s1">&#39;Had Errors: &#39;</span> <span class="o">+</span> <span class="s1">&#39;,&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">release</span><span class="p">[</span><span class="s1">&#39;errors&#39;</span><span class="p">])</span>
<span class="c1">#now check experiment totals</span>
<span class="k">assert</span> <span class="n">release</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="s1">&#39;EXPERIMENT_ACTIVATION&#39;</span><span class="p">][</span><span class="nb">tuple</span><span class="p">([</span><span class="n">reason</span><span class="p">]</span> <span class="o">+</span> <span class="n">data</span><span class="p">)]</span> <span class="o">==</span> <span class="n">NUM_ACTIVATIONS</span> <span class="o">*</span> <span class="n">counts</span><span class="p">,</span>\
<span class="s1">&#39;Expected &#39;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">NUM_ACTIVATIONS</span> <span class="o">*</span> <span class="n">counts</span><span class="p">)</span> <span class="o">+</span> \
<span class="s1">&#39;, Got &#39;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">release</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="s1">&#39;EXPERIMENT_ACTIVATION&#39;</span><span class="p">][</span><span class="nb">tuple</span><span class="p">([</span><span class="n">reason</span><span class="p">]</span> <span class="o">+</span> <span class="n">data</span><span class="p">)])</span>
<span class="k">assert</span> <span class="n">release</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="s1">&#39;EXPERIMENT_TERMINATION&#39;</span><span class="p">][</span><span class="nb">tuple</span><span class="p">([</span><span class="n">reason</span><span class="p">]</span> <span class="o">+</span> <span class="n">data</span><span class="p">)]</span> <span class="o">==</span> <span class="n">NUM_TERMINATIONS</span> <span class="o">*</span> <span class="n">counts</span><span class="p">,</span>\
<span class="s1">&#39;Expected &#39;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">NUM_TERMINATIONS</span> <span class="o">*</span> <span class="n">counts</span><span class="p">)</span> <span class="o">+</span> \
<span class="s1">&#39;, Got &#39;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">release</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="s1">&#39;EXPERIMENT_TERMINATION&#39;</span><span class="p">][</span><span class="nb">tuple</span><span class="p">([</span><span class="n">reason</span><span class="p">]</span> <span class="o">+</span> <span class="n">data</span><span class="p">)])</span>
<span class="c1">#`active` is counted for both just active, and for activations and terminations above</span>
<span class="k">assert</span> <span class="n">release</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="s1">&#39;active&#39;</span><span class="p">][</span><span class="n">branch</span><span class="p">]</span> <span class="o">==</span> <span class="n">TOTAL</span> <span class="o">*</span> <span class="n">counts</span><span class="p">,</span>\
<span class="s1">&#39;Expected &#39;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">TOTAL</span> <span class="o">*</span> <span class="n">counts</span><span class="p">)</span> <span class="o">+</span>\
<span class="s1">&#39;Got &#39;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">release</span><span class="p">[</span><span class="s1">&#39;experiments&#39;</span><span class="p">][</span><span class="n">exp_id</span><span class="p">][</span><span class="s1">&#39;active&#39;</span><span class="p">][</span><span class="n">branch</span><span class="p">])</span>
<span class="c1">### Test non-spark - easier debugging ###</span>
<span class="n">channel_1</span><span class="p">,</span> <span class="n">channel_2</span> <span class="o">=</span> <span class="n">get_empty_channel</span><span class="p">(),</span> <span class="n">get_empty_channel</span><span class="p">()</span>
<span class="k">for</span> <span class="n">ping</span> <span class="ow">in</span> <span class="n">pings</span><span class="p">:</span>
<span class="n">channel_1</span> <span class="o">=</span> <span class="n">channel_ping_agg</span><span class="p">(</span><span class="n">channel_1</span><span class="p">,</span> <span class="n">ping</span><span class="p">)</span>
<span class="n">channel_2</span> <span class="o">=</span> <span class="n">channel_ping_agg</span><span class="p">(</span><span class="n">channel_2</span><span class="p">,</span> <span class="n">ping</span><span class="p">)</span>
<span class="c1"># no actual key-value reduce, so just have to add the channel as key</span>
<span class="n">res_chan</span> <span class="o">=</span> <span class="p">((</span><span class="n">_channel</span><span class="p">,</span> <span class="n">channel_channel_agg</span><span class="p">(</span><span class="n">channel_1</span><span class="p">,</span> <span class="n">channel_2</span><span class="p">)),)</span>
<span class="n">res_chan</span> <span class="o">=</span> <span class="n">add_counts</span><span class="p">(</span><span class="n">res_chan</span><span class="p">)</span>
<span class="c1"># we&#39;ve aggregated over the pings twice, so counts=2</span>
<span class="n">channels_agg_assert</span><span class="p">({</span><span class="n">channel</span><span class="p">:</span> <span class="n">agg</span> <span class="k">for</span> <span class="n">channel</span><span class="p">,</span> <span class="n">agg</span> <span class="ow">in</span> <span class="n">res_chan</span><span class="p">},</span> <span class="n">counts</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="n">write_aggregate</span><span class="p">(</span><span class="n">res_chan</span><span class="p">,</span> <span class="n">the_date</span><span class="p">,</span> <span class="n">filename_prefix</span><span class="o">=</span><span class="s2">&quot;nonspark_test&quot;</span><span class="p">)</span>
<span class="c1">#### Test Spark ###</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">aggregate_pings</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">pings</span><span class="p">))</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">add_counts</span><span class="p">(</span><span class="n">res</span><span class="p">)</span>
<span class="n">channels</span> <span class="o">=</span> <span class="p">{</span><span class="n">channel</span><span class="p">:</span> <span class="n">agg</span> <span class="k">for</span> <span class="n">channel</span><span class="p">,</span> <span class="n">agg</span> <span class="ow">in</span> <span class="n">res</span><span class="p">}</span>
<span class="n">channels_agg_assert</span><span class="p">(</span><span class="n">channels</span><span class="p">,</span> <span class="n">counts</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">write_aggregate</span><span class="p">(</span><span class="n">res</span><span class="p">,</span> <span class="n">the_date</span><span class="p">,</span> <span class="n">filename_prefix</span><span class="o">=</span><span class="s2">&quot;spark_test&quot;</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>[&#39;spark_test20140101-release.json.gz&#39;]
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1">### Run on actual data - use CEP to get counts ###</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">aggregate_pings</span><span class="p">(</span><span class="n">subset</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">add_counts</span><span class="p">(</span><span class="n">result</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1">### Upload target day&#39;s data files ###</span>
<span class="kn">import</span> <span class="nn">boto3</span>
<span class="kn">import</span> <span class="nn">botocore</span>
<span class="kn">from</span> <span class="nn">boto3.s3.transfer</span> <span class="kn">import</span> <span class="n">S3Transfer</span>
<span class="n">output_files</span> <span class="o">=</span> <span class="n">write_aggregate</span><span class="p">(</span><span class="n">result</span><span class="p">,</span> <span class="n">target_date</span><span class="p">)</span>
<span class="n">data_bucket</span> <span class="o">=</span> <span class="s2">&quot;telemetry-public-analysis-2&quot;</span>
<span class="n">s3path</span> <span class="o">=</span> <span class="s2">&quot;experiments/data&quot;</span>
<span class="n">gz_csv_args</span> <span class="o">=</span> <span class="p">{</span><span class="s1">&#39;ContentEncoding&#39;</span><span class="p">:</span> <span class="s1">&#39;gzip&#39;</span><span class="p">,</span> <span class="s1">&#39;ContentType&#39;</span><span class="p">:</span> <span class="s1">&#39;text/csv&#39;</span><span class="p">}</span>
<span class="n">client</span> <span class="o">=</span> <span class="n">boto3</span><span class="o">.</span><span class="n">client</span><span class="p">(</span><span class="s1">&#39;s3&#39;</span><span class="p">,</span> <span class="s1">&#39;us-west-2&#39;</span><span class="p">)</span>
<span class="n">transfer</span> <span class="o">=</span> <span class="n">S3Transfer</span><span class="p">(</span><span class="n">client</span><span class="p">)</span>
<span class="k">for</span> <span class="n">output_file</span> <span class="ow">in</span> <span class="n">output_files</span><span class="p">:</span>
<span class="n">transfer</span><span class="o">.</span><span class="n">upload_file</span><span class="p">(</span>
<span class="n">output_file</span><span class="p">,</span>
<span class="n">data_bucket</span><span class="p">,</span>
<span class="s2">&quot;{}/{}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">s3path</span><span class="p">,</span> <span class="n">output_file</span><span class="p">),</span>
<span class="n">extra_args</span><span class="o">=</span><span class="n">gz_csv_args</span>
<span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
<!-- Comment section: heading with the comment count, the comment input, and the
     submit button. The ids "comment-text" and "post_comment_btn" are unchanged;
     presumably /static/js/comments.js looks them up (confirm against that script). -->
<div class="container-fluid">
  <div class="row">
    <div class="col-md-12">
      <h2> 0 Comments </h2>
      <!-- Was an invalid "br" end tag, which browsers recover from by inserting a
           line break; written as a proper void element so the markup validates. -->
      <br>
      <div>
        <div class="row">
          <div class="col-md-12">
            <!-- A placeholder is not an accessible name, so aria-label names the
                 field. The type attribute was dropped: it is not valid on textarea. -->
            <textarea class="form-control" id="comment-text" style="height:87px;" placeholder="Leave a comment..." aria-label="Comment"></textarea>
          </div>
        </div>
      </div>
      <br>
      <div>
        <!-- Explicit type so the button can never act as a form submit control. -->
        <button class="btn btn-primary" id="post_comment_btn" type="button">Post Comment</button>
      </div>
    </div>
  </div>
  <br>
  <div class="row">
    <!-- Empty column kept as-is; nothing is rendered into it in this static page. -->
    <div class="col-md-12">
    </div>
    <br>
  </div>
</div>
</div>
</div>
<!-- Page footer: Knowledge Repo attribution with version link, plus the
     "last indexed" note (its title attribute holds the last update check). -->
<div class="footer">
  <!-- The heart icon stands in for a word in the sentence, so it is exposed to
       assistive technology as an image named "love" rather than being silent. -->
  Served with <span class="glyphicon glyphicon-heart" role="img" aria-label="love"></span> by <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a> <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br>
  <i title="Last checked for updates: 4 minutes ago">Last indexed: 14 hours ago</i>
</div>
<script>
  // Search bar wiring: caret placement, typeahead suggestions fetched from
  // /ajax/index/typeahead, navigation to a selected post, and Enter-to-search
  // through the /feed page.

  // Move the caret to the end of any pre-filled query. Guarded so that a page
  // without a #searchbar element does not throw and abort the rest of this script.
  if ($('#searchbar').length) {
    $('#searchbar')[0].setSelectionRange(1000, 1000);
  }

  $('#searchbar').typeahead({
    hint: false,
    highlight: true,
    minLength: 1
  },
  {
    name: 'knowledge_posts',
    limit: 10,
    // Text placed in the input when a suggestion is chosen.
    display: function (item) {
      return item.title + " - " + item.author;
    },
    templates: {
      empty: Handlebars.compile(
        '<div class="tt-not-found">' +
          'Unable to find any posts that match the current query' +
        '</div>'
      ),
      // Compiled with Handlebars (like `empty` above) so that title and author
      // are HTML-escaped instead of being concatenated into markup verbatim.
      // NOTE(review): assumes the endpoint returns unescaped text; confirm it
      // does not pre-escape these fields, or they will be double-escaped.
      suggestion: Handlebars.compile(
        '<p style="overflow-wrap:break-word">' +
          '<strong class="text-rausch">{{title}}</strong> – {{author}}' +
        '</p>'
      )
    },
    // Asynchronous source: results are delivered through the `async` callback only.
    source: function (q, sync, async) {
      // Encode the query so characters such as & # + stay part of the search term.
      $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q), {
        success: function (data, status) {
          // jQuery may already have parsed the body when it is served as JSON;
          // only parse when the payload is still a string.
          async(typeof data === 'string' ? JSON.parse(data) : data);
        }
      });
    }
  });

  // Selecting a suggestion opens that post.
  $('#searchbar').on('typeahead:select', function (obj, datum, name) {
    window.location = '/post/' + encodeURIComponent(datum.path);
  });

  // Pressing Enter (key code 13) runs the query against the feed page.
  $('#searchbar').keypress(function (event) {
    var keycode = event.keyCode ? event.keyCode : event.which;
    if (keycode === 13) {
      // Encoded for the same reason as the typeahead request above.
      window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
    }
  });

  // Resize the suggestion menu: search bar width plus the menu's current outer width.
  var padding = $('.tt-menu').outerWidth();
  $('.tt-menu').width($('#searchbar').width() + padding + "px");
</script>
<script src="/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
$("document").ready(function(){
var is_webeditor = false;
var post_id = "15";
var id = "None";
var post_path = "etl/experiments.kp"
var data_repo_github_root = ""
tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);
$(".btn-rendered").on("click", function(){
document.location.href = "/post/" + encodeURI(post_path);
})
$(".btn-raw").on("click", function(){
document.location.href = "/post/" + encodeURI(post_path) + "?render=raw";
})
$(".btn-webeditor").on("click", function(){
document.location.href = "/edit/" + encodeURI(post_path);
})
});
</script>
<script src="/static/js/helpers.js"></script>
<script src="/static/js/tags.js" type="text/javascript"></script>
<script src="/static/js/icons.js" type="text/javascript"></script>
<script src="/static/js/comments.js" type="text/javascript"></script>
<!-- MathJax configuration block (type "text/x-mathjax-config"): enables the
     tex2jax extension with TeX input and HTML-CSS output, declares $...$ and
     \(...\) as inline-math delimiters and $$...$$ and \[...\] as display-math
     delimiters, turns on processEscapes, and restricts the HTML-CSS output
     to the "TeX" font set. -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- Loads MathJax itself; the inline "text/x-mathjax-config" block supplies the
     configuration. The original cdn.mathjax.org host was retired in 2017, so a
     pinned 2.x release is loaded from cdnjs instead of the floating "latest". -->
<script type="text/javascript"
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js">
</script>
<script>
// DOM-ready setup for the post page: marks the "rendered" button active,
// linkifies headers and wires up the comment and tag-subscription buttons.
// Uses $(document).ready(...) because the $(document).on('ready', ...) event
// form was removed in jQuery 3.0 and the handler would never run there.
$(document).ready(function(){
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");
  // Initialize headers
  helpersJx.linkifyHeaders();
  // Initialize comments
  var post_path = 'etl/experiments.kp';
  $("#post_comment_btn").on('click', function(){
    // These three are assigned without `var`, i.e. as globals, exactly as the
    // page always did; kept that way in case other scripts read them.
    comment_author = 'knowledge_default';
    post_author = 'Frank Bertsch';
    post_title = 'Experiment Job';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): the reload only shows the new comment if postComment has
    // finished its request by the time it returns — confirm in comments.js.
    location.reload();
  });
  // Delete buttons carry ids of the form "delete_comment...__<comment id>";
  // the part after "__" is passed to deleteComment.
  all_comment_delete_buttons = $("[id^=delete_comment]");
  $.each(all_comment_delete_buttons, function(i,v){
    $(v).on("click", function(){
      var id = v.id;
      var comment_id = id.split("__")[1];
      if(comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });
  // Delegated handler: subscription buttons are created later inside popovers,
  // so the click is caught on <body> and the clicked button is handed over.
  $(document.body).on('click',"button[id^=tag-subscription]",function () {
    tagsJx.addTagSubscriptionListener($(this)[0]);
  });
});
// Turn every h2–h6 heading into a link pointing at "#<slug>".
// h1 is the post title and is deliberately left out.
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")];
$.each(all_headers, function (index, headings) {
  $.each(headings, function (i, heading) {
    var inner_html = heading.innerHTML;
    // Slug: drop everything except letters, hyphens and spaces from the
    // heading's innerHTML, lowercase it and turn spaces into hyphens.
    // Declared with `var` so it no longer leaks as an implicit global.
    var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "");
    var inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-");
    heading.innerHTML = "<a href='" + inner_link + "' class='link-reset'>" + inner_html + "</a>";
  });
});
//turn all the tags into links, similar to what's done on the feed page
// `tags` is the first element matching "#renderedMarkdown .metadata .tags".
// NOTE(review): if nothing matches, `tags` is undefined and every
// tags.appendChild / tags.nextSibling access below throws — confirm the
// element exists in this page.
var tags = $("#renderedMarkdown .metadata .tags")[0]
// Tag names rendered for this post, and the tags considered subscribed
// (empty here, so every tag takes the "unsubscribed" branch below).
var tags_list = ['experiment', 'firefox']
var subscriptions_list = []
$.each(tags_list, function(i,tag){
// All names assigned in this callback without `var` (ahref, e_tag, f_tag,
// tag_name, ...) are implicit globals.
ahref = document.createElement("a")
// e_tag: URL-encoded tag for the link target.
e_tag = encodeURIComponent(tag)
// f_tag: tag with its first "/" replaced by "__" (a string pattern in
// String.replace only replaces the first occurrence).
f_tag = tag.replace("/", "__")
tag_name = "#" + tag
// Button id has the form "tag-subscription-<index>__<f_tag>".
tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
// data-* attributes configuring the popover attached to this link.
ahref.setAttribute("data-container", "body")
ahref.setAttribute("data-toggle", "popover")
ahref.setAttribute("data-placement", "bottom")
ahref.setAttribute("data-html", "true")
ahref.setAttribute("data-tag-name", f_tag)
// Popover content: an "Unsubscribe" button for subscribed tags, otherwise a
// "Subscribe" button; the label class differs accordingly.
if (subscriptions_list.indexOf(tag) >= 0) {
ahref.setAttribute("class", "label label-subscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-primary btn-unsubscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
" </button> " +
" </div>")
} else {
ahref.setAttribute("class", "label label-unsubscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-default btn-subscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
" </button> " +
" </div>")
}
ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
ahref.setAttribute("style", "font-weight:normal")
// Before the first tag, emit the "Tags: " label and give the link a leading
// space.
if (i == 0){
ahref.innerHTML = " "
colon = document.createElement("text")
colon.innerHTML = "<b>Tags</b>: "
tags.appendChild(colon)
}
// Link text is "#<tag>" (appended after the leading space for the first tag).
ahref.innerHTML = ahref.innerHTML + tag_name
tags.appendChild(ahref)
// Separate tags with ", " except after the last one.
if (i != tags_list.length - 1){
comma = document.createElement("text")
comma.innerText = ", "
tags.appendChild(comma)
}
})
// Remove the node that immediately follows the tags element, then append the
// edit icon (id "tooltip-edit_tags") to the tags element's markup.
tags.nextSibling.remove()
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
// Hover-driven popovers for every ".pop" element. The popover is created in
// "manual" trigger mode and shown/hidden from the mouse handlers below, so it
// stays open while the pointer moves from the label onto the popover itself.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on("mouseenter", function () {
    var $label = $(this);
    $label.popover("show");
    // Hide again as soon as the pointer leaves any open popover.
    $(".popover").on("mouseleave", function () {
      $label.popover("hide");
    });
  })
  .on("mouseleave", function () {
    var $label = $(this);
    // After 300 ms, hide unless the pointer is currently over a popover.
    setTimeout(function () {
      if ($(".popover:hover").length === 0) {
        $label.popover("hide");
      }
    }, 300);
  });
// Click handler for the edit icon: swaps the rendered tag links for an inline
// form (label, text input pre-filled with the current tags, save icon) and
// wires up saving. The handler contains two generations of the same feature:
// a hand-built form with inline validation and a synchronous POST, followed by
// a second click handler built on the iconsJx / tagsJx helpers.
// Names assigned without `var` below are implicit globals.
$('#tooltip-edit_tags').click(function(){
// Hide the edit icon while editing.
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// Assigned but not read again inside this handler.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
// Build the form pieces: <form>, text <input>, a <text> label and the save
// icon (id "tooltip-save_tags").
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Replaces everything currently inside the tags element with a single space
// before the form is inserted.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate the comma-separated tags, then POST them.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// Overwrites the outer tags_list with the edited values (untrimmed).
tags_list = tags_string.split(",")
// Allowed characters: letters, digits, "-", "_", ":" and "/".
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The empty check runs on the untrimmed piece; the character check runs on
// the trimmed one.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
// Sends the raw input string as {"tags": ...} in a JSON body. The request is
// synchronous (async: false), so the reload below runs after it returns.
var postContent = {}
postContent['tags'] = tags_string
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/experiments.kp',
async: false
});
location.reload()
}
})
// Still inside the outer click handler: remove the node following the tags
// element and insert a fresh edit icon after it.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// NOTE(review): presumably the icon created above carries the id
// "tooltip-edit_tags", which this selector relies on — confirm in icons.js.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
// Same form as above, built with jQuery objects this time.
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this sets a plain property on the
// wrapper rather than on the DOM element; the label text comes from the
// .html("Tags: ") call above.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Three ways to save, all delegating to tagsJx.changeAndSaveTags:
// Enter in the input (key code 13), a click on the save icon, and form submit.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/experiments.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
// Returning false stops the key press from triggering the default action.
return false;
};
});
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/experiments.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// The "form" selector matches every <form> on the page, not only the one
// created above; returning false cancels the native submission.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/experiments.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>
</body>
</html>

Просмотреть файл

@ -0,0 +1,13 @@
{
"title": "Experiment Job",
"authors": [
"Frank Bertsch"
],
"tags": [
"experiment",
"firefox"
],
"publish_date": "2017-02-01",
"updated_at": "2016-02-08",
"tldr": "We take all the pings from yesterday, get the information about any experiments: those that started, those running, and those that ended. These are aggregated by channel and outputted to files in s3."
}

Просмотреть файл

@ -0,0 +1,606 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
  /* Loading spinner: pinned to the middle of the viewport. The negative
     margins pull the box back so that it is centred on the 50% / 50% point. */
  .spinner {
    /* box size */
    width: 100px;        /* spinner gif width */
    height: 102px;       /* spinner gif height, plus 2px for an IE8 issue */
    overflow: auto;

    /* placement */
    position: fixed;
    left: 50%;
    top: 50%;
    margin-top: -50px;   /* half the spinner gif height */
    margin-left: -50px;  /* half the spinner gif width */
    z-index: 1234;

    /* content */
    text-align: center;
  }

  /* Tables use a 14px font. */
  .table {
    font-size: 14px;
  }

  /* Cap the width of modal dialogs. */
  .modal-content {
    max-width: 1024px;
  }
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>Mobile Clients ETL Job</h1>
<span class="authors"><a href="/feed?authors=Frank+Bertsch">Frank Bertsch</a></span>
<span class="date_created">February 17, 2017</span>
<span class="date_updated">(Last Updated: May 01, 2017)</span>
<span class="tldr"><p>This job basically just takes core pings and puts them in parquet format.</p></span>
</div>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">datetime</span> <span class="kn">as</span> <span class="nn">dt</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">ujson</span> <span class="kn">as</span> <span class="nn">json</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings</span><span class="p">,</span> <span class="n">get_pings_properties</span>
<span class="o">%</span><span class="n">pylab</span> <span class="n">inline</span>
</pre></div>
<div class="codehilite"><pre><span></span>
</pre></div>
<p>Take the set of pings, make sure we have actual clientIds and remove duplicate pings. We collect each unique ping.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">dedupe_pings</span><span class="p">(</span><span class="n">rdd</span><span class="p">):</span>
<span class="k">return</span> <span class="n">rdd</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="n">p</span><span class="p">[</span><span class="s2">"meta/clientId"</span><span class="p">]</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="s2">"meta/documentId"</span><span class="p">],</span> <span class="n">p</span><span class="p">))</span>\
<span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
</pre></div>
<p>Transform and sanitize the pings into arrays.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="c1"># Should not be None since we filter those out.</span>
<span class="n">clientId</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"meta/clientId"</span><span class="p">]</span>
<span class="c1"># Added via the ingestion process so should not be None.</span>
<span class="n">submissionDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">ping</span><span class="p">[</span><span class="s2">"meta/submissionDate"</span><span class="p">],</span> <span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">geoCountry</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"meta/geoCountry"</span><span class="p">]</span>
<span class="n">profileDate</span> <span class="o">=</span> <span class="bp">None</span>
<span class="n">profileDaynum</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"profileDate"</span><span class="p">]</span>
<span class="k">if</span> <span class="n">profileDaynum</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">try</span><span class="p">:</span>
<span class="c1"># Bad data could push profileDaynum &gt; 32767 (size of a C int) and throw exception</span>
<span class="n">profileDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="p">(</span><span class="mi">1970</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">profileDaynum</span><span class="p">))</span>
<span class="k">except</span><span class="p">:</span>
<span class="n">profileDate</span> <span class="o">=</span> <span class="bp">None</span>
<span class="c1"># Create date can be an improper string (~.03% of the time, so ignore)</span>
<span class="c1"># Year can be &lt; 2000 (~.005% of the time, so ignore)</span>
<span class="k">try</span><span class="p">:</span>
<span class="c1"># Create date should already be in ISO format</span>
<span class="n">creationDate</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"created"</span><span class="p">]</span>
<span class="k">if</span> <span class="n">creationDate</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="c1"># This is only accurate because we know the creation date is always in 'Z' (zulu) time.</span>
<span class="n">creationDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">ping</span><span class="p">[</span><span class="s2">"created"</span><span class="p">],</span> <span class="s2">"%Y-%m-</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="k">if</span> <span class="n">creationDate</span><span class="o">.</span><span class="n">year</span> <span class="o">&lt;</span> <span class="mi">2000</span><span class="p">:</span>
<span class="n">creationDate</span> <span class="o">=</span> <span class="bp">None</span>
<span class="k">except</span> <span class="ne">ValueError</span><span class="p">:</span>
<span class="n">creationDate</span> <span class="o">=</span> <span class="bp">None</span>
<span class="n">appVersion</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"meta/appVersion"</span><span class="p">]</span>
<span class="n">buildId</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"meta/appBuildId"</span><span class="p">]</span>
<span class="n">locale</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"locale"</span><span class="p">]</span>
<span class="n">os</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"os"</span><span class="p">]</span>
<span class="n">osVersion</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"osversion"</span><span class="p">]</span>
<span class="n">device</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"device"</span><span class="p">]</span>
<span class="n">arch</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"arch"</span><span class="p">]</span>
<span class="n">defaultSearch</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"defaultSearch"</span><span class="p">]</span>
<span class="n">distributionId</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"distributionId"</span><span class="p">]</span>
<span class="n">experiments</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"experiments"</span><span class="p">]</span>
<span class="k">if</span> <span class="n">experiments</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">experiments</span> <span class="o">=</span> <span class="p">[]</span>
<span class="c1">#bug 1315028</span>
<span class="n">defaultNewTabExperience</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"defaultNewTabExperience"</span><span class="p">]</span>
<span class="n">defaultMailClient</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"defaultMailClient"</span><span class="p">]</span>
<span class="c1">#bug 1307419</span>
<span class="n">searches</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"searches"</span><span class="p">]</span>
<span class="n">durations</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"durations"</span><span class="p">]</span>
<span class="n">sessions</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">"sessions"</span><span class="p">]</span>
<span class="k">return</span> <span class="p">[</span><span class="n">clientId</span><span class="p">,</span> <span class="n">submissionDate</span><span class="p">,</span> <span class="n">creationDate</span><span class="p">,</span> <span class="n">profileDate</span><span class="p">,</span> <span class="n">geoCountry</span><span class="p">,</span> <span class="n">locale</span><span class="p">,</span> <span class="n">os</span><span class="p">,</span>
<span class="n">osVersion</span><span class="p">,</span> <span class="n">buildId</span><span class="p">,</span> <span class="n">appVersion</span><span class="p">,</span> <span class="n">device</span><span class="p">,</span> <span class="n">arch</span><span class="p">,</span> <span class="n">defaultSearch</span><span class="p">,</span> <span class="n">distributionId</span><span class="p">,</span>
<span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">experiments</span><span class="p">),</span> <span class="n">defaultNewTabExperience</span><span class="p">,</span> <span class="n">defaultMailClient</span><span class="p">,</span> <span class="n">searches</span><span class="p">,</span>
<span class="n">durations</span><span class="p">,</span> <span class="n">sessions</span><span class="p">]</span>
</pre></div>
<p>Create a set of pings from “core” to build a set of core client data. Output the data to CSV or Parquet.</p>
<p>This script is designed to loop over a range of days and output a single day for the given channels. Use explicit date ranges for backfilling, or now() - 1day for automated runs.</p>
<div class="codehilite"><pre><span></span><span class="n">channels</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"nightly"</span><span class="p">,</span> <span class="s2">"aurora"</span><span class="p">,</span> <span class="s2">"beta"</span><span class="p">,</span> <span class="s2">"release"</span><span class="p">]</span>
<span class="n">batch_date</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'date'</span><span class="p">)</span>
<span class="k">if</span> <span class="n">batch_date</span><span class="p">:</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">batch_date</span><span class="p">,</span> <span class="s1">'%Y%m</span><span class="si">%d</span><span class="s1">'</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span> <span class="o">-</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span> <span class="o">-</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="n">day</span> <span class="o">=</span> <span class="n">start</span>
<span class="k">while</span> <span class="n">day</span> <span class="o">&lt;=</span> <span class="n">end</span><span class="p">:</span>
<span class="k">for</span> <span class="n">channel</span> <span class="ow">in</span> <span class="n">channels</span><span class="p">:</span>
<span class="k">print</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">channel: "</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">", date: "</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span>
<span class="n">doc_type</span><span class="o">=</span><span class="s2">"core"</span><span class="p">,</span>
<span class="n">submission_date</span><span class="o">=</span><span class="p">(</span><span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">),</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)),</span>
<span class="n">channel</span><span class="o">=</span><span class="n">channel</span><span class="p">,</span>
<span class="n">app</span><span class="o">=</span><span class="s2">"Fennec"</span><span class="p">,</span>
<span class="n">fraction</span><span class="o">=</span><span class="mi">1</span>
<span class="p">)</span>
<span class="c1"># Grab all available source_version pings</span>
<span class="n">pings</span> <span class="o">=</span> <span class="n">get_pings</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">source_version</span><span class="o">=</span><span class="s2">"*"</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="p">[</span><span class="s2">"meta/clientId"</span><span class="p">,</span>
<span class="s2">"meta/documentId"</span><span class="p">,</span>
<span class="s2">"meta/submissionDate"</span><span class="p">,</span>
<span class="s2">"meta/appVersion"</span><span class="p">,</span>
<span class="s2">"meta/appBuildId"</span><span class="p">,</span>
<span class="s2">"meta/geoCountry"</span><span class="p">,</span>
<span class="s2">"locale"</span><span class="p">,</span>
<span class="s2">"os"</span><span class="p">,</span>
<span class="s2">"osversion"</span><span class="p">,</span>
<span class="s2">"device"</span><span class="p">,</span>
<span class="s2">"arch"</span><span class="p">,</span>
<span class="s2">"profileDate"</span><span class="p">,</span>
<span class="s2">"created"</span><span class="p">,</span>
<span class="s2">"defaultSearch"</span><span class="p">,</span>
<span class="s2">"distributionId"</span><span class="p">,</span>
<span class="s2">"experiments"</span><span class="p">,</span>
<span class="s2">"defaultNewTabExperience"</span><span class="p">,</span>
<span class="s2">"defaultMailClient"</span><span class="p">,</span>
<span class="s2">"searches"</span><span class="p">,</span>
<span class="s2">"durations"</span><span class="p">,</span>
<span class="s2">"sessions"</span><span class="p">])</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">dedupe_pings</span><span class="p">(</span><span class="n">subset</span><span class="p">)</span>
<span class="k">print</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">De-duped pings:"</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">subset</span><span class="o">.</span><span class="n">count</span><span class="p">())</span>
<span class="k">print</span> <span class="n">subset</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">transformed</span> <span class="o">=</span> <span class="n">subset</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">transform</span><span class="p">)</span>
<span class="k">print</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">Transformed pings:"</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">transformed</span><span class="o">.</span><span class="n">count</span><span class="p">())</span>
<span class="k">print</span> <span class="n">transformed</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">s3_output</span> <span class="o">=</span> <span class="s2">"s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mobile/mobile_clients"</span>
<span class="n">s3_output</span> <span class="o">+=</span> <span class="s2">"/v2/channel="</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">"/submission="</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"clientid"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"submissiondate"</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"creationdate"</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"profiledate"</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"geocountry"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"locale"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"os"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"osversion"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"buildid"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"appversion"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"device"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"arch"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"defaultsearch"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"distributionid"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"experiments"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"default_new_tab_experience"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"default_mail_client"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"searches"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"durations"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">"sessions"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">)</span>
<span class="p">])</span>
<span class="c1"># Make parquet partition file size large, but not too large for s3 to handle</span>
<span class="n">coalesce</span> <span class="o">=</span> <span class="mi">1</span>
<span class="k">if</span> <span class="n">channel</span> <span class="o">==</span> <span class="s2">"release"</span><span class="p">:</span>
<span class="n">coalesce</span> <span class="o">=</span> <span class="mi">4</span>
<span class="n">grouped</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">transformed</span><span class="p">,</span> <span class="n">schema</span><span class="p">)</span>
<span class="n">grouped</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">coalesce</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="s1">'overwrite'</span><span class="p">)</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">s3_output</span><span class="p">)</span>
<span class="n">day</span> <span class="o">+=</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
</div></body></html>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
// Search bar wiring: typeahead suggestions fetched from /ajax/index/typeahead,
// navigation to the selected post, and Enter-to-search on the feed page.

// Put the caret at the end of whatever text the search box already holds.
$("#searchbar")[0].setSelectionRange(1000, 1000);

$('#searchbar').typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: 'knowledge_posts',
  limit: 10,
  // String form of a suggestion ("<title> - <author>").
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
      'Unable to find any posts that match the current query' +
      '</div>'
    ),
    // Title and author are data returned by the server; escape them before
    // splicing them into markup so they cannot inject HTML into the menu.
    suggestion: function (data) {
      var escapeHtml = function (value) {
        return $('<div>').text(String(value)).html();
      };
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + escapeHtml(data.title) + '</strong> – ' + escapeHtml(data.author) + '</p>';
    }
  },
  // Results arrive asynchronously as a JSON string and are handed to the
  // dataset's async callback. The query is percent-encoded so characters such
  // as "&", "#" or "+" stay part of the search term.
  source: function (query, syncResults, asyncResults) {
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(query), {
      success: function (data, status) {
        asyncResults(JSON.parse(data));
      }
    });
  }
});

// Picking a suggestion opens that post.
$('#searchbar').on('typeahead:select', function (obj, datum, name) {
  window.location = '/post/' + encodeURIComponent(datum.path);
});

// Pressing Enter (key code 13) runs the typed text as a feed filter.
$('#searchbar').keypress(function (event) {
  var keycode = (event.keyCode ? event.keyCode : event.which);
  if (keycode == 13) {
    window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
  }
});

// Size the suggestion menu from the search box width plus the menu's own outer width.
var padding = $('.tt-menu').outerWidth();
$('.tt-menu').width($('#searchbar').width() + padding + "px");
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Post toolbar wiring, run once the DOM is ready: initialises the tooltips and
// points the rendered / raw / web-editor buttons at this post's URLs.
// The handler is registered on the document object itself; the string form
// $("document") is a selector that matches no element and is a deprecated way
// to register a ready callback.
$(document).ready(function () {
  var is_webeditor = false;
  var post_id = "11";
  var id = "None";
  var post_path = "etl/mobile-clients.kp";
  var data_repo_github_root = "";
  // The path is constant for this page, so encode it once for all three buttons.
  var encoded_post_path = encodeURI(post_path);

  tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);

  $(".btn-rendered").on("click", function () {
    document.location.href = "/post/" + encoded_post_path;
  });
  $(".btn-raw").on("click", function () {
    document.location.href = "/post/" + encoded_post_path + "?render=raw";
  });
  $(".btn-webeditor").on("click", function () {
    document.location.href = "/edit/" + encoded_post_path;
  });
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax setup: TeX input, HTML-CSS output, with the tex2jax preprocessor
// recognising $...$ and \(...\) as inline math and $$...$$ and \[...\] as
// display math.
MathJax.Hub.Config({
  extensions: ["tex2jax.js"],
  jax: ["input/TeX", "output/HTML-CSS"],
  tex2jax: {
    inlineMath: [
      ["$", "$"],
      ["\\(", "\\)"]
    ],
    displayMath: [
      ["$$", "$$"],
      ["\\[", "\\]"]
    ],
    processEscapes: true
  },
  "HTML-CSS": {
    availableFonts: ["TeX"]
  }
});
</script>
<!-- MathJax loader. The cdn.mathjax.org host was retired in 2017, so MathJax 2.x is
     loaded from cdnjs instead; configuration comes from the text/x-mathjax-config
     script that precedes this tag. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js" type="text/javascript"></script>
<script>
// Page initialisation, run once the DOM is ready: marks the rendered view as
// the active one, linkifies headers, and wires comment posting, comment
// deletion and tag subscription buttons.
// Registered with .ready() rather than .on('ready', ...): the event-style
// registration was removed in jQuery 3.0 and would leave this handler unused
// there, while .ready() works on every jQuery version.
$(document).ready(function () {
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'etl/mobile-clients.kp';
  $("#post_comment_btn").on('click', function () {
    // Declared locally; the values are only needed as arguments below.
    var comment_author = 'knowledge_default';
    var post_author = 'Frank Bertsch';
    var post_title = 'Mobile Clients ETL Job';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    location.reload();
  });

  // Delete buttons carry ids that start with "delete_comment"; the comment id
  // is the part after the "__" separator.
  $("[id^=delete_comment]").each(function (i, button) {
    $(button).on("click", function () {
      var comment_id = button.id.split("__")[1];
      if (comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });

  // Delegated on <body> so it also covers subscription buttons created later
  // (for example inside popovers).
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener($(this)[0]);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
// Each h2-h6 has its contents wrapped in a link to "#<slug>". The slug is the
// header's inner HTML with everything except letters, hyphens and spaces
// removed, lower-cased, and with spaces replaced by hyphens.
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")];
$.each(all_headers, function (index, headers_of_level) {
  $.each(headers_of_level, function (i, header) {
    var inner_html = header.innerHTML;
    // Kept local: assigning this without "var" would create a global.
    var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "");
    var inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-");
    header.innerHTML = "<a href='" + inner_link + "' class='link-reset'>" + inner_html + "</a>";
  });
});
//turn all the tags into links, similar to what's done on the feed page
// Fills the post's ".tags" span with a "Tags: " label followed by one link per
// tag. Each link points at /tag_pages?tag=<tag> and carries a popover
// (data-content) holding a Subscribe or Unsubscribe button, depending on
// whether the tag is in subscriptions_list. An edit icon is appended last.
// tags, tags_list and subscriptions_list stay page-level variables because the
// edit-tags handler further down reads and reassigns them.
var tags = $("#renderedMarkdown .metadata .tags")[0];
var tags_list = ['mobile', 'etl'];
var subscriptions_list = [];

// Without the span there is nothing to render into; skip instead of throwing
// so the rest of the page script still runs.
if (tags) {
  $.each(tags_list, function (i, tag) {
    var ahref = document.createElement("a");
    var e_tag = encodeURIComponent(tag);
    // NOTE(review): a string pattern replaces only the first "/" in the tag;
    // confirm against tags.js whether tags with several slashes are expected.
    var f_tag = tag.replace("/", "__");
    var tag_name = "#" + tag;
    var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag;
    var is_subscribed = subscriptions_list.indexOf(tag) >= 0;

    // The two popover variants differ only in these class names and the label.
    var button_classes = is_subscribed
      ? "btn btn-small btn-primary btn-unsubscribe"
      : "btn btn-small btn-default btn-subscribe";
    var icon_classes = is_subscribed
      ? "glyphicon glyphicon-remove-sign glyphicon-white"
      : "glyphicon glyphicon-ok-sign glyphicon-filled";
    var button_label = is_subscribed ? "Unsubscribe" : "Subscribe";

    ahref.setAttribute("data-container", "body");
    ahref.setAttribute("data-toggle", "popover");
    ahref.setAttribute("data-placement", "bottom");
    ahref.setAttribute("data-html", "true");
    ahref.setAttribute("data-tag-name", f_tag);
    ahref.setAttribute("class", is_subscribed ? "label label-subscribed pop" : "label label-unsubscribed pop");
    ahref.setAttribute("data-content", "<div class='content'>" +
      " <button class='" + button_classes + "'" +
      " title='' " +
      " id='" + tag_subscription_button_id_name + "'> " +
      " <i class='" + icon_classes + "'></i>" + button_label + " " +
      " </button> " +
      " </div>");
    ahref.setAttribute("href", "/tag_pages?tag=" + e_tag);
    ahref.setAttribute("style", "font-weight:normal");

    if (i == 0) {
      // The first tag is preceded by the "Tags: " label.
      var colon = document.createElement("text");
      colon.innerHTML = "<b>Tags</b>: ";
      tags.appendChild(colon);
    }
    // The first link keeps a leading space; the name is set as text so it is
    // never parsed as markup.
    ahref.textContent = (i == 0 ? " " : "") + tag_name;
    tags.appendChild(ahref);

    if (i != tags_list.length - 1) {
      var comma = document.createElement("text");
      comma.innerText = ", ";
      tags.appendChild(comma);
    }
  });

  // Drop whatever node immediately follows the tags span, if there is one.
  if (tags.nextSibling) {
    tags.nextSibling.remove();
  }
  tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>";
}
// Hover-driven popovers for every ".pop" element. Bootstrap's own trigger is
// set to "manual" so the popover can stay open while the pointer moves from
// the trigger into the popover itself.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on("mouseenter", function () {
    var trigger = this;
    $(trigger).popover("show");
    // Leaving the popover bubble closes it.
    $(".popover").on("mouseleave", function () {
      $(trigger).popover("hide");
    });
  })
  .on("mouseleave", function () {
    var trigger = this;
    // Wait 300 ms, then hide only if the pointer is not over a popover.
    setTimeout(function () {
      var hovered_popovers = $(".popover:hover");
      if (hovered_popovers.length === 0) {
        $(trigger).popover("hide");
      }
    }, 300);
  });
// Click handler for the edit-tags icon: hides the icon and replaces the tag
// links with an inline form (a text input pre-filled with the current tags and
// a save icon).
// NOTE(review): this handler appears to hold two generations of the same
// feature. The first half builds the form with raw DOM calls and saves through
// a synchronous POST to /tag_list. The second half (from "Allow user to edit
// tags") also runs inside this click handler: it inserts another edit icon via
// iconsJx and registers a further click handler on #tooltip-edit_tags that
// rebuilds the form and saves through tagsJx.changeAndSaveTags. Confirm whether
// the nesting is intentional before changing either half.
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// Assigned without "var" (as are the other bare names below); previousSibling
// is not read anywhere in this handler.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Replaces everything currently inside the tags span (the rendered tag links)
// with a single space before the form is appended.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate the comma-separated tags, POST them, then reload.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
tags_list = tags_string.split(",")
// A valid tag consists only of letters, digits, "-", "_", ":" and "/".
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The emptiness check runs before trimming, so a whitespace-only entry falls
// through to the pattern check below.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
// NOTE(review): the message says alphanumeric only, but the pattern above
// also accepts "-", "_", ":" and "/".
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
// The raw input string is sent, not the split/trimmed list.
postContent['tags'] = tags_string
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/mobile-clients.kp',
// Synchronous request, so the reload below runs only after it has completed.
async: false
});
location.reload()
}
})
// Removes the node that immediately follows the tags span.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Second click handler on the edit icon, registered each time the outer
// handler runs; it builds the form with jQuery and saves via tagsJx.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this sets a plain property on the
// wrapper rather than the element's markup; the label text was already set by
// .html() above.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter inside the input saves the tags; returning false stops the default
// handling of the key press.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/mobile-clients.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/mobile-clients.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Bound to every <form> on the page, not only the one built above; returning
// false cancels the native submit.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/mobile-clients.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,726 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="/static/modules/jquery/jquery.min.js"></script>
<script src="/static/modules/tether/js/tether.min.js"></script>
<script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="/static/modules/handlebars/js/handlebars.js"></script>
<script src="/static/js/helpers.js"></script>
<script src="/static/modules/select2/js/select2.min.js"></script>
<script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link rel="stylesheet" href="/static/modules/bootstrap/css/bootstrap.min.css">
<link rel="stylesheet" href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css">
<link rel="stylesheet" type='text/css' href="/static/css/custom.css">
<link rel="stylesheet" type='text/css' href="/static/modules/select2/css/select2.min.css">
<link href='https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700' rel='stylesheet' type='text/css'>
<link rel="shortcut icon" href="/static/images/favicon.png">
<link rel="stylesheet" href="/static/css/codehilite-friendly.css">
<style>
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<!-- Site navigation bar: logo, section links, the "Write a Post!" button and the
     search box. The logo image carries alt text and the search input an
     accessible name, since its placeholder alone does not label it. -->
<div class="navbar navbar-knowledge" role="navigation">
  <div class="container page-container">
    <a href="/" aria-selected="false" class="logo-image navbar-text" style='margin-top:16px; margin-left: -7px'>
      <img width="125" src='/static/images/logo-white.svg' alt="Knowledge home">
    </a>
    <a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Home
    </a>
    <a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Favorites
    </a>
    <a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      About
    </a>
    <a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Stats
    </a>
    <a id="webposts_tab" href="/create"
       class="btn btn-primary navbar-text pull-right"
       style='border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699'>
      Write a Post!
    </a>
    <div class="pull-right">
      <div class="form-group">
        <input class="form-control" id="searchbar" placeholder="Search for Knowledge" aria-label="Search for Knowledge" style="text-align:right">
      </div>
    </div>
  </div>
</div>
<div class="container page-container">
<br>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
<div class="btn-group" role="group">
<button type="button" class="btn btn-default btn-rendered">
View Post
</button>
<button type="button" class="btn btn-default btn-raw">
View Raw Markdown
</button>
</div>
</div>
<div class="col-md-2">
</div>
<div class="col-md-4 text-right">
<i class="glyphicon glyphicon-eye-open" id="pageview_stats" style='color: #9CA299'></i>
<div id="pageview_stats" style="display: inline-block">
Viewed 2 times by 1 different users
</div>
<i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop" style="font-size:16pt" id="tooltip-like" data-placement="bottom"
data-trigger="#tooltip-like"
data-container="body"
data-toggle="popover"
data-content='<div>Like This Post</div>'></i>
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class='metadata'>
<h1>Mobile Clients ETL Job</h1>
<span class='authors'><a href='/feed?authors=Frank+Bertsch'>Frank Bertsch</a></span>
<span class='date_created'>February 17, 2017</span>
<span class='date_updated'>(Last Updated: May 01, 2017)</span>
<span class='tldr'><p>This job basically just takes core pings and puts them in parquet format.</p></span>
<span class='tags'></span>
</div>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">datetime</span> <span class="kn">as</span> <span class="nn">dt</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">ujson</span> <span class="kn">as</span> <span class="nn">json</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings</span><span class="p">,</span> <span class="n">get_pings_properties</span>
<span class="o">%</span><span class="n">pylab</span> <span class="n">inline</span>
</pre></div>
<div class="codehilite"><pre><span></span>
</pre></div>
<p>Take the set of pings, make sure we have actual clientIds and remove duplicate pings. We collect each unique ping.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">dedupe_pings</span><span class="p">(</span><span class="n">rdd</span><span class="p">):</span>
<span class="k">return</span> <span class="n">rdd</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="n">p</span><span class="p">[</span><span class="s2">&quot;meta/clientId&quot;</span><span class="p">]</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="s2">&quot;meta/documentId&quot;</span><span class="p">],</span> <span class="n">p</span><span class="p">))</span>\
<span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="p">)</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
</pre></div>
<p>Transform and sanitize the pings into arrays.</p>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="c1"># Should not be None since we filter those out.</span>
<span class="n">clientId</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;meta/clientId&quot;</span><span class="p">]</span>
<span class="c1"># Added via the ingestion process so should not be None.</span>
<span class="n">submissionDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">ping</span><span class="p">[</span><span class="s2">&quot;meta/submissionDate&quot;</span><span class="p">],</span> <span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">geoCountry</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;meta/geoCountry&quot;</span><span class="p">]</span>
<span class="n">profileDate</span> <span class="o">=</span> <span class="bp">None</span>
<span class="n">profileDaynum</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;profileDate&quot;</span><span class="p">]</span>
<span class="k">if</span> <span class="n">profileDaynum</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">try</span><span class="p">:</span>
<span class="c1"># Bad data could push profileDaynum &gt; 32767 (size of a C int) and throw exception</span>
<span class="n">profileDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="p">(</span><span class="mi">1970</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">profileDaynum</span><span class="p">))</span>
<span class="k">except</span><span class="p">:</span>
<span class="n">profileDate</span> <span class="o">=</span> <span class="bp">None</span>
<span class="c1"># Create date can be an improper string (~.03% of the time, so ignore)</span>
<span class="c1"># Year can be &lt; 2000 (~.005% of the time, so ignore)</span>
<span class="k">try</span><span class="p">:</span>
<span class="c1"># Create date should already be in ISO format</span>
<span class="n">creationDate</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;created&quot;</span><span class="p">]</span>
<span class="k">if</span> <span class="n">creationDate</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="c1"># This is only accurate because we know the creation date is always in &#39;Z&#39; (zulu) time.</span>
<span class="n">creationDate</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">ping</span><span class="p">[</span><span class="s2">&quot;created&quot;</span><span class="p">],</span> <span class="s2">&quot;%Y-%m-</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">creationDate</span><span class="o">.</span><span class="n">year</span> <span class="o">&lt;</span> <span class="mi">2000</span><span class="p">:</span>
<span class="n">creationDate</span> <span class="o">=</span> <span class="bp">None</span>
<span class="k">except</span> <span class="ne">ValueError</span><span class="p">:</span>
<span class="n">creationDate</span> <span class="o">=</span> <span class="bp">None</span>
<span class="n">appVersion</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;meta/appVersion&quot;</span><span class="p">]</span>
<span class="n">buildId</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;meta/appBuildId&quot;</span><span class="p">]</span>
<span class="n">locale</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;locale&quot;</span><span class="p">]</span>
<span class="n">os</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;os&quot;</span><span class="p">]</span>
<span class="n">osVersion</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;osversion&quot;</span><span class="p">]</span>
<span class="n">device</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;device&quot;</span><span class="p">]</span>
<span class="n">arch</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;arch&quot;</span><span class="p">]</span>
<span class="n">defaultSearch</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;defaultSearch&quot;</span><span class="p">]</span>
<span class="n">distributionId</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;distributionId&quot;</span><span class="p">]</span>
<span class="n">experiments</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;experiments&quot;</span><span class="p">]</span>
<span class="k">if</span> <span class="n">experiments</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">experiments</span> <span class="o">=</span> <span class="p">[]</span>
<span class="c1">#bug 1315028</span>
<span class="n">defaultNewTabExperience</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;defaultNewTabExperience&quot;</span><span class="p">]</span>
<span class="n">defaultMailClient</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;defaultMailClient&quot;</span><span class="p">]</span>
<span class="c1">#bug 1307419</span>
<span class="n">searches</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;searches&quot;</span><span class="p">]</span>
<span class="n">durations</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;durations&quot;</span><span class="p">]</span>
<span class="n">sessions</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="s2">&quot;sessions&quot;</span><span class="p">]</span>
<span class="k">return</span> <span class="p">[</span><span class="n">clientId</span><span class="p">,</span> <span class="n">submissionDate</span><span class="p">,</span> <span class="n">creationDate</span><span class="p">,</span> <span class="n">profileDate</span><span class="p">,</span> <span class="n">geoCountry</span><span class="p">,</span> <span class="n">locale</span><span class="p">,</span> <span class="n">os</span><span class="p">,</span>
<span class="n">osVersion</span><span class="p">,</span> <span class="n">buildId</span><span class="p">,</span> <span class="n">appVersion</span><span class="p">,</span> <span class="n">device</span><span class="p">,</span> <span class="n">arch</span><span class="p">,</span> <span class="n">defaultSearch</span><span class="p">,</span> <span class="n">distributionId</span><span class="p">,</span>
<span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">experiments</span><span class="p">),</span> <span class="n">defaultNewTabExperience</span><span class="p">,</span> <span class="n">defaultMailClient</span><span class="p">,</span> <span class="n">searches</span><span class="p">,</span>
<span class="n">durations</span><span class="p">,</span> <span class="n">sessions</span><span class="p">]</span>
</pre></div>
<p>Create a set of pings from &ldquo;core&rdquo; to build a set of core client data. Output the data to CSV or Parquet.</p>
<p>This script is designed to loop over a range of days and output a single day for the given channels. Use explicit date ranges for backfilling, or now() - &lsquo;1day&rsquo; for automated runs.</p>
<div class="codehilite"><pre><span></span><span class="n">channels</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;nightly&quot;</span><span class="p">,</span> <span class="s2">&quot;aurora&quot;</span><span class="p">,</span> <span class="s2">&quot;beta&quot;</span><span class="p">,</span> <span class="s2">&quot;release&quot;</span><span class="p">]</span>
<span class="n">batch_date</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;date&#39;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">batch_date</span><span class="p">:</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">batch_date</span><span class="p">,</span> <span class="s1">&#39;%Y%m</span><span class="si">%d</span><span class="s1">&#39;</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span> <span class="o">-</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="n">end</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span> <span class="o">-</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="n">day</span> <span class="o">=</span> <span class="n">start</span>
<span class="k">while</span> <span class="n">day</span> <span class="o">&lt;=</span> <span class="n">end</span><span class="p">:</span>
<span class="k">for</span> <span class="n">channel</span> <span class="ow">in</span> <span class="n">channels</span><span class="p">:</span>
<span class="k">print</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">channel: &quot;</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">&quot;, date: &quot;</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span>
<span class="n">doc_type</span><span class="o">=</span><span class="s2">&quot;core&quot;</span><span class="p">,</span>
<span class="n">submission_date</span><span class="o">=</span><span class="p">(</span><span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">),</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)),</span>
<span class="n">channel</span><span class="o">=</span><span class="n">channel</span><span class="p">,</span>
<span class="n">app</span><span class="o">=</span><span class="s2">&quot;Fennec&quot;</span><span class="p">,</span>
<span class="n">fraction</span><span class="o">=</span><span class="mi">1</span>
<span class="p">)</span>
<span class="c1"># Grab all available source_version pings</span>
<span class="n">pings</span> <span class="o">=</span> <span class="n">get_pings</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">source_version</span><span class="o">=</span><span class="s2">&quot;*&quot;</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;meta/clientId&quot;</span><span class="p">,</span>
<span class="s2">&quot;meta/documentId&quot;</span><span class="p">,</span>
<span class="s2">&quot;meta/submissionDate&quot;</span><span class="p">,</span>
<span class="s2">&quot;meta/appVersion&quot;</span><span class="p">,</span>
<span class="s2">&quot;meta/appBuildId&quot;</span><span class="p">,</span>
<span class="s2">&quot;meta/geoCountry&quot;</span><span class="p">,</span>
<span class="s2">&quot;locale&quot;</span><span class="p">,</span>
<span class="s2">&quot;os&quot;</span><span class="p">,</span>
<span class="s2">&quot;osversion&quot;</span><span class="p">,</span>
<span class="s2">&quot;device&quot;</span><span class="p">,</span>
<span class="s2">&quot;arch&quot;</span><span class="p">,</span>
<span class="s2">&quot;profileDate&quot;</span><span class="p">,</span>
<span class="s2">&quot;created&quot;</span><span class="p">,</span>
<span class="s2">&quot;defaultSearch&quot;</span><span class="p">,</span>
<span class="s2">&quot;distributionId&quot;</span><span class="p">,</span>
<span class="s2">&quot;experiments&quot;</span><span class="p">,</span>
<span class="s2">&quot;defaultNewTabExperience&quot;</span><span class="p">,</span>
<span class="s2">&quot;defaultMailClient&quot;</span><span class="p">,</span>
<span class="s2">&quot;searches&quot;</span><span class="p">,</span>
<span class="s2">&quot;durations&quot;</span><span class="p">,</span>
<span class="s2">&quot;sessions&quot;</span><span class="p">])</span>
<span class="n">subset</span> <span class="o">=</span> <span class="n">dedupe_pings</span><span class="p">(</span><span class="n">subset</span><span class="p">)</span>
<span class="k">print</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">De-duped pings:&quot;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">subset</span><span class="o">.</span><span class="n">count</span><span class="p">())</span>
<span class="k">print</span> <span class="n">subset</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">transformed</span> <span class="o">=</span> <span class="n">subset</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">transform</span><span class="p">)</span>
<span class="k">print</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">Transformed pings:&quot;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">transformed</span><span class="o">.</span><span class="n">count</span><span class="p">())</span>
<span class="k">print</span> <span class="n">transformed</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="n">s3_output</span> <span class="o">=</span> <span class="s2">&quot;s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mobile/mobile_clients&quot;</span>
<span class="n">s3_output</span> <span class="o">+=</span> <span class="s2">&quot;/v2/channel=&quot;</span> <span class="o">+</span> <span class="n">channel</span> <span class="o">+</span> <span class="s2">&quot;/submission=&quot;</span> <span class="o">+</span> <span class="n">day</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;clientid&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;submissiondate&quot;</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">False</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;creationdate&quot;</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;profiledate&quot;</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;geocountry&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;locale&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;os&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;osversion&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;buildid&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;appversion&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;device&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;arch&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;defaultsearch&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;distributionid&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;experiments&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;default_new_tab_experience&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;default_mail_client&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;searches&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;durations&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">),</span>
<span class="n">StructField</span><span class="p">(</span><span class="s2">&quot;sessions&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">)</span>
<span class="p">])</span>
<span class="c1"># Make parquet partition file size large, but not too large for s3 to handle</span>
<span class="n">coalesce</span> <span class="o">=</span> <span class="mi">1</span>
<span class="k">if</span> <span class="n">channel</span> <span class="o">==</span> <span class="s2">&quot;release&quot;</span><span class="p">:</span>
<span class="n">coalesce</span> <span class="o">=</span> <span class="mi">4</span>
<span class="n">grouped</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">transformed</span><span class="p">,</span> <span class="n">schema</span><span class="p">)</span>
<span class="n">grouped</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">coalesce</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="s1">&#39;overwrite&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">s3_output</span><span class="p">)</span>
<span class="n">day</span> <span class="o">+=</span> <span class="n">dt</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
<!-- Comment area: a heading with the comment count, the comment input box, and the
     "Post Comment" button (the page script looks both controls up by id). -->
<div class="container-fluid">
  <div class="row">
    <div class="col-md-12">
      <h2> 0 Comments </h2>
      <br>
      <div>
        <div class="row">
          <div class="col-md-12">
            <textarea class="form-control" id="comment-text" style="height:87px;" placeholder="Leave a comment..." aria-label="Comment"></textarea>
          </div>
        </div>
      </div>
      <br>
      <div>
        <button type="button" class="btn btn-primary" id="post_comment_btn">Post Comment</button>
      </div>
    </div>
  </div>
  <br>
  <div class="row">
    <div class="col-md-12">
    </div>
    <br>
  </div>
</div>
</div>
</div>
<!-- Page footer: credits Knowledge Repo (linking the project and its v0.7.4 release tag).
     The "Last indexed" / "Last checked" values below are literal text in this markup. -->
<div class="footer">
Served with <span class="glyphicon glyphicon-heart"></span> by <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a> <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br />
<i title="Last checked for updates: 4 minutes ago">Last indexed: 14 hours ago</i>
</div>
<script type='text/javascript'>
// Search box wiring:
//  - typeahead suggestions fetched from /ajax/index/typeahead
//  - selecting a suggestion opens that post
//  - pressing Enter opens the feed filtered by the typed text
// Only touch the caret when the search box exists; an unguarded [0] dereference
// would throw and abort everything below.
if ($("#searchbar").length) {
  $("#searchbar")[0].setSelectionRange(1000, 1000);
}
$('#searchbar').typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: 'knowledge_posts',
  limit: 10,
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
      'Unable to find any posts that match the current query' +
      '</div>'
    ),
    suggestion: function(data) {
      // Titles and authors come back from the server; escape them so markup
      // characters in a post title are displayed instead of being parsed as HTML.
      var escapeHtml = function (value) {
        return $('<div>').text(String(value)).html();
      };
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + escapeHtml(data.title) + '</strong> – ' + escapeHtml(data.author) + '</p>';
    }
  },
  source: function(q, sync, async) {
    // Encode the query so characters such as "&", "#" or "+" stay part of the
    // search term instead of being read as URL syntax.
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
      {
        success: function(data, status){ async(JSON.parse(data)); }
      });
  }
});
// Picking a suggestion navigates straight to that post.
$('#searchbar').bind('typeahead:select', function(obj, datum, name) {
  window.location = '/post/' + encodeURIComponent(datum.path);
});
// Enter (key code 13) runs a feed search with the current text.
$('#searchbar').keypress(function(event){
  var keycode = (event.keyCode ? event.keyCode : event.which);
  if (keycode == 13) {
    window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
  }
});
// Widen the suggestion menu to the search box width plus the menu's own outer width.
var padding = $('.tt-menu').outerWidth();
$('.tt-menu').width($('#searchbar').width() + padding + "px");
</script>
<script src="/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// When the DOM is ready: start the tooltip helper and wire the three buttons
// that switch between the rendered, raw and web-editor views of this post.
$(document).ready(function () {
  var post_path = "etl/mobile-clients.kp";
  var encoded_path = encodeURI(post_path);

  // Arguments, in order: is_webeditor, post_id, id, data_repo_github_root
  tooltipsJx.initializeTooltips(false, "11", "None", "");

  // Returns a click handler that sends the browser to the given URL.
  var navigateTo = function (url) {
    return function () {
      document.location.href = url;
    };
  };

  $(".btn-rendered").on("click", navigateTo("/post/" + encoded_path));
  $(".btn-raw").on("click", navigateTo("/post/" + encoded_path + "?render=raw"));
  $(".btn-webeditor").on("click", navigateTo("/edit/" + encoded_path));
});
</script>
<script src="/static/js/helpers.js"></script>
<script src="/static/js/tags.js" type="text/javascript"></script>
<script src="/static/js/icons.js" type="text/javascript"></script>
<script src="/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration for this page.
MathJax.Hub.Config({
// Load the tex2jax extension (settings for it are in the tex2jax section below).
extensions: ["tex2jax.js"],
// TeX as the input format, HTML-CSS as the output format.
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math is delimited by $...$ or \(...\)
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Display math is delimited by $$...$$ or \[...\]
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// NOTE(review): per the MathJax tex2jax docs this should let \$ stand for a literal dollar sign; confirm
processEscapes: true
},
// Restrict the HTML-CSS output to the "TeX" font set.
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in 2017; load a pinned MathJax 2.7.x build from cdnjs instead. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js"></script>
<script>
// Page start-up: highlight the "rendered" view button, linkify headers, and wire
// the comment post/delete buttons and the tag subscription buttons.
// Registered with .ready() rather than .on('ready', ...): binding the 'ready'
// event was deprecated in jQuery 1.8 and removed in 3.0, where it never fires.
$(document).ready(function(){
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");
  // Initialize headers
  helpersJx.linkifyHeaders();
  // Initialize comments
  var post_path = 'etl/mobile-clients.kp';
  $("#post_comment_btn").on('click', function(){
    var comment_author = 'knowledge_default';
    var post_author = 'Frank Bertsch';
    var post_title = 'Mobile Clients ETL Job';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    location.reload();
  });
  // Every element whose id starts with "delete_comment" is a delete button;
  // the comment id is the part of that element id after the "__" separator.
  $("[id^=delete_comment]").each(function(){
    var button = this;
    $(button).on("click", function(){
      var comment_id = button.id.split("__")[1];
      if (comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });
  // Delegated on <body> so subscription buttons that are inserted later
  // (they live inside popover content) are handled as well.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener(this);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function(index, value){
  $.each(value, function(i, v){
    var inner_html = v.innerHTML
    // Anchor name: drop everything except letters, hyphens and spaces, then
    // lower-case it and join the words with hyphens. Declared with var so it
    // no longer leaks as an implicit global.
    var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "")
    var inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-")
    // Wrap the existing header content in a link to its own anchor.
    v.innerHTML = "<a href='" + inner_link + "' class='link-reset'>" + inner_html + "</a>"
  })
})
//turn all the tags into links, similar to what's done on the feed page
// `tags`, `tags_list` and `subscriptions_list` stay top-level vars because the
// tag-edit click handler further down in this script reads them.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['mobile', 'etl']
var subscriptions_list = []
// Without the tags container there is nothing to build; skipping here avoids a
// TypeError that would abort the rest of this script.
if (tags) {
  $.each(tags_list, function(i, tag){
    // Per-tag working values are declared with var so they do not leak as globals.
    var ahref = document.createElement("a")
    var e_tag = encodeURIComponent(tag)
    // NOTE(review): replace() with a string pattern changes only the first "/";
    // confirm whether tags with several slashes need all of them converted.
    var f_tag = tag.replace("/", "__")
    var tag_name = "#" + tag
    var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
    // Bootstrap popover settings, read from data-* attributes.
    ahref.setAttribute("data-container", "body")
    ahref.setAttribute("data-toggle", "popover")
    ahref.setAttribute("data-placement", "bottom")
    ahref.setAttribute("data-html", "true")
    ahref.setAttribute("data-tag-name", f_tag)
    // Popover body: an Unsubscribe button for subscribed tags, Subscribe otherwise.
    if (subscriptions_list.indexOf(tag) >= 0) {
      ahref.setAttribute("class", "label label-subscribed pop")
      ahref.setAttribute("data-content", "<div class='content'>" +
        " <button class='btn btn-small btn-primary btn-unsubscribe'" +
        " title='' " +
        " id='" + tag_subscription_button_id_name + "'> " +
        " <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
        " </button> " +
        " </div>")
    } else {
      ahref.setAttribute("class", "label label-unsubscribed pop")
      ahref.setAttribute("data-content", "<div class='content'>" +
        " <button class='btn btn-small btn-default btn-subscribe'" +
        " title='' " +
        " id='" + tag_subscription_button_id_name + "'> " +
        " <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
        " </button> " +
        " </div>")
    }
    ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
    ahref.setAttribute("style", "font-weight:normal")
    // The first tag is preceded by the bold "Tags: " caption.
    if (i == 0){
      ahref.innerHTML = " "
      var colon = document.createElement("text")
      colon.innerHTML = "<b>Tags</b>: "
      tags.appendChild(colon)
    }
    ahref.innerHTML = ahref.innerHTML + tag_name
    tags.appendChild(ahref)
    // Comma separator after every tag except the last.
    if (i != tags_list.length - 1){
      var comma = document.createElement("text")
      comma.innerText = ", "
      tags.appendChild(comma)
    }
  })
  // Remove the node that follows the tags container, when there is one.
  if (tags.nextSibling) {
    tags.nextSibling.remove()
  }
  // Append the edit icon; the click handler bound below looks it up by id.
  tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
}
// Hover-driven popovers for the tag labels. The popovers use the "manual"
// trigger, so showing and hiding is done here from mouse events.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on("mouseenter", function () {
    var label = this;
    $(label).popover("show");
    // Hide again once the pointer leaves the popover itself.
    $(".popover").on("mouseleave", function () {
      $(label).popover('hide');
    });
  })
  .on("mouseleave", function () {
    var label = this;
    // Wait 300 ms before hiding, and skip the hide if the pointer is over a popover by then.
    setTimeout(function () {
      if ($(".popover:hover").length === 0) {
        $(label).popover("hide");
      }
    }, 300);
  });
// Click handler for the edit-tags icon (#tooltip-edit_tags): hides the icon and
// replaces the contents of the tags element with an inline form made of a
// "Tags: " caption, a text input pre-filled with the comma-joined tags, and a save icon.
// NOTE(review): previousSibling, tags_string, form, input, tags_text and icon_class
// are assigned without var in this first part, so they become implicit globals.
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Replace whatever the tags element held with a single space before adding the form.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate each comma-separated tag, then POST the raw input string
// to /tag_list for this post and reload the page.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
tags_list = tags_string.split(",")
// Allowed tag characters: letters, digits, "-", "_", ":" and "/" (case-insensitive).
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// An empty piece means two commas in a row or a leading/trailing comma.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
// Surrounding whitespace is ignored for validation only; the POST below sends the untrimmed string.
tag_name = tag.trim()
if (!(re.test(tag_name))){
// NOTE(review): the message says "alphanumeric" but the pattern above also accepts - _ : and /
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// async: false requests a synchronous call, so the reload below is reached only after it returns.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/mobile-clients.kp',
async: false
});
location.reload()
}
})
// NOTE(review): everything from here to the end is still inside the click handler
// opened at the top, so it runs on every click of the edit icon. It looks like a
// second version of the tag editor nested into the first one; confirm intent.
tags.nextSibling.remove()
// Allow user to edit tags
// Insert an icon built by iconsJx.createEditTagsIcon() right after the tags element.
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Bind a further click handler on #tooltip-edit_tags that builds the same kind of
// form with jQuery and saves through tagsJx.changeAndSaveTags.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// NOTE(review): tags_text is a jQuery object here, so this sets a plain property on
// the wrapper rather than on the element; the .html("Tags: ") call above already set the text.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter (key code 13) in the input saves the tags; returning false cancels the key's default action.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/mobile-clients.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves the tags.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/mobile-clients.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Form submission saves the tags too; returning false cancels the native submit.
// The "form" selector matches every form on the page, not only the one built above.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/mobile-clients.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>
</body>
</html>

Просмотреть файл

@ -0,0 +1,13 @@
{
"title": "Mobile Clients ETL Job",
"authors": [
"Frank Bertsch"
],
"tags": [
"mobile",
"etl"
],
"publish_date": "2017-02-17",
"updated_at": "2017-02-17",
"tldr": "This job basically just takes core pings and puts them in parquet format."
}

816
etl/sync_log.kp/index.html Normal file
Просмотреть файл

@ -0,0 +1,816 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>Bug 1291340 - Import sync log data</h1>
<span class="authors"><a href="/feed?authors=mreid-moz">mreid-moz</a></span>
<span class="date_created">November 15, 2016</span>
<span class="date_updated">(Last Updated: July 10, 2017)</span>
<span class="tldr"><p>Read, convert, and store sync log data to Parquet form per <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1291340">bug 1291340</a>.</p></span>
</div>
<h2 id="bug-1291340-import-sync-log-data">Bug 1291340 - Import sync log data</h2>
<p>Read, convert, and store sync log data to Parquet form per <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1291340">bug 1291340</a>.</p>
<p>Conversion code is ported from the <a href="https://github.com/dannycoates/smt">smt repo</a>.</p>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span> <span class="k">as</span> <span class="n">dt</span><span class="p">,</span> <span class="n">timedelta</span><span class="p">,</span> <span class="n">date</span>
<span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="kn">from</span> <span class="nn">os</span> <span class="kn">import</span> <span class="n">environ</span>
<span class="c1"># Determine run parameters</span>
<span class="n">source_bucket</span> <span class="o">=</span> <span class="s1">'net-mozaws-prod-us-west-2-pipeline-analysis'</span>
<span class="n">dest_bucket</span> <span class="o">=</span> <span class="n">source_bucket</span>
<span class="n">dest_s3_prefix</span> <span class="o">=</span> <span class="s2">"s3://{}/mreid"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">dest_bucket</span><span class="p">)</span>
<span class="k">if</span> <span class="s2">"bucket"</span> <span class="ow">in</span> <span class="n">environ</span><span class="p">:</span>
<span class="n">dest_bucket</span> <span class="o">=</span> <span class="n">environ</span><span class="p">[</span><span class="s2">"bucket"</span><span class="p">]</span>
<span class="n">dest_s3_prefix</span> <span class="o">=</span> <span class="s2">"s3://{}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">dest_bucket</span><span class="p">)</span>
<span class="n">yesterday</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">dt</span><span class="o">.</span><span class="n">utcnow</span><span class="p">()</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">),</span> <span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="c1"># Default to running for "yesterday" unless we've been given a</span>
<span class="c1"># specific date via the environment.</span>
<span class="n">target_day</span> <span class="o">=</span> <span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"date"</span><span class="p">,</span> <span class="n">yesterday</span><span class="p">)</span>
<span class="k">print</span> <span class="s2">"Running import for {}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">target_day</span><span class="p">)</span>
</pre></div>
<h3 id="read-the-source-log-data">Read the source log data</h3>
<p>The sync data on S3 is stored in framed heka format, and is read using the <code>Dataset</code> API.</p>
<div class="codehilite"><pre><span></span><span class="c1"># Read the source data</span>
<span class="n">schema</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">target_prefix</span> <span class="o">=</span> <span class="s1">'sync-metrics/data'</span>
<span class="n">sync</span> <span class="o">=</span> <span class="n">Dataset</span><span class="p">(</span><span class="n">source_bucket</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">prefix</span><span class="o">=</span><span class="n">target_prefix</span><span class="p">)</span>
<span class="c1"># The sync data on S3 does not have a proper "date" dimension, but the date is encoded </span>
<span class="c1"># in the key names themselves.</span>
<span class="c1"># Fetch the summaries and filter the list to the target day.</span>
<span class="n">summary_prefix</span> <span class="o">=</span> <span class="s2">"{}/{}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">target_prefix</span><span class="p">,</span> <span class="n">target_day</span><span class="p">)</span>
<span class="n">sync_summaries</span> <span class="o">=</span> <span class="p">[</span> <span class="n">s</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">sync</span><span class="o">.</span><span class="n">summaries</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span> <span class="k">if</span> <span class="n">s</span><span class="p">[</span><span class="s1">'key'</span><span class="p">]</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="n">summary_prefix</span><span class="p">)</span> <span class="p">]</span>
</pre></div>
<h3 id="custom-heka-decoder">Custom heka decoder</h3>
<p>The standard heka decoder assumes (based on Telemetry data) that all fields whose names have a <code>.</code> in them contain nested json strings. This is not true for sync log messages, which have fields such as <code>syncstorage.storage.sql.db.execute</code> with simple scalar values.</p>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">ssl</span>
<span class="kn">from</span> <span class="nn">moztelemetry.heka.message_parser</span> <span class="kn">import</span> <span class="n">unpack</span>
<span class="c1"># Custom decoder for sync messages since we can have scalar fields with dots in their names.</span>
<span class="k">def</span> <span class="nf">sync_decoder</span><span class="p">(</span><span class="n">message</span><span class="p">):</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">for</span> <span class="n">record</span><span class="p">,</span> <span class="n">total_bytes</span> <span class="ow">in</span> <span class="n">unpack</span><span class="p">(</span><span class="n">message</span><span class="p">):</span>
<span class="n">result</span> <span class="o">=</span> <span class="p">{}</span>
<span class="n">result</span><span class="p">[</span><span class="s2">"meta"</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">"Timestamp"</span><span class="p">:</span> <span class="n">record</span><span class="o">.</span><span class="n">message</span><span class="o">.</span><span class="n">timestamp</span><span class="p">,</span>
<span class="s2">"Type"</span><span class="p">:</span> <span class="n">record</span><span class="o">.</span><span class="n">message</span><span class="o">.</span><span class="n">type</span><span class="p">,</span>
<span class="s2">"Hostname"</span><span class="p">:</span> <span class="n">record</span><span class="o">.</span><span class="n">message</span><span class="o">.</span><span class="n">hostname</span><span class="p">,</span>
<span class="p">}</span>
<span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">record</span><span class="o">.</span><span class="n">message</span><span class="o">.</span><span class="n">fields</span><span class="p">:</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">field</span><span class="o">.</span><span class="n">name</span>
<span class="n">value</span> <span class="o">=</span> <span class="n">field</span><span class="o">.</span><span class="n">value_string</span>
<span class="k">if</span> <span class="n">field</span><span class="o">.</span><span class="n">value_type</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="c1"># TODO: handle bytes in a way that doesn't cause problems with JSON</span>
<span class="c1"># value = field.value_bytes</span>
<span class="k">continue</span>
<span class="k">elif</span> <span class="n">field</span><span class="o">.</span><span class="n">value_type</span> <span class="o">==</span> <span class="mi">2</span><span class="p">:</span>
<span class="n">value</span> <span class="o">=</span> <span class="n">field</span><span class="o">.</span><span class="n">value_integer</span>
<span class="k">elif</span> <span class="n">field</span><span class="o">.</span><span class="n">value_type</span> <span class="o">==</span> <span class="mi">3</span><span class="p">:</span>
<span class="n">value</span> <span class="o">=</span> <span class="n">field</span><span class="o">.</span><span class="n">value_double</span>
<span class="k">elif</span> <span class="n">field</span><span class="o">.</span><span class="n">value_type</span> <span class="o">==</span> <span class="mi">4</span><span class="p">:</span>
<span class="n">value</span> <span class="o">=</span> <span class="n">field</span><span class="o">.</span><span class="n">value_bool</span>
<span class="n">result</span><span class="p">[</span><span class="n">name</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> <span class="k">else</span> <span class="s2">""</span>
<span class="k">yield</span> <span class="n">result</span>
<span class="k">except</span> <span class="n">ssl</span><span class="o">.</span><span class="n">SSLError</span><span class="p">:</span>
<span class="k">pass</span> <span class="c1"># https://github.com/boto/boto/issues/2830</span>
<span class="n">sync_records</span> <span class="o">=</span> <span class="n">sync</span><span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">decode</span><span class="o">=</span><span class="n">sync_decoder</span><span class="p">,</span> <span class="n">summaries</span><span class="o">=</span><span class="n">sync_summaries</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1"># What do the records look like?</span>
<span class="c1"># Example heka message:</span>
<span class="c1">#Timestamp: 2016-10-28 15:11:45.98653696 -0300 ADT</span>
<span class="c1">#Type: mozsvc.metrics</span>
<span class="c1">#Hostname: ip-172-31-39-11</span>
<span class="c1">#Pid: 11383</span>
<span class="c1">#UUID: 155866c8-cc58-4048-a58c-6226c620fc57</span>
<span class="c1">#Logger: Sync-1_5</span>
<span class="c1">#Payload:</span>
<span class="c1">#EnvVersion: 1</span>
<span class="c1">#Severity: 7</span>
<span class="c1">#Fields: [name:"remoteAddressChain" representation:"" value_string:"" value_string:"" </span>
<span class="c1"># name:"path" value_string:"https://host/ver/somenum/storage/tabs" </span>
<span class="c1"># name:"fxa_uid" value_string:"some_id" </span>
<span class="c1"># name:"user_agent_version" value_type:DOUBLE value_double:49 </span>
<span class="c1"># name:"user_agent_os" value_string:"Windows 7" </span>
<span class="c1"># name:"device_id" value_string:"some_device_id" </span>
<span class="c1"># name:"method" value_string:"POST" </span>
<span class="c1"># name:"user_agent_browser" value_string:"Firefox" </span>
<span class="c1"># name:"name" value_string:"mozsvc.metrics" </span>
<span class="c1"># name:"request_time" value_type:DOUBLE value_double:0.003030061721801758 </span>
<span class="c1"># name:"code" value_type:DOUBLE value_double:200 </span>
<span class="c1"># ]</span>
<span class="c1"># Example record:</span>
<span class="c1">#sync_records.first()</span>
<span class="c1"># {u'code': 200.0,</span>
<span class="c1"># u'device_id': u'some_device_id',</span>
<span class="c1"># u'fxa_uid': u'some_id',</span>
<span class="c1"># 'meta': {'Hostname': u'ip-172-31-39-11',</span>
<span class="c1"># 'Timestamp': 1477678305976742912L,</span>
<span class="c1"># 'Type': u'mozsvc.metrics'},</span>
<span class="c1"># u'method': u'GET',</span>
<span class="c1"># u'name': u'mozsvc.metrics',</span>
<span class="c1"># u'path': u'https://host/ver/somenum/storage/crypto/keys',</span>
<span class="c1"># u'remoteAddressChain': u'',</span>
<span class="c1"># u'request_time': 0.017612934112548828,</span>
<span class="c1"># u'syncstorage.storage.sql.db.execute': 0.014925241470336914,</span>
<span class="c1"># u'syncstorage.storage.sql.pool.get': 5.221366882324219e-05,</span>
<span class="c1"># u'user_agent_browser': u'Firefox',</span>
<span class="c1"># u'user_agent_os': u'Windows 7',</span>
<span class="c1"># u'user_agent_version': 49.0}</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1"># Convert data. Code ported from https://github.com/dannycoates/smt</span>
<span class="kn">import</span> <span class="nn">re</span>
<span class="kn">import</span> <span class="nn">hashlib</span>
<span class="kn">import</span> <span class="nn">math</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Row</span>
<span class="k">def</span> <span class="nf">sha_prefix</span><span class="p">(</span><span class="n">v</span><span class="p">):</span>
<span class="n">h</span> <span class="o">=</span> <span class="n">hashlib</span><span class="o">.</span><span class="n">sha256</span><span class="p">()</span>
<span class="n">h</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">v</span><span class="p">)</span>
<span class="k">return</span> <span class="n">h</span><span class="o">.</span><span class="n">hexdigest</span><span class="p">()[</span><span class="mi">0</span><span class="p">:</span><span class="mi">32</span><span class="p">]</span>
<span class="n">path_uid</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="s2">"(\d+)\/storage\/"</span><span class="p">)</span>
<span class="n">path_bucket</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="s2">"\d+\/storage\/(\w+)"</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getUid</span><span class="p">(</span><span class="n">path</span><span class="p">):</span>
<span class="k">if</span> <span class="n">path</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">None</span>
<span class="n">match</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">path_uid</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="k">if</span> <span class="n">match</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">uid</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="k">return</span> <span class="n">sha_prefix</span><span class="p">(</span><span class="n">uid</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">None</span>
<span class="k">def</span> <span class="nf">deriveDeviceId</span><span class="p">(</span><span class="n">uid</span><span class="p">,</span> <span class="n">agent</span><span class="p">):</span>
<span class="k">if</span> <span class="n">uid</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">None</span>
<span class="k">return</span> <span class="n">sha_prefix</span><span class="p">(</span><span class="s2">"{}{}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">uid</span><span class="p">,</span> <span class="n">agent</span><span class="p">))</span>
<span class="n">SyncRow</span> <span class="o">=</span> <span class="n">Row</span><span class="p">(</span><span class="s2">"uid"</span><span class="p">,</span> <span class="s2">"s_uid"</span><span class="p">,</span> <span class="s2">"dev"</span><span class="p">,</span> <span class="s2">"s_dev"</span><span class="p">,</span> <span class="s2">"ts"</span><span class="p">,</span> <span class="s2">"method"</span><span class="p">,</span> <span class="s2">"code"</span><span class="p">,</span>
<span class="s2">"bucket"</span><span class="p">,</span> <span class="s2">"t"</span><span class="p">,</span> <span class="s2">"ua_browser"</span><span class="p">,</span> <span class="s2">"ua_version"</span><span class="p">,</span> <span class="s2">"ua_os"</span><span class="p">,</span> <span class="s2">"host"</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">convert</span><span class="p">(</span><span class="n">msg</span><span class="p">):</span>
<span class="n">bmatch</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">path_bucket</span><span class="p">,</span> <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"path"</span><span class="p">,</span> <span class="s2">""</span><span class="p">))</span>
<span class="k">if</span> <span class="n">bmatch</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">None</span>
<span class="n">bucket</span> <span class="o">=</span> <span class="n">bmatch</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="n">uid</span> <span class="o">=</span> <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"fxa_uid"</span><span class="p">)</span>
<span class="n">synth_uid</span> <span class="o">=</span> <span class="n">getUid</span><span class="p">(</span><span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"path"</span><span class="p">))</span>
<span class="n">dev</span> <span class="o">=</span> <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"device_id"</span><span class="p">)</span>
<span class="n">synth_dev</span> <span class="o">=</span> <span class="n">deriveDeviceId</span><span class="p">(</span><span class="n">synth_uid</span><span class="p">,</span>
<span class="s2">"{}{}{}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"user_agent_browser"</span><span class="p">,</span> <span class="s2">""</span><span class="p">),</span>
<span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"user_agent_version"</span><span class="p">,</span> <span class="s2">""</span><span class="p">),</span>
<span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"user_agent_os"</span><span class="p">,</span> <span class="s2">""</span><span class="p">))</span>
<span class="p">)</span>
<span class="n">code</span> <span class="o">=</span> <span class="mi">200</span>
<span class="c1"># support modern mozlog's use of errno for http status</span>
<span class="n">errno</span> <span class="o">=</span> <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"errno"</span><span class="p">)</span>
<span class="k">if</span> <span class="n">errno</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">if</span> <span class="n">errno</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> <span class="c1"># success</span>
<span class="n">code</span> <span class="o">=</span> <span class="mi">200</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">code</span> <span class="o">=</span> <span class="n">errno</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">code</span> <span class="o">=</span> <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"code"</span><span class="p">)</span>
<span class="k">if</span> <span class="n">code</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">code</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">code</span><span class="p">)</span>
<span class="n">t</span> <span class="o">=</span> <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"t"</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span>
<span class="k">if</span> <span class="n">t</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">t</span> <span class="o">=</span> <span class="n">math</span><span class="o">.</span><span class="n">floor</span><span class="p">(</span><span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"request_time"</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> <span class="o">*</span> <span class="mi">1000</span><span class="p">)</span>
<span class="k">if</span> <span class="n">t</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">t</span> <span class="o">=</span> <span class="mi">0</span>
<span class="n">converted</span> <span class="o">=</span> <span class="n">SyncRow</span><span class="p">(</span>
<span class="p">(</span><span class="n">uid</span> <span class="ow">or</span> <span class="n">synth_uid</span><span class="p">),</span>
<span class="n">synth_uid</span><span class="p">,</span>
<span class="p">(</span><span class="n">dev</span> <span class="ow">or</span> <span class="n">synth_dev</span><span class="p">),</span>
<span class="n">synth_dev</span><span class="p">,</span>
<span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"meta"</span><span class="p">)</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"Timestamp"</span><span class="p">),</span>
<span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"method"</span><span class="p">),</span>
<span class="n">code</span><span class="p">,</span>
<span class="n">bucket</span><span class="p">,</span>
<span class="n">t</span><span class="p">,</span>
<span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"user_agent_browser"</span><span class="p">),</span>
<span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"user_agent_version"</span><span class="p">),</span>
<span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"user_agent_os"</span><span class="p">),</span>
<span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"meta"</span><span class="p">)</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"Hostname"</span><span class="p">),</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">converted</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">converted</span> <span class="o">=</span> <span class="n">sync_records</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">convert</span><span class="p">(</span><span class="n">x</span><span class="p">))</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">converted</span> <span class="o">=</span> <span class="n">converted</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SQLContext</span>
<span class="n">sync_df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">converted</span><span class="p">)</span>
<span class="n">sync_df</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span>
</pre></div>
<div class="codehilite"><pre><span></span>root
|-- uid: string (nullable = true)
|-- s_uid: string (nullable = true)
|-- dev: string (nullable = true)
|-- s_dev: string (nullable = true)
|-- ts: long (nullable = true)
|-- method: string (nullable = true)
|-- code: long (nullable = true)
|-- bucket: string (nullable = true)
|-- t: double (nullable = true)
|-- ua_browser: string (nullable = true)
|-- ua_version: double (nullable = true)
|-- ua_os: string (nullable = true)
|-- host: string (nullable = true)
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1"># Determine if we need to repartition.</span>
<span class="c1"># A record is something like 112 bytes, so figure out how many partitions</span>
<span class="c1"># we need to end up with reasonably-sized files.</span>
<span class="n">records_per_partition</span> <span class="o">=</span> <span class="mi">2500000</span>
<span class="n">total_records</span> <span class="o">=</span> <span class="n">sync_df</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="k">print</span> <span class="s2">"Found {} sync records"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">total_records</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">math</span>
<span class="n">num_partitions</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">math</span><span class="o">.</span><span class="n">ceil</span><span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">total_records</span><span class="p">)</span> <span class="o">/</span> <span class="n">records_per_partition</span><span class="p">))</span>
<span class="k">if</span> <span class="n">num_partitions</span> <span class="o">!=</span> <span class="n">sync_df</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">():</span>
<span class="k">print</span> <span class="s2">"Repartitioning with {} partitions"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">num_partitions</span><span class="p">)</span>
<span class="n">sync_df</span> <span class="o">=</span> <span class="n">sync_df</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="n">num_partitions</span><span class="p">)</span>
<span class="c1"># Store data</span>
<span class="n">sync_log_s3path</span> <span class="o">=</span> <span class="s2">"{}/sync_log/v1/day={}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">dest_s3_prefix</span><span class="p">,</span> <span class="n">target_day</span><span class="p">)</span>
<span class="n">sync_df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">sync_log_s3path</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"overwrite"</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1"># Transform, compute and store rollups</span>
<span class="n">sync_df</span><span class="o">.</span><span class="n">registerTempTable</span><span class="p">(</span><span class="s2">"sync"</span><span class="p">)</span>
<span class="n">sql_transform</span> <span class="o">=</span> <span class="s1">'''</span>
<span class="s1"> select</span>
<span class="s1"> uid,</span>
<span class="s1"> dev,</span>
<span class="s1"> ts,</span>
<span class="s1"> t,</span>
<span class="s1"> case </span>
<span class="s1"> when substring(ua_os,0,7) in ('iPad', 'iPod', 'iPhone') then 'ios'</span>
<span class="s1"> when substring(ua_os,0,7) = 'Android' then 'android'</span>
<span class="s1"> when substring(ua_os,0,7) = 'Windows' then 'windows'</span>
<span class="s1"> when substring(ua_os,0,7) = 'Macinto' then 'mac'</span>
<span class="s1"> when substring(ua_os,0,7) = 'Linux' then 'linux'</span>
<span class="s1"> when ua_os is null then 'unknown'</span>
<span class="s1"> else 'other'</span>
<span class="s1"> end as ua_os,</span>
<span class="s1"> ua_browser,</span>
<span class="s1"> ua_version,</span>
<span class="s1"> case method when 'POST' then 1 end as posts,</span>
<span class="s1"> case method when 'GET' then 1 end as gets,</span>
<span class="s1"> case method when 'PUT' then 1 end as puts,</span>
<span class="s1"> case method when 'DELETE' then 1 end as dels,</span>
<span class="s1"> case when code &lt; 300 then 1 end as aoks,</span>
<span class="s1"> case when code &gt; 399 and code &lt; 500 then 1 end as oops,</span>
<span class="s1"> case when code &gt; 499 and code &lt; 999 then 1 end as fups,</span>
<span class="s1"> case when bucket = 'clients' and method = 'GET' then 1 end as r_clients,</span>
<span class="s1"> case when bucket = 'crypto' and method = 'GET' then 1 end as r_crypto,</span>
<span class="s1"> case when bucket = 'forms' and method = 'GET' then 1 end as r_forms,</span>
<span class="s1"> case when bucket = 'history' and method = 'GET' then 1 end as r_history,</span>
<span class="s1"> case when bucket = 'keys' and method = 'GET' then 1 end as r_keys,</span>
<span class="s1"> case when bucket = 'meta' and method = 'GET' then 1 end as r_meta,</span>
<span class="s1"> case when bucket = 'bookmarks' and method = 'GET' then 1 end as r_bookmarks,</span>
<span class="s1"> case when bucket = 'prefs' and method = 'GET' then 1 end as r_prefs,</span>
<span class="s1"> case when bucket = 'tabs' and method = 'GET' then 1 end as r_tabs,</span>
<span class="s1"> case when bucket = 'passwords' and method = 'GET' then 1 end as r_passwords,</span>
<span class="s1"> case when bucket = 'addons' and method = 'GET' then 1 end as r_addons,</span>
<span class="s1"> case when bucket = 'clients' and method = 'POST' then 1 end as w_clients,</span>
<span class="s1"> case when bucket = 'crypto' and method = 'POST' then 1 end as w_crypto,</span>
<span class="s1"> case when bucket = 'forms' and method = 'POST' then 1 end as w_forms,</span>
<span class="s1"> case when bucket = 'history' and method = 'POST' then 1 end as w_history,</span>
<span class="s1"> case when bucket = 'keys' and method = 'POST' then 1 end as w_keys,</span>
<span class="s1"> case when bucket = 'meta' and method = 'POST' then 1 end as w_meta,</span>
<span class="s1"> case when bucket = 'bookmarks' and method = 'POST' then 1 end as w_bookmarks,</span>
<span class="s1"> case when bucket = 'prefs' and method = 'POST' then 1 end as w_prefs,</span>
<span class="s1"> case when bucket = 'tabs' and method = 'POST' then 1 end as w_tabs,</span>
<span class="s1"> case when bucket = 'passwords' and method = 'POST' then 1 end as w_passwords,</span>
<span class="s1"> case when bucket = 'addons' and method = 'POST' then 1 end as w_addons</span>
<span class="s1"> from sync</span>
<span class="s1">'''</span>
<span class="n">transformed</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="n">sql_transform</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">transformed</span><span class="o">.</span><span class="n">registerTempTable</span><span class="p">(</span><span class="s2">"tx"</span><span class="p">)</span>
<span class="n">sql_device_activity</span> <span class="o">=</span> <span class="s1">'''</span>
<span class="s1"> select</span>
<span class="s1"> uid,</span>
<span class="s1"> dev,</span>
<span class="s1"> max(ua_os) as ua_os,</span>
<span class="s1"> max(ua_browser) as ua_browser,</span>
<span class="s1"> max(ua_version) as ua_version,</span>
<span class="s1"> min(t) as min_t,</span>
<span class="s1"> max(t) as max_t,</span>
<span class="s1"> sum(posts) as posts,</span>
<span class="s1"> sum(gets) as gets,</span>
<span class="s1"> sum(puts) as puts,</span>
<span class="s1"> sum(dels) as dels,</span>
<span class="s1"> sum(aoks) as aoks,</span>
<span class="s1"> sum(oops) as oops,</span>
<span class="s1"> sum(fups) as fups,</span>
<span class="s1"> sum(r_clients) as r_clients,</span>
<span class="s1"> sum(r_crypto) as r_crypto,</span>
<span class="s1"> sum(r_forms) as r_forms,</span>
<span class="s1"> sum(r_history) as r_history,</span>
<span class="s1"> sum(r_keys) as r_keys,</span>
<span class="s1"> sum(r_meta) as r_meta,</span>
<span class="s1"> sum(r_bookmarks) as r_bookmarks,</span>
<span class="s1"> sum(r_prefs) as r_prefs,</span>
<span class="s1"> sum(r_tabs) as r_tabs,</span>
<span class="s1"> sum(r_passwords) as r_passwords,</span>
<span class="s1"> sum(r_addons) as r_addons,</span>
<span class="s1"> sum(w_clients) as w_clients,</span>
<span class="s1"> sum(w_crypto) as w_crypto,</span>
<span class="s1"> sum(w_forms) as w_forms,</span>
<span class="s1"> sum(w_history) as w_history,</span>
<span class="s1"> sum(w_keys) as w_keys,</span>
<span class="s1"> sum(w_meta) as w_meta,</span>
<span class="s1"> sum(w_bookmarks) as w_bookmarks,</span>
<span class="s1"> sum(w_prefs) as w_prefs,</span>
<span class="s1"> sum(w_tabs) as w_tabs,</span>
<span class="s1"> sum(w_passwords) as w_passwords,</span>
<span class="s1"> sum(w_addons) as w_addons</span>
<span class="s1"> from tx group by uid, dev</span>
<span class="s1">'''</span>
<span class="n">rolled_up</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="n">sql_device_activity</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1"># Store device activity rollups</span>
<span class="n">sync_log_device_activity_s3base</span> <span class="o">=</span> <span class="s2">"{}/sync_log_device_activity/v1"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">dest_s3_prefix</span><span class="p">)</span>
<span class="n">sync_log_device_activity_s3path</span> <span class="o">=</span> <span class="s2">"{}/day={}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">sync_log_device_activity_s3base</span><span class="p">,</span> <span class="n">target_day</span><span class="p">)</span>
<span class="c1"># TODO: Do we need to repartition?</span>
<span class="n">rolled_up</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="mi">5</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">sync_log_device_activity_s3path</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"overwrite"</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">compute_device_counts</span><span class="p">(</span><span class="n">device_activity</span><span class="p">,</span> <span class="n">target_day</span><span class="p">):</span>
<span class="n">device_activity</span><span class="o">.</span><span class="n">registerTempTable</span><span class="p">(</span><span class="s2">"device_activity"</span><span class="p">)</span>
<span class="n">df</span> <span class="o">=</span> <span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span>
<span class="n">last_week_date</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">target_day</span><span class="p">,</span> <span class="n">df</span><span class="p">)</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">7</span><span class="p">)</span>
<span class="n">last_week</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">last_week_date</span><span class="p">,</span> <span class="n">df</span><span class="p">)</span>
<span class="n">sql_device_counts</span> <span class="o">=</span> <span class="s2">"""</span>
<span class="s2"> select</span>
<span class="s2"> uid,</span>
<span class="s2"> count(distinct dev) as devs</span>
<span class="s2"> from</span>
<span class="s2"> (select</span>
<span class="s2"> uid,</span>
<span class="s2"> dev</span>
<span class="s2"> from device_activity</span>
<span class="s2"> where uid in</span>
<span class="s2"> (select distinct(uid) from device_activity where day = '{}')</span>
<span class="s2"> and day &gt; '{}'</span>
<span class="s2"> and day &lt;= '{}')</span>
<span class="s2"> group by uid</span>
<span class="s2"> """</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">target_day</span><span class="p">,</span> <span class="n">last_week</span><span class="p">,</span> <span class="n">target_day</span><span class="p">)</span>
<span class="k">return</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="n">sql_device_counts</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1"># Compute and store device counts</span>
<span class="c1"># Re-read device activity data from S3 so we can look at historic info</span>
<span class="n">device_activity</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">sync_log_device_activity_s3base</span><span class="p">)</span>
<span class="n">device_counts</span> <span class="o">=</span> <span class="n">compute_device_counts</span><span class="p">(</span><span class="n">device_activity</span><span class="p">,</span> <span class="n">target_day</span><span class="p">)</span>
<span class="n">sync_log_device_counts_s3path</span> <span class="o">=</span> <span class="s2">"{}/sync_log_device_counts/v1/day={}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">dest_s3_prefix</span><span class="p">,</span> <span class="n">target_day</span><span class="p">)</span>
<span class="n">device_counts</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">sync_log_device_counts_s3path</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"overwrite"</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
</div></body></html>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
// Search bar: typeahead suggestions backed by /ajax/index/typeahead, plus
// Enter-to-search which navigates to the filtered feed.

// Put the caret at the end of any pre-filled search text.
$("#searchbar")[0].setSelectionRange(1000, 1000);
$('#searchbar').typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: 'knowledge_posts',
  limit: 10,
  // Plain-text value shown in the input once a suggestion is chosen.
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
      'Unable to find any posts that match the current query' +
      '</div>'
    ),
    suggestion: function(data) {
      // Title and author come from the server response; escape them before
      // splicing into markup so a title containing "<" or "&" cannot inject HTML.
      var escapeHtml = function (value) {
        return $('<div>').text(String(value)).html();
      };
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' +
        escapeHtml(data.title) + '</strong> – ' + escapeHtml(data.author) + '</p>';
    }
  },
  source: function(q, sync, async) {
    // Encode the query so characters such as "&", "#" or "+" are sent as part
    // of the search term instead of being parsed as URL syntax.
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
    {
      success: function(data, status){ async(JSON.parse(data)); }
    });
  }
});
// Selecting a suggestion opens the corresponding post.
$('#searchbar').on('typeahead:select', function(obj, datum, name) {
  window.location = '/post/' + encodeURIComponent(datum.path);
});
// Pressing Enter runs a feed search with the typed text as the filter.
$('#searchbar').keypress(function(event){
  var keycode = (event.keyCode ? event.keyCode : event.which);
  if (keycode === 13) {
    window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
  }
});
// Widen the suggestion menu to the search bar's width plus the menu's own outer width.
var padding = $('.tt-menu').outerWidth();
$('.tt-menu').width($('#searchbar').width() + padding + "px");
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Once the DOM is ready: initialise the tooltips and make the three view
// buttons navigate to the rendered, raw and editor views of this post.
$(document).ready(function () {
  var post_path = "etl/sync_log.kp";
  var encoded_path = encodeURI(post_path);
  var is_webeditor = false,
      post_id = "1",
      id = "None",
      data_repo_github_root = "";

  tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);

  // Returns a click handler that sends the browser to `url`.
  var goTo = function (url) {
    return function () {
      document.location.href = url;
    };
  };

  $(".btn-rendered").on("click", goTo("/post/" + encoded_path));
  $(".btn-raw").on("click", goTo("/post/" + encoded_path + "?render=raw"));
  $(".btn-webeditor").on("click", goTo("/edit/" + encoded_path));
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration: TeX input, HTML-CSS output, with the tex2jax
// preprocessor locating math delimiters in the page text.
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Delimiter pairs recognised as inline math and as display math.
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// NOTE(review): per the MathJax tex2jax docs this lets "\$" stand for a
// literal dollar sign, which matters because '$' is an inline delimiter here.
processEscapes: true
},
// Restrict the HTML-CSS output to the TeX fonts.
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in 2017; load the same MathJax 2.x build from cdnjs. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js" type="text/javascript">
</script>
<script>
// Wire up the post page once the DOM is ready: activate the "rendered" button,
// linkify headers, and attach the comment and tag-subscription handlers.
// .ready() is used rather than .on('ready', ...) because the "ready" event
// binding was removed in jQuery 3.0 and never fires if bound after the DOM is ready.
$(document).ready(function(){
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");
  // Initialize headers
  helpersJx.linkifyHeaders();
  // Initialize comments
  var post_path = 'etl/sync_log.kp';
  $("#post_comment_btn").on('click', function(){
    var comment_author = 'knowledge_default';
    var post_author = 'mreid-moz';
    var post_title = 'Bug 1291340 - Import sync log data';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    location.reload();
  });
  // Every element whose id starts with "delete_comment" is a delete button;
  // the comment id is the part of the element id after "__".
  var all_comment_delete_buttons = $("[id^=delete_comment]");
  $.each(all_comment_delete_buttons, function(i, v){
    $(v).on("click", function(){
      var comment_id = v.id.split("__")[1];
      if (comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });
  // Delegated on <body> because the subscribe buttons live inside popovers
  // that are created after this handler is bound.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener(this);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function(index, value){
  $.each(value, function(i, v){
    var inner_html = v.innerHTML
    // Prefer the heading's own id so the link targets an anchor that exists.
    // The text-derived slug drops digits and punctuation, so for a heading such
    // as "Bug 1291340 - ..." it does not match the id the renderer assigned.
    var anchor = v.id
    if (!anchor) {
      anchor = inner_html.replace(/[^a-zA-Z\- ]/g, "").toLowerCase().split(" ").join("-")
    }
    // Build the link through the DOM instead of string concatenation so the
    // href and class attributes are always correctly quoted.
    var link = document.createElement("a")
    link.setAttribute("href", "#" + anchor)
    link.setAttribute("class", "link-reset")
    link.innerHTML = inner_html
    v.innerHTML = ""
    v.appendChild(link)
  })
})
//turn all the tags into links, similar to what's done on the feed page
// Each tag becomes an <a> pointing at its tag page, carrying a popover with a
// Subscribe or Unsubscribe button depending on subscriptions_list.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['sync', 'etl']
var subscriptions_list = []
$.each(tags_list, function(i, tag){
  var ahref = document.createElement("a")
  var e_tag = encodeURIComponent(tag)
  // Replace every "/" (a string pattern would only replace the first one),
  // so tags with several path segments are encoded consistently.
  var f_tag = tag.replace(/\//g, "__")
  var tag_name = "#" + tag
  var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
  ahref.setAttribute("data-container", "body")
  ahref.setAttribute("data-toggle", "popover")
  ahref.setAttribute("data-placement", "bottom")
  ahref.setAttribute("data-html", "true")
  ahref.setAttribute("data-tag-name", f_tag)
  if (subscriptions_list.indexOf(tag) >= 0) {
    ahref.setAttribute("class", "label label-subscribed pop")
    ahref.setAttribute("data-content", "<div class='content'>" +
      " <button class='btn btn-small btn-primary btn-unsubscribe'" +
      " title='' " +
      " id='" + tag_subscription_button_id_name + "'> " +
      " <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
      " </button> " +
      " </div>")
  } else {
    ahref.setAttribute("class", "label label-unsubscribed pop")
    ahref.setAttribute("data-content", "<div class='content'>" +
      " <button class='btn btn-small btn-default btn-subscribe'" +
      " title='' " +
      " id='" + tag_subscription_button_id_name + "'> " +
      " <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
      " </button> " +
      " </div>")
  }
  ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
  ahref.setAttribute("style", "font-weight:normal")
  // The first tag is preceded by a bold "Tags: " label.
  if (i == 0){
    ahref.innerHTML = " "
    var colon = document.createElement("text")
    colon.innerHTML = "<b>Tags</b>: "
    tags.appendChild(colon)
  }
  ahref.innerHTML = ahref.innerHTML + tag_name
  tags.appendChild(ahref)
  // Comma-separate every tag except the last.
  if (i != tags_list.length - 1){
    var comma = document.createElement("text")
    comma.innerText = ", "
    tags.appendChild(comma)
  }
})
// Drop the node that immediately follows the tags container, then append the
// edit-tags icon inside the container itself.
// NOTE(review): throws if `tags` has no next sibling — confirm the markup always provides one.
tags.nextSibling.remove()
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
// Popovers are triggered manually so they can stay open while the pointer
// moves from the tag onto the popover.
$(".pop").popover({ trigger: "manual" , html: true, animation:false, delay: 100})
.on("mouseenter", function () {
var _this = this;
$(this).popover("show");
// Hide again once the pointer leaves the popover.
$(".popover").on("mouseleave", function () {
$(_this).popover('hide');
});
}).on("mouseleave", function () {
var _this = this;
// Wait 300 ms before hiding, and skip hiding if a popover is being hovered by then.
setTimeout(function () {
if (!$(".popover:hover").length) {
$(_this).popover("hide");
}
}, 300);
});
// Click handler for the edit-tags icon: replaces the tag list with an inline
// form for editing the comma-separated tags.
// NOTE(review): this handler contains two implementations of the same feature —
// a hand-rolled DOM/ajax version followed by one that delegates to iconsJx/tagsJx —
// and the second is registered from inside the first. Confirm whether both are intended.
$('#tooltip-edit_tags').click(function(){
// Hide the edit icon while the form is shown.
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// The names below are assigned without `var`, so they become globals.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Clear the rendered tag links (and the edit icon inside the container) before inserting the form.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate each tag, POST the raw string as JSON, then reload.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
tags_list = tags_string.split(",")
// Allowed characters: letters, digits, "-", "_", ":" and "/".
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
// Surrounding whitespace is ignored for validation only; the untrimmed string is what gets sent.
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request, so the reload below runs only after it has completed.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/sync_log.kp',
async: false
});
location.reload()
}
})
// NOTE(review): throws if `tags` has no next sibling at this point — confirm.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// NOTE(review): presumably the icon created above carries id "tooltip-edit_tags";
// the selector matches nothing otherwise — verify against icons.js.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// NOTE(review): tags_text is a jQuery object, so this sets a plain property on
// the wrapper rather than the element's content; .html() above already set the label.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter in the input saves the tags; returning false suppresses the default key handling.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/sync_log.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves the tags.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/sync_log.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Form submission saves via tagsJx and cancels the native submit.
// NOTE(review): the selector binds to every <form> on the page, not only the tags form.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/sync_log.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,958 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="/static/modules/jquery/jquery.min.js"></script>
<script src="/static/modules/tether/js/tether.min.js"></script>
<script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="/static/modules/handlebars/js/handlebars.js"></script>
<script src="/static/js/helpers.js"></script>
<script src="/static/modules/select2/js/select2.min.js"></script>
<script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link rel="stylesheet" href="/static/modules/bootstrap/css/bootstrap.min.css">
<link rel="stylesheet" href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css">
<link rel="stylesheet" type='text/css' href="/static/css/custom.css">
<link rel="stylesheet" type='text/css' href="/static/modules/select2/css/select2.min.css">
<link href='https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700' rel='stylesheet' type='text/css'>
<link rel="shortcut icon" href="/static/images/favicon.png">
<link rel="stylesheet" href="/static/css/codehilite-friendly.css">
<style>
/* Fixed-position box centred in the viewport: 50% offsets pulled back by
   negative margins of about half the box size. */
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<!-- Top navigation bar: logo, section links, "Write a Post!" button and search box. -->
<div class="navbar navbar-knowledge" role="navigation">
  <div class="container page-container">
    <a href="/" aria-selected="false" class="logo-image navbar-text" style='margin-top:16px; margin-left: -7px'>
      <!-- img is a void element (no end tag); alt names the link destination for screen readers. -->
      <img width="125" src='/static/images/logo-white.svg' alt="Knowledge home">
    </a>
    <a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Home
    </a>
    <a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Favorites
    </a>
    <a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      About
    </a>
    <a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style='margin-top:18px'>
      Stats
    </a>
    <a id="webposts_tab" href="/create"
       class="btn btn-primary navbar-text pull-right"
       style='border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699'>
      Write a Post!
    </a>
    <div class="pull-right">
      <div class="form-group">
        <!-- The placeholder is not an accessible name, so the input is labelled explicitly. -->
        <input class="form-control" id="searchbar" placeholder="Search for Knowledge" aria-label="Search for Knowledge" style="text-align:right">
      </div>
    </div>
  </div>
</div>
<div class="container page-container">
<br>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
<div class="btn-group" role="group">
<button type="button" class="btn btn-default btn-rendered">
View Post
</button>
<button type="button" class="btn btn-default btn-raw">
View Raw Markdown
</button>
</div>
</div>
<div class="col-md-2">
</div>
<div class="col-md-4 text-right">
<i class="glyphicon glyphicon-eye-open" id="pageview_stats" style='color: #9CA299'></i>
<div id="pageview_stats" style="display: inline-block">
Viewed 1 time by 1 user
</div>
<i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop" style="font-size:16pt" id="tooltip-like" data-placement="bottom"
data-trigger="#tooltip-like"
data-container="body"
data-toggle="popover"
data-content='<div>Like This Post</div>'></i>
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class='metadata'>
<h1>Bug 1291340 - Import sync log data</h1>
<span class='authors'><a href='/feed?authors=mreid-moz'>mreid-moz</a></span>
<span class='date_created'>November 15, 2016</span>
<span class='date_updated'>(Last Updated: July 10, 2017)</span>
<span class='tldr'><p>Read, convert, and store sync log data to Parquet form per <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1291340">bug 1291340</a>.</p></span>
<span class='tags'></span>
</div>
<h2 id="bug-1291340-import-sync-log-data">Bug 1291340 - Import sync log data</h2>
<p>Read, convert, and store sync log data to Parquet form per <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1291340">bug 1291340</a>.</p>
<p>Conversion code is ported from the <a href="https://github.com/dannycoates/smt">smt repo</a>.</p>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span> <span class="k">as</span> <span class="n">dt</span><span class="p">,</span> <span class="n">timedelta</span><span class="p">,</span> <span class="n">date</span>
<span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="kn">from</span> <span class="nn">os</span> <span class="kn">import</span> <span class="n">environ</span>
<span class="c1"># Determine run parameters</span>
<span class="n">source_bucket</span> <span class="o">=</span> <span class="s1">&#39;net-mozaws-prod-us-west-2-pipeline-analysis&#39;</span>
<span class="n">dest_bucket</span> <span class="o">=</span> <span class="n">source_bucket</span>
<span class="n">dest_s3_prefix</span> <span class="o">=</span> <span class="s2">&quot;s3://{}/mreid&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">dest_bucket</span><span class="p">)</span>
<span class="k">if</span> <span class="s2">&quot;bucket&quot;</span> <span class="ow">in</span> <span class="n">environ</span><span class="p">:</span>
<span class="n">dest_bucket</span> <span class="o">=</span> <span class="n">environ</span><span class="p">[</span><span class="s2">&quot;bucket&quot;</span><span class="p">]</span>
<span class="n">dest_s3_prefix</span> <span class="o">=</span> <span class="s2">&quot;s3://{}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">dest_bucket</span><span class="p">)</span>
<span class="n">yesterday</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">dt</span><span class="o">.</span><span class="n">utcnow</span><span class="p">()</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">),</span> <span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="c1"># Default to running for &quot;yesterday&quot; unless we&#39;ve been given a</span>
<span class="c1"># specific date via the environment.</span>
<span class="n">target_day</span> <span class="o">=</span> <span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;date&quot;</span><span class="p">,</span> <span class="n">yesterday</span><span class="p">)</span>
<span class="k">print</span> <span class="s2">&quot;Running import for {}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">target_day</span><span class="p">)</span>
</pre></div>
<h3 id="read-the-source-log-data">Read the source log data</h3>
<p>The sync data on S3 is stored in framed heka format, and is read using the <code>Dataset</code> API.</p>
<div class="codehilite"><pre><span></span><span class="c1"># Read the source data</span>
<span class="n">schema</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">target_prefix</span> <span class="o">=</span> <span class="s1">&#39;sync-metrics/data&#39;</span>
<span class="n">sync</span> <span class="o">=</span> <span class="n">Dataset</span><span class="p">(</span><span class="n">source_bucket</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">prefix</span><span class="o">=</span><span class="n">target_prefix</span><span class="p">)</span>
<span class="c1"># The sync data on S3 does not have a proper &quot;date&quot; dimension, but the date is encoded </span>
<span class="c1"># in the key names themselves.</span>
<span class="c1"># Fetch the summaries and filter the list to the target day.</span>
<span class="n">summary_prefix</span> <span class="o">=</span> <span class="s2">&quot;{}/{}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">target_prefix</span><span class="p">,</span> <span class="n">target_day</span><span class="p">)</span>
<span class="n">sync_summaries</span> <span class="o">=</span> <span class="p">[</span> <span class="n">s</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">sync</span><span class="o">.</span><span class="n">summaries</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span> <span class="k">if</span> <span class="n">s</span><span class="p">[</span><span class="s1">&#39;key&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="n">summary_prefix</span><span class="p">)</span> <span class="p">]</span>
</pre></div>
<h3 id="custom-heka-decoder">Custom heka decoder</h3>
<p>The standard heka decoder assumes (based on Telemetry data) that all fields whose names have a <code>.</code> in them contain nested json strings. This is not true for sync log messages, which have fields such as <code>syncstorage.storage.sql.db.execute</code> with simple scalar values.</p>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">ssl</span>
<span class="kn">from</span> <span class="nn">moztelemetry.heka.message_parser</span> <span class="kn">import</span> <span class="n">unpack</span>
<span class="c1"># Custom decoder for sync messages since we can have scalar fields with dots in their names.</span>
<span class="k">def</span> <span class="nf">sync_decoder</span><span class="p">(</span><span class="n">message</span><span class="p">):</span>
    <span class="c1"># Generator: yields one flat dict for every record that unpack() produces from message.</span>
    <span class="c1"># Timestamp, Type and Hostname are grouped under &quot;meta&quot;; each field is stored</span>
    <span class="c1"># under its own name (dots included), keeping only the first of its values.</span>
    <span class="c1"># Attribute that holds the values for each value_type; any other type reads value_string.</span>
    <span class="n">typed_values</span> <span class="o">=</span> <span class="p">{</span><span class="mi">2</span><span class="p">:</span> <span class="s2">&quot;value_integer&quot;</span><span class="p">,</span> <span class="mi">3</span><span class="p">:</span> <span class="s2">&quot;value_double&quot;</span><span class="p">,</span> <span class="mi">4</span><span class="p">:</span> <span class="s2">&quot;value_bool&quot;</span><span class="p">}</span>
    <span class="k">try</span><span class="p">:</span>
        <span class="k">for</span> <span class="n">record</span><span class="p">,</span> <span class="n">total_bytes</span> <span class="ow">in</span> <span class="n">unpack</span><span class="p">(</span><span class="n">message</span><span class="p">):</span>
            <span class="n">envelope</span> <span class="o">=</span> <span class="n">record</span><span class="o">.</span><span class="n">message</span>
            <span class="n">result</span> <span class="o">=</span> <span class="p">{</span>
                <span class="s2">&quot;meta&quot;</span><span class="p">:</span> <span class="p">{</span>
                    <span class="s2">&quot;Timestamp&quot;</span><span class="p">:</span> <span class="n">envelope</span><span class="o">.</span><span class="n">timestamp</span><span class="p">,</span>
                    <span class="s2">&quot;Type&quot;</span><span class="p">:</span> <span class="n">envelope</span><span class="o">.</span><span class="n">type</span><span class="p">,</span>
                    <span class="s2">&quot;Hostname&quot;</span><span class="p">:</span> <span class="n">envelope</span><span class="o">.</span><span class="n">hostname</span><span class="p">,</span>
                <span class="p">},</span>
            <span class="p">}</span>
            <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">envelope</span><span class="o">.</span><span class="n">fields</span><span class="p">:</span>
                <span class="k">if</span> <span class="n">field</span><span class="o">.</span><span class="n">value_type</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
                    <span class="c1"># TODO: handle bytes in a way that doesn&#39;t cause problems with JSON</span>
                    <span class="k">continue</span>
                <span class="n">values</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">typed_values</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">field</span><span class="o">.</span><span class="n">value_type</span><span class="p">,</span> <span class="s2">&quot;value_string&quot;</span><span class="p">))</span>
                <span class="c1"># A field that carries no values is recorded as an empty string.</span>
                <span class="n">result</span><span class="p">[</span><span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">]</span> <span class="o">=</span> <span class="n">values</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">values</span><span class="p">)</span> <span class="k">else</span> <span class="s2">&quot;&quot;</span>
            <span class="k">yield</span> <span class="n">result</span>
    <span class="k">except</span> <span class="n">ssl</span><span class="o">.</span><span class="n">SSLError</span><span class="p">:</span>
        <span class="k">pass</span>  <span class="c1"># https://github.com/boto/boto/issues/2830</span>
<span class="n">sync_records</span> <span class="o">=</span> <span class="n">sync</span><span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">decode</span><span class="o">=</span><span class="n">sync_decoder</span><span class="p">,</span> <span class="n">summaries</span><span class="o">=</span><span class="n">sync_summaries</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1"># What do the records look like?</span>
<span class="c1"># Example heka message:</span>
<span class="c1">#Timestamp: 2016-10-28 15:11:45.98653696 -0300 ADT</span>
<span class="c1">#Type: mozsvc.metrics</span>
<span class="c1">#Hostname: ip-172-31-39-11</span>
<span class="c1">#Pid: 11383</span>
<span class="c1">#UUID: 155866c8-cc58-4048-a58c-6226c620fc57</span>
<span class="c1">#Logger: Sync-1_5</span>
<span class="c1">#Payload:</span>
<span class="c1">#EnvVersion: 1</span>
<span class="c1">#Severity: 7</span>
<span class="c1">#Fields: [name:&quot;remoteAddressChain&quot; representation:&quot;&quot; value_string:&quot;&quot; value_string:&quot;&quot; </span>
<span class="c1"># name:&quot;path&quot; value_string:&quot;https://host/ver/somenum/storage/tabs&quot; </span>
<span class="c1"># name:&quot;fxa_uid&quot; value_string:&quot;some_id&quot; </span>
<span class="c1"># name:&quot;user_agent_version&quot; value_type:DOUBLE value_double:49 </span>
<span class="c1"># name:&quot;user_agent_os&quot; value_string:&quot;Windows 7&quot; </span>
<span class="c1"># name:&quot;device_id&quot; value_string:&quot;some_device_id&quot; </span>
<span class="c1"># name:&quot;method&quot; value_string:&quot;POST&quot; </span>
<span class="c1"># name:&quot;user_agent_browser&quot; value_string:&quot;Firefox&quot; </span>
<span class="c1"># name:&quot;name&quot; value_string:&quot;mozsvc.metrics&quot; </span>
<span class="c1"># name:&quot;request_time&quot; value_type:DOUBLE value_double:0.003030061721801758 </span>
<span class="c1"># name:&quot;code&quot; value_type:DOUBLE value_double:200 </span>
<span class="c1"># ]</span>
<span class="c1"># Example record:</span>
<span class="c1">#sync_records.first()</span>
<span class="c1"># {u&#39;code&#39;: 200.0,</span>
<span class="c1"># u&#39;device_id&#39;: u&#39;some_device_id&#39;,</span>
<span class="c1"># u&#39;fxa_uid&#39;: u&#39;some_id&#39;,</span>
<span class="c1"># &#39;meta&#39;: {&#39;Hostname&#39;: u&#39;ip-172-31-39-11&#39;,</span>
<span class="c1"># &#39;Timestamp&#39;: 1477678305976742912L,</span>
<span class="c1"># &#39;Type&#39;: u&#39;mozsvc.metrics&#39;},</span>
<span class="c1"># u&#39;method&#39;: u&#39;GET&#39;,</span>
<span class="c1"># u&#39;name&#39;: u&#39;mozsvc.metrics&#39;,</span>
<span class="c1"># u&#39;path&#39;: u&#39;https://host/ver/somenum/storage/crypto/keys&#39;,</span>
<span class="c1"># u&#39;remoteAddressChain&#39;: u&#39;&#39;,</span>
<span class="c1"># u&#39;request_time&#39;: 0.017612934112548828,</span>
<span class="c1"># u&#39;syncstorage.storage.sql.db.execute&#39;: 0.014925241470336914,</span>
<span class="c1"># u&#39;syncstorage.storage.sql.pool.get&#39;: 5.221366882324219e-05,</span>
<span class="c1"># u&#39;user_agent_browser&#39;: u&#39;Firefox&#39;,</span>
<span class="c1"># u&#39;user_agent_os&#39;: u&#39;Windows 7&#39;,</span>
<span class="c1"># u&#39;user_agent_version&#39;: 49.0}</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1"># Convert data. Code ported from https://github.com/dannycoates/smt</span>
<span class="kn">import</span> <span class="nn">re</span>
<span class="kn">import</span> <span class="nn">hashlib</span>
<span class="kn">import</span> <span class="nn">math</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Row</span>
<span class="k">def</span> <span class="nf">sha_prefix</span><span class="p">(</span><span class="n">v</span><span class="p">):</span>
    <span class="c1"># First 32 hex characters of the SHA-256 digest of v.</span>
    <span class="k">return</span> <span class="n">hashlib</span><span class="o">.</span><span class="n">sha256</span><span class="p">(</span><span class="n">v</span><span class="p">)</span><span class="o">.</span><span class="n">hexdigest</span><span class="p">()[:</span><span class="mi">32</span><span class="p">]</span>
<span class="c1"># Raw strings: the backslashes below belong to the regex, not to Python string escapes.</span>
<span class="n">path_uid</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="s2">r&quot;(\d+)\/storage\/&quot;</span><span class="p">)</span>
<span class="n">path_bucket</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="s2">r&quot;\d+\/storage\/(\w+)&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getUid</span><span class="p">(</span><span class="n">path</span><span class="p">):</span>
    <span class="c1"># Returns sha_prefix() of the run of digits that precedes &quot;/storage/&quot; in path,</span>
    <span class="c1"># or None when path is None or contains no such segment.</span>
    <span class="k">if</span> <span class="n">path</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
        <span class="k">return</span> <span class="bp">None</span>
    <span class="n">match</span> <span class="o">=</span> <span class="n">path_uid</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
    <span class="k">if</span> <span class="n">match</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
        <span class="k">return</span> <span class="bp">None</span>
    <span class="k">return</span> <span class="n">sha_prefix</span><span class="p">(</span><span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">deriveDeviceId</span><span class="p">(</span><span class="n">uid</span><span class="p">,</span> <span class="n">agent</span><span class="p">):</span>
    <span class="c1"># sha_prefix() of uid immediately followed by agent; None when there is no uid.</span>
    <span class="k">if</span> <span class="n">uid</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
        <span class="k">return</span> <span class="n">sha_prefix</span><span class="p">(</span><span class="s2">&quot;{}{}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">uid</span><span class="p">,</span> <span class="n">agent</span><span class="p">))</span>
    <span class="k">return</span> <span class="bp">None</span>
<span class="n">SyncRow</span> <span class="o">=</span> <span class="n">Row</span><span class="p">(</span><span class="s2">&quot;uid&quot;</span><span class="p">,</span> <span class="s2">&quot;s_uid&quot;</span><span class="p">,</span> <span class="s2">&quot;dev&quot;</span><span class="p">,</span> <span class="s2">&quot;s_dev&quot;</span><span class="p">,</span> <span class="s2">&quot;ts&quot;</span><span class="p">,</span> <span class="s2">&quot;method&quot;</span><span class="p">,</span> <span class="s2">&quot;code&quot;</span><span class="p">,</span>
<span class="s2">&quot;bucket&quot;</span><span class="p">,</span> <span class="s2">&quot;t&quot;</span><span class="p">,</span> <span class="s2">&quot;ua_browser&quot;</span><span class="p">,</span> <span class="s2">&quot;ua_version&quot;</span><span class="p">,</span> <span class="s2">&quot;ua_os&quot;</span><span class="p">,</span> <span class="s2">&quot;host&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">convert</span><span class="p">(</span><span class="n">msg</span><span class="p">):</span>
    <span class="c1"># Turn one decoded sync record (a dict) into a SyncRow.</span>
    <span class="c1"># Returns None when the record&#39;s path does not match path_bucket, so callers</span>
    <span class="c1"># are expected to filter out None results.</span>
    <span class="n">bmatch</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">path_bucket</span><span class="p">,</span> <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;path&quot;</span><span class="p">,</span> <span class="s2">&quot;&quot;</span><span class="p">))</span>
    <span class="k">if</span> <span class="n">bmatch</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
        <span class="k">return</span> <span class="bp">None</span>
    <span class="n">bucket</span> <span class="o">=</span> <span class="n">bmatch</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>

    <span class="c1"># Reported ids come straight from the record; synthetic ids are derived from</span>
    <span class="c1"># the path and the user agent and are used when a reported id is missing.</span>
    <span class="n">uid</span> <span class="o">=</span> <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;fxa_uid&quot;</span><span class="p">)</span>
    <span class="n">synth_uid</span> <span class="o">=</span> <span class="n">getUid</span><span class="p">(</span><span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;path&quot;</span><span class="p">))</span>
    <span class="n">dev</span> <span class="o">=</span> <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;device_id&quot;</span><span class="p">)</span>
    <span class="n">agent</span> <span class="o">=</span> <span class="s2">&quot;{}{}{}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
        <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;user_agent_browser&quot;</span><span class="p">,</span> <span class="s2">&quot;&quot;</span><span class="p">),</span>
        <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;user_agent_version&quot;</span><span class="p">,</span> <span class="s2">&quot;&quot;</span><span class="p">),</span>
        <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;user_agent_os&quot;</span><span class="p">,</span> <span class="s2">&quot;&quot;</span><span class="p">))</span>
    <span class="n">synth_dev</span> <span class="o">=</span> <span class="n">deriveDeviceId</span><span class="p">(</span><span class="n">synth_uid</span><span class="p">,</span> <span class="n">agent</span><span class="p">)</span>

    <span class="c1"># support modern mozlog&#39;s use of errno for http status</span>
    <span class="n">code</span> <span class="o">=</span> <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;errno&quot;</span><span class="p">)</span>
    <span class="k">if</span> <span class="n">code</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
        <span class="n">code</span> <span class="o">=</span> <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;code&quot;</span><span class="p">)</span>
    <span class="k">elif</span> <span class="n">code</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>  <span class="c1"># success</span>
        <span class="n">code</span> <span class="o">=</span> <span class="mi">200</span>
    <span class="c1"># Store the status as an int whichever field supplied it, so every row</span>
    <span class="c1"># carries the same type in the &quot;code&quot; column.</span>
    <span class="k">if</span> <span class="n">code</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
        <span class="n">code</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">code</span><span class="p">)</span>

    <span class="n">t</span> <span class="o">=</span> <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;t&quot;</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span>
    <span class="k">if</span> <span class="n">t</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
        <span class="c1"># No usable &quot;t&quot;: fall back to request_time scaled by 1000.</span>
        <span class="c1"># A missing or None request_time counts as 0 instead of raising.</span>
        <span class="n">t</span> <span class="o">=</span> <span class="n">math</span><span class="o">.</span><span class="n">floor</span><span class="p">((</span><span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;request_time&quot;</span><span class="p">)</span> <span class="ow">or</span> <span class="mi">0</span><span class="p">)</span> <span class="o">*</span> <span class="mi">1000</span><span class="p">)</span>
    <span class="k">if</span> <span class="n">t</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
        <span class="c1"># &quot;t&quot; was present but explicitly None.</span>
        <span class="n">t</span> <span class="o">=</span> <span class="mi">0</span>

    <span class="n">meta</span> <span class="o">=</span> <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;meta&quot;</span><span class="p">)</span>
    <span class="c1"># Positional arguments follow the SyncRow field order.</span>
    <span class="k">return</span> <span class="n">SyncRow</span><span class="p">(</span>
        <span class="p">(</span><span class="n">uid</span> <span class="ow">or</span> <span class="n">synth_uid</span><span class="p">),</span>
        <span class="n">synth_uid</span><span class="p">,</span>
        <span class="p">(</span><span class="n">dev</span> <span class="ow">or</span> <span class="n">synth_dev</span><span class="p">),</span>
        <span class="n">synth_dev</span><span class="p">,</span>
        <span class="n">meta</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;Timestamp&quot;</span><span class="p">),</span>
        <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;method&quot;</span><span class="p">),</span>
        <span class="n">code</span><span class="p">,</span>
        <span class="n">bucket</span><span class="p">,</span>
        <span class="n">t</span><span class="p">,</span>
        <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;user_agent_browser&quot;</span><span class="p">),</span>
        <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;user_agent_version&quot;</span><span class="p">),</span>
        <span class="n">msg</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;user_agent_os&quot;</span><span class="p">),</span>
        <span class="n">meta</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;Hostname&quot;</span><span class="p">),</span>
    <span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">converted</span> <span class="o">=</span> <span class="n">sync_records</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">convert</span><span class="p">(</span><span class="n">x</span><span class="p">))</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">converted</span> <span class="o">=</span> <span class="n">converted</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SQLContext</span>
<span class="n">sync_df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">converted</span><span class="p">)</span>
<span class="n">sync_df</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span>
</pre></div>
<div class="codehilite"><pre><span></span>root
|-- uid: string (nullable = true)
|-- s_uid: string (nullable = true)
|-- dev: string (nullable = true)
|-- s_dev: string (nullable = true)
|-- ts: long (nullable = true)
|-- method: string (nullable = true)
|-- code: long (nullable = true)
|-- bucket: string (nullable = true)
|-- t: double (nullable = true)
|-- ua_browser: string (nullable = true)
|-- ua_version: double (nullable = true)
|-- ua_os: string (nullable = true)
|-- host: string (nullable = true)
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1"># Determine if we need to repartition.</span>
<span class="c1"># A record is something like 112 bytes, so figure out how many partitions</span>
<span class="c1"># we need to end up with reasonably-sized files.</span>
<span class="n">records_per_partition</span> <span class="o">=</span> <span class="mi">2500000</span>
<span class="n">total_records</span> <span class="o">=</span> <span class="n">sync_df</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="k">print</span> <span class="s2">&quot;Found {} sync records&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">total_records</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">math</span>
<span class="n">num_partitions</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">math</span><span class="o">.</span><span class="n">ceil</span><span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">total_records</span><span class="p">)</span> <span class="o">/</span> <span class="n">records_per_partition</span><span class="p">))</span>
<span class="k">if</span> <span class="n">num_partitions</span> <span class="o">!=</span> <span class="n">sync_df</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">():</span>
<span class="k">print</span> <span class="s2">&quot;Repartitioning with {} partitions&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">num_partitions</span><span class="p">)</span>
<span class="n">sync_df</span> <span class="o">=</span> <span class="n">sync_df</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="n">num_partitions</span><span class="p">)</span>
<span class="c1"># Store data</span>
<span class="n">sync_log_s3path</span> <span class="o">=</span> <span class="s2">&quot;{}/sync_log/v1/day={}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">dest_s3_prefix</span><span class="p">,</span> <span class="n">target_day</span><span class="p">)</span>
<span class="n">sync_df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">sync_log_s3path</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">&quot;overwrite&quot;</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1"># Transform, compute and store rollups</span>
<span class="n">sync_df</span><span class="o">.</span><span class="n">registerTempTable</span><span class="p">(</span><span class="s2">&quot;sync&quot;</span><span class="p">)</span>
<span class="n">sql_transform</span> <span class="o">=</span> <span class="s1">&#39;&#39;&#39;</span>
<span class="s1"> select</span>
<span class="s1"> uid,</span>
<span class="s1"> dev,</span>
<span class="s1"> ts,</span>
<span class="s1"> t,</span>
<span class="s1"> case </span>
<span class="s1"> when substring(ua_os,0,7) in (&#39;iPad&#39;, &#39;iPod&#39;, &#39;iPhone&#39;) then &#39;ios&#39;</span>
<span class="s1"> when substring(ua_os,0,7) = &#39;Android&#39; then &#39;android&#39;</span>
<span class="s1"> when substring(ua_os,0,7) = &#39;Windows&#39; then &#39;windows&#39;</span>
<span class="s1"> when substring(ua_os,0,7) = &#39;Macinto&#39; then &#39;mac&#39;</span>
<span class="s1"> when substring(ua_os,0,7) = &#39;Linux&#39; then &#39;linux&#39;</span>
<span class="s1"> when ua_os is null then &#39;unknown&#39;</span>
<span class="s1"> else &#39;other&#39;</span>
<span class="s1"> end as ua_os,</span>
<span class="s1"> ua_browser,</span>
<span class="s1"> ua_version,</span>
<span class="s1"> case method when &#39;POST&#39; then 1 end as posts,</span>
<span class="s1"> case method when &#39;GET&#39; then 1 end as gets,</span>
<span class="s1"> case method when &#39;PUT&#39; then 1 end as puts,</span>
<span class="s1"> case method when &#39;DELETE&#39; then 1 end as dels,</span>
<span class="s1"> case when code &lt; 300 then 1 end as aoks,</span>
<span class="s1"> case when code &gt; 399 and code &lt; 500 then 1 end as oops,</span>
<span class="s1"> case when code &gt; 499 and code &lt; 999 then 1 end as fups,</span>
<span class="s1"> case when bucket = &#39;clients&#39; and method = &#39;GET&#39; then 1 end as r_clients,</span>
<span class="s1"> case when bucket = &#39;crypto&#39; and method = &#39;GET&#39; then 1 end as r_crypto,</span>
<span class="s1"> case when bucket = &#39;forms&#39; and method = &#39;GET&#39; then 1 end as r_forms,</span>
<span class="s1"> case when bucket = &#39;history&#39; and method = &#39;GET&#39; then 1 end as r_history,</span>
<span class="s1"> case when bucket = &#39;keys&#39; and method = &#39;GET&#39; then 1 end as r_keys,</span>
<span class="s1"> case when bucket = &#39;meta&#39; and method = &#39;GET&#39; then 1 end as r_meta,</span>
<span class="s1"> case when bucket = &#39;bookmarks&#39; and method = &#39;GET&#39; then 1 end as r_bookmarks,</span>
<span class="s1"> case when bucket = &#39;prefs&#39; and method = &#39;GET&#39; then 1 end as r_prefs,</span>
<span class="s1"> case when bucket = &#39;tabs&#39; and method = &#39;GET&#39; then 1 end as r_tabs,</span>
<span class="s1"> case when bucket = &#39;passwords&#39; and method = &#39;GET&#39; then 1 end as r_passwords,</span>
<span class="s1"> case when bucket = &#39;addons&#39; and method = &#39;GET&#39; then 1 end as r_addons,</span>
<span class="s1"> case when bucket = &#39;clients&#39; and method = &#39;POST&#39; then 1 end as w_clients,</span>
<span class="s1"> case when bucket = &#39;crypto&#39; and method = &#39;POST&#39; then 1 end as w_crypto,</span>
<span class="s1"> case when bucket = &#39;forms&#39; and method = &#39;POST&#39; then 1 end as w_forms,</span>
<span class="s1"> case when bucket = &#39;history&#39; and method = &#39;POST&#39; then 1 end as w_history,</span>
<span class="s1"> case when bucket = &#39;keys&#39; and method = &#39;POST&#39; then 1 end as w_keys,</span>
<span class="s1"> case when bucket = &#39;meta&#39; and method = &#39;POST&#39; then 1 end as w_meta,</span>
<span class="s1"> case when bucket = &#39;bookmarks&#39; and method = &#39;POST&#39; then 1 end as w_bookmarks,</span>
<span class="s1"> case when bucket = &#39;prefs&#39; and method = &#39;POST&#39; then 1 end as w_prefs,</span>
<span class="s1"> case when bucket = &#39;tabs&#39; and method = &#39;POST&#39; then 1 end as w_tabs,</span>
<span class="s1"> case when bucket = &#39;passwords&#39; and method = &#39;POST&#39; then 1 end as w_passwords,</span>
<span class="s1"> case when bucket = &#39;addons&#39; and method = &#39;POST&#39; then 1 end as w_addons</span>
<span class="s1"> from sync</span>
<span class="s1">&#39;&#39;&#39;</span>
<span class="n">transformed</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="n">sql_transform</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">transformed</span><span class="o">.</span><span class="n">registerTempTable</span><span class="p">(</span><span class="s2">&quot;tx&quot;</span><span class="p">)</span>
<span class="n">sql_device_activity</span> <span class="o">=</span> <span class="s1">&#39;&#39;&#39;</span>
<span class="s1"> select</span>
<span class="s1"> uid,</span>
<span class="s1"> dev,</span>
<span class="s1"> max(ua_os) as ua_os,</span>
<span class="s1"> max(ua_browser) as ua_browser,</span>
<span class="s1"> max(ua_version) as ua_version,</span>
<span class="s1"> min(t) as min_t,</span>
<span class="s1"> max(t) as max_t,</span>
<span class="s1"> sum(posts) as posts,</span>
<span class="s1"> sum(gets) as gets,</span>
<span class="s1"> sum(puts) as puts,</span>
<span class="s1"> sum(dels) as dels,</span>
<span class="s1"> sum(aoks) as aoks,</span>
<span class="s1"> sum(oops) as oops,</span>
<span class="s1"> sum(fups) as fups,</span>
<span class="s1"> sum(r_clients) as r_clients,</span>
<span class="s1"> sum(r_crypto) as r_crypto,</span>
<span class="s1"> sum(r_forms) as r_forms,</span>
<span class="s1"> sum(r_history) as r_history,</span>
<span class="s1"> sum(r_keys) as r_keys,</span>
<span class="s1"> sum(r_meta) as r_meta,</span>
<span class="s1"> sum(r_bookmarks) as r_bookmarks,</span>
<span class="s1"> sum(r_prefs) as r_prefs,</span>
<span class="s1"> sum(r_tabs) as r_tabs,</span>
<span class="s1"> sum(r_passwords) as r_passwords,</span>
<span class="s1"> sum(r_addons) as r_addons,</span>
<span class="s1"> sum(w_clients) as w_clients,</span>
<span class="s1"> sum(w_crypto) as w_crypto,</span>
<span class="s1"> sum(w_forms) as w_forms,</span>
<span class="s1"> sum(w_history) as w_history,</span>
<span class="s1"> sum(w_keys) as w_keys,</span>
<span class="s1"> sum(w_meta) as w_meta,</span>
<span class="s1"> sum(w_bookmarks) as w_bookmarks,</span>
<span class="s1"> sum(w_prefs) as w_prefs,</span>
<span class="s1"> sum(w_tabs) as w_tabs,</span>
<span class="s1"> sum(w_passwords) as w_passwords,</span>
<span class="s1"> sum(w_addons) as w_addons</span>
<span class="s1"> from tx group by uid, dev</span>
<span class="s1">&#39;&#39;&#39;</span>
<span class="n">rolled_up</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="n">sql_device_activity</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1"># Store device activity rollups</span>
<span class="n">sync_log_device_activity_s3base</span> <span class="o">=</span> <span class="s2">&quot;{}/sync_log_device_activity/v1&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">dest_s3_prefix</span><span class="p">)</span>
<span class="n">sync_log_device_activity_s3path</span> <span class="o">=</span> <span class="s2">&quot;{}/day={}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">sync_log_device_activity_s3base</span><span class="p">,</span> <span class="n">target_day</span><span class="p">)</span>
<span class="c1"># TODO: Do we need to repartition?</span>
<span class="n">rolled_up</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="mi">5</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">sync_log_device_activity_s3path</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">&quot;overwrite&quot;</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">compute_device_counts</span><span class="p">(</span><span class="n">device_activity</span><span class="p">,</span> <span class="n">target_day</span><span class="p">):</span>
<span class="n">device_activity</span><span class="o">.</span><span class="n">registerTempTable</span><span class="p">(</span><span class="s2">&quot;device_activity&quot;</span><span class="p">)</span>
<span class="n">df</span> <span class="o">=</span> <span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span>
<span class="n">last_week_date</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">target_day</span><span class="p">,</span> <span class="n">df</span><span class="p">)</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">7</span><span class="p">)</span>
<span class="n">last_week</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">last_week_date</span><span class="p">,</span> <span class="n">df</span><span class="p">)</span>
<span class="n">sql_device_counts</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;</span>
<span class="s2"> select</span>
<span class="s2"> uid,</span>
<span class="s2"> count(distinct dev) as devs</span>
<span class="s2"> from</span>
<span class="s2"> (select</span>
<span class="s2"> uid,</span>
<span class="s2"> dev</span>
<span class="s2"> from device_activity</span>
<span class="s2"> where uid in</span>
<span class="s2"> (select distinct(uid) from device_activity where day = &#39;{}&#39;)</span>
<span class="s2"> and day &gt; &#39;{}&#39;</span>
<span class="s2"> and day &lt;= &#39;{}&#39;)</span>
<span class="s2"> group by uid</span>
<span class="s2"> &quot;&quot;&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">target_day</span><span class="p">,</span> <span class="n">last_week</span><span class="p">,</span> <span class="n">target_day</span><span class="p">)</span>
<span class="k">return</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="n">sql_device_counts</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1"># Compute and store device counts</span>
<span class="c1"># Re-read device activity data from S3 so we can look at historic info</span>
<span class="n">device_activity</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">sync_log_device_activity_s3base</span><span class="p">)</span>
<span class="n">device_counts</span> <span class="o">=</span> <span class="n">compute_device_counts</span><span class="p">(</span><span class="n">device_activity</span><span class="p">,</span> <span class="n">target_day</span><span class="p">)</span>
<span class="n">sync_log_device_counts_s3path</span> <span class="o">=</span> <span class="s2">&quot;{}/sync_log_device_counts/v1/day={}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">dest_s3_prefix</span><span class="p">,</span> <span class="n">target_day</span><span class="p">)</span>
<span class="n">device_counts</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">sync_log_device_counts_s3path</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">&quot;overwrite&quot;</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<h2> 0 Comments </h2>
<br>
<div>
<div class="row">
<div class="col-md-12">
<textarea class="form-control" type="text" id="comment-text" style="height:87px;" placeholder="Leave a comment..."></textarea>
</div>
</div>
</div>
<br>
<div>
<button class="btn btn-primary" id="post_comment_btn">Post Comment</button>
</div>
</div>
</div>
<br>
<div class="row">
<div class="col-md-12">
</div>
<br>
</div>
</div>
</div>
</div>
<div class="footer">
Served with <span class="glyphicon glyphicon-heart"></span> by <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a> <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br />
<i title="Last checked for updates: 4 minutes ago">Last indexed: 14 hours ago</i>
</div>
<script type='text/javascript'>
// Search box wiring: typeahead suggestions, jump-to-post on selection, and
// Enter-to-search. Everything here targets the #searchbar input.

// Put the caret at the end of any pre-filled query (offsets past the end of
// the text are clamped). Guarded so a page rendered without a search bar
// does not throw here and abort the rest of this script.
var searchbarEl = $("#searchbar")[0];
if (searchbarEl) {
  searchbarEl.setSelectionRange(1000, 1000);
}

$('#searchbar').typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: 'knowledge_posts',
  limit: 10,
  // Text placed in the input when a suggestion is chosen.
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
        'Unable to find any posts that match the current query' +
      '</div>'
    ),
    // The returned string is inserted as HTML, so post-supplied fields are
    // escaped before being concatenated into the markup.
    suggestion: function(data) {
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' +
        Handlebars.escapeExpression(data.title) + '</strong> – ' +
        Handlebars.escapeExpression(data.author) + '</p>';
    }
  },
  // Suggestions come from the server; only the async callback is used.
  source: function(q, sync, async) {
    // Encode the query so characters such as & # + or spaces stay part of
    // the `search` parameter instead of corrupting the URL.
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
      {
        success: function(data, status){ async(JSON.parse(data)); }
      });
  }
});

// Selecting a suggestion navigates straight to that post.
$('#searchbar').on('typeahead:select', function(obj, datum, name) {
  window.location = '/post/' + encodeURIComponent(datum.path);
});

// Pressing Enter runs a feed search with the current text.
$('#searchbar').keypress(function(event){
  var keycode = (event.keyCode ? event.keyCode : event.which);
  if (keycode == 13) {
    window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
  }
});

// Size the suggestion menu from the search bar width plus the menu's own
// initial outer width.
var padding = $('.tt-menu').outerWidth();
$('.tt-menu').width($('#searchbar').width() + padding + "px");
</script>
<script src="/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
$("document").ready(function(){
  // Values baked into the page when this post was rendered.
  var postPath = "etl/sync_log.kp";
  var githubRoot = "";
  var webEditorActive = false;
  var postId = "1";
  var rawId = "None";

  tooltipsJx.initializeTooltips(webEditorActive, postId, rawId, githubRoot);

  // Each view-switch button navigates to a different URL for the same post.
  var encodedPath = encodeURI(postPath);
  var destinations = {
    ".btn-rendered": "/post/" + encodedPath,
    ".btn-raw": "/post/" + encodedPath + "?render=raw",
    ".btn-webeditor": "/edit/" + encodedPath
  };
  $.each(destinations, function(selector, url){
    $(selector).on("click", function(){
      document.location.href = url;
    });
  });
});
</script>
<script src="/static/js/helpers.js"></script>
<script src="/static/js/tags.js" type="text/javascript"></script>
<script src="/static/js/icons.js" type="text/javascript"></script>
<script src="/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration for this page; the MathJax loader script tag follows
// this block.
MathJax.Hub.Config({
// Load the tex2jax preprocessor extension.
extensions: ["tex2jax.js"],
// Accept TeX as input and render with the HTML-CSS output processor.
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Delimiter pairs for inline math: $...$ and \(...\).
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Delimiter pairs for display math: $$...$$ and \[...\].
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// NOTE(review): per the MathJax docs this lets \$ stand for a literal
// dollar sign; confirm against the MathJax version being loaded.
processEscapes: true
},
// Restrict the HTML-CSS output to the TeX font set.
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<script type="text/javascript"
src="https://cdn.mathjax.org/mathjax/latest/MathJax.js">
</script>
<script>
// Page initialisation: button state, header links, comment posting/deletion
// and tag-subscription clicks.
// Registered with .ready() rather than .on('ready'): the event form is
// deprecated and was removed in jQuery 3.0, where this handler would silently
// never run.
$(document).ready(function(){
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'etl/sync_log.kp';

  $("#post_comment_btn").on('click', function(){
    // Declared locally; the original assigned these without `var`, leaking
    // them onto the global object.
    var comment_author = 'knowledge_default';
    var post_author = 'mreid-moz';
    var post_title = 'Bug 1291340 - Import sync log data';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): the reload runs immediately after postComment returns; if
    // postComment issues an asynchronous request this may race with it - confirm.
    location.reload();
  });

  // Delete buttons carry ids starting with "delete_comment"; the comment id
  // is whatever follows the "__" separator.
  $("[id^=delete_comment]").on("click", function(){
    var comment_id = this.id.split("__")[1];
    if (comment_id) {
      commentsJx.deleteComment(post_path, comment_id);
      location.reload();
    }
  });

  // Delegated on <body>, so this also applies to buttons inserted later.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener($(this)[0]);
  });
});
// Turn every h2-h6 header into a link to "#<slug>".
// h1 is the post title, so it is deliberately left out.
$("h2, h3, h4, h5, h6").each(function(){
  var inner_html = this.innerHTML;
  // Slug: keep only letters, hyphens and spaces, lowercase the result and
  // replace each space with a hyphen. Declared with `var`; the original
  // leaked this name as an implicit global.
  var slug_source = inner_html.replace(/[^a-zA-Z\- ]/g, "");
  var inner_link = "#" + slug_source.toLowerCase().split(" ").join("-");
  // The class attribute value is now quoted like the href.
  this.innerHTML = "<a href='" + inner_link + "' class='link-reset'>" + inner_html + "</a>";
});
//turn all the tags into links, similar to what's done on the feed page
// `tags` is the tag container element inside the rendered post metadata; every
// link built below is appended to it.
var tags = $("#renderedMarkdown .metadata .tags")[0]
// Tag names for this post and the tags subscribed to, as emitted into the page
// when it was rendered.
var tags_list = ['sync', 'etl']
var subscriptions_list = []
$.each(tags_list, function(i,tag){
// NOTE: ahref, e_tag, f_tag, tag_name, tag_subscription_button_id_name, colon
// and comma are assigned without `var`, so they become implicit globals.
ahref = document.createElement("a")
// URL-safe form of the tag, used in the link target below.
e_tag = encodeURIComponent(tag)
// Form used in data attributes and element ids: the first "/" becomes "__"
// (a string pattern passed to replace() only affects the first match).
f_tag = tag.replace("/", "__")
tag_name = "#" + tag
tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
// Popover configuration, read from data attributes by the .popover() call
// made on ".pop" elements later in this script.
ahref.setAttribute("data-container", "body")
ahref.setAttribute("data-toggle", "popover")
ahref.setAttribute("data-placement", "bottom")
ahref.setAttribute("data-html", "true")
ahref.setAttribute("data-tag-name", f_tag)
// Popover content is an Unsubscribe button for subscribed tags and a
// Subscribe button otherwise; both use the same generated button id.
if (subscriptions_list.indexOf(tag) >= 0) {
ahref.setAttribute("class", "label label-subscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-primary btn-unsubscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
" </button> " +
" </div>")
} else {
ahref.setAttribute("class", "label label-unsubscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-default btn-subscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
" </button> " +
" </div>")
}
ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
ahref.setAttribute("style", "font-weight:normal")
// Before the first tag, emit the "Tags: " prefix into the container.
if (i == 0){
ahref.innerHTML = " "
colon = document.createElement("text")
colon.innerHTML = "<b>Tags</b>: "
tags.appendChild(colon)
}
ahref.innerHTML = ahref.innerHTML + tag_name
tags.appendChild(ahref)
// Separate consecutive tags with ", "; nothing follows the last one.
if (i != tags_list.length - 1){
comma = document.createElement("text")
comma.innerText = ", "
tags.appendChild(comma)
}
})
// Remove the node that immediately follows the tag container, then append an
// edit icon (id "tooltip-edit_tags") to the container itself.
tags.nextSibling.remove()
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
// Popovers use a manual trigger and are shown/hidden by the hover handlers
// below, so the pointer can move from the tag label into the popover.
$(".pop").popover({ trigger: "manual" , html: true, animation:false, delay: 100})
.on("mouseenter", function () {
var _this = this;
$(this).popover("show");
// Hide this label's popover once the pointer leaves the popover element.
$(".popover").on("mouseleave", function () {
$(_this).popover('hide');
});
}).on("mouseleave", function () {
var _this = this;
// After leaving the label, wait 300 ms and hide only if no popover is
// currently hovered.
setTimeout(function () {
if (!$(".popover:hover").length) {
$(_this).popover("hide");
}
}, 300);
});
// Click handler for the edit-tags icon: hides the icon and replaces the tag
// list with an inline form for editing the comma-separated tags.
// NOTE(review): this handler contains two tag-editing implementations. The
// first (down to the first "#tooltip-save_tags" click handler) builds the form
// with raw DOM calls and posts to /tag_list. The second then registers ANOTHER
// click handler on the same icon from inside this one, so it only takes effect
// on a later click. This looks like two template versions merged - confirm
// which one is intended.
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// NOTE: previousSibling, tags_string, form, input, tags_text and icon_class
// are assigned without `var`, so they become implicit globals.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Clear the existing tag links before inserting the form.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate every tag, then POST the raw input string and reload.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// This overwrites the outer tags_list with the edited values.
tags_list = tags_string.split(",")
// Allowed characters: letters, digits, "-", "_", ":" and "/".
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The empty check runs before trim(), so a whitespace-only entry is not
// caught here; it fails the regex test below instead.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so the reload below runs only after
// the POST has completed.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/sync_log.kp',
async: false
});
location.reload()
}
})
// Second implementation starts here; it still runs inside the outer click
// handler.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// NOTE(review): tags_text is a jQuery object, so this assignment sets a plain
// property on the wrapper rather than changing the element's content.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Three ways to save, all delegating to tagsJx.changeAndSaveTags: pressing
// Enter in the input, clicking the save icon, or submitting the form.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/sync_log.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
// Returning false cancels the default action and stops propagation.
return false;
};
});
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/sync_log.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// NOTE: this binds to every <form> on the page, not only the tag form.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/sync_log.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>
</body>
</html>

Просмотреть файл

@ -0,0 +1,13 @@
{
"title": "Bug 1291340 - Import sync log data",
"authors": [
"mreid-moz"
],
"tags": [
"sync",
"etl"
],
"publish_date": "2016-11-15",
"updated_at": "2016-11-15",
"tldr": "Read, convert, and store sync log data to Parquet form per [bug 1291340](https://bugzilla.mozilla.org/show_bug.cgi?id=1291340)."
}

Просмотреть файл

@ -0,0 +1,544 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>TxP Pulse ETL</h1>
<span class="authors"><a href="/feed?authors=sunahsuh">sunahsuh</a></span>
<span class="date_created">February 17, 2017</span>
<span class="date_updated">(Last Updated: February 21, 2017)</span>
<span class="tldr"><p>This notebook transforms pings from the Pulse testpilot test to a parquet dataset. Docs at https://github.com/mozilla/pulse/blob/master/docs/metrics.md</p></span>
</div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">import</span> <span class="nn">dateutil.parser</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings_properties</span>
<span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="k">class</span> <span class="nc">ColumnConfig</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">path</span><span class="p">,</span> <span class="n">cleaning_func</span><span class="p">,</span> <span class="n">struct_type</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span>
<span class="bp">self</span><span class="o">.</span><span class="n">path</span> <span class="o">=</span> <span class="n">path</span>
<span class="bp">self</span><span class="o">.</span><span class="n">cleaning_func</span> <span class="o">=</span> <span class="n">cleaning_func</span>
<span class="bp">self</span><span class="o">.</span><span class="n">struct_type</span> <span class="o">=</span> <span class="n">struct_type</span>
<span class="k">class</span> <span class="nc">DataFrameConfig</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col_configs</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">ColumnConfig</span><span class="p">(</span><span class="o">*</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">col_configs</span><span class="p">]</span>
<span class="k">def</span> <span class="nf">toStructType</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="n">StructType</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">StructField</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">col</span><span class="o">.</span><span class="n">struct_type</span><span class="p">,</span> <span class="bp">True</span><span class="p">),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">get_names</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">col</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">get_paths</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">col</span><span class="o">.</span><span class="n">path</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">pings_to_df</span><span class="p">(</span><span class="n">sqlContext</span><span class="p">,</span> <span class="n">pings</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="p">):</span>
<span class="sd">"""Performs simple data pipelining on raw pings</span>
<span class="sd"> Arguments:</span>
<span class="sd"> data_frame_config: a list of tuples of the form:</span>
<span class="sd"> (name, path, cleaning_func, column_type)</span>
<span class="sd"> """</span>
<span class="k">def</span> <span class="nf">build_cell</span><span class="p">(</span><span class="n">ping</span><span class="p">,</span> <span class="n">column_config</span><span class="p">):</span>
<span class="sd">"""Takes a json ping and a column config and returns a cleaned cell"""</span>
<span class="n">raw_value</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="n">column_config</span><span class="o">.</span><span class="n">path</span><span class="p">]</span>
<span class="n">func</span> <span class="o">=</span> <span class="n">column_config</span><span class="o">.</span><span class="n">cleaning_func</span>
<span class="k">if</span> <span class="n">func</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">func</span><span class="p">(</span><span class="n">raw_value</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">raw_value</span>
<span class="k">def</span> <span class="nf">ping_to_row</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="k">return</span> <span class="p">[</span><span class="n">build_cell</span><span class="p">(</span><span class="n">ping</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">columns</span><span class="p">]</span>
<span class="n">filtered_pings</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">get_paths</span><span class="p">())</span>
<span class="k">return</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="n">filtered_pings</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">ping_to_row</span><span class="p">),</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">toStructType</span><span class="p">())</span>
<span class="k">def</span> <span class="nf">__main__</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sqlContext</span><span class="p">,</span> <span class="n">submission_date</span><span class="p">):</span>
<span class="k">if</span> <span class="n">submission_date</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">submission_date</span> <span class="o">=</span> <span class="p">(</span><span class="n">date</span><span class="o">.</span><span class="n">today</span><span class="p">()</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">get_doctype_pings</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">docType</span><span class="p">:</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">"telemetry"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="n">docType</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">submissionDate</span><span class="o">=</span><span class="n">submission_date</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">appName</span><span class="o">=</span><span class="s2">"Firefox"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
<span class="k">return</span> <span class="n">pings_to_df</span><span class="p">(</span>
<span class="n">sqlContext</span><span class="p">,</span>
<span class="n">get_doctype_pings</span><span class="p">(</span><span class="s2">"testpilottest"</span><span class="p">),</span>
<span class="n">DataFrameConfig</span><span class="p">([</span>
<span class="p">(</span><span class="s2">"method"</span><span class="p">,</span> <span class="s2">"payload/payload/method"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"id"</span><span class="p">,</span> <span class="s2">"payload/payload/id"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"type"</span><span class="p">,</span> <span class="s2">"payload/payload/type"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"object"</span><span class="p">,</span> <span class="s2">"payload/payload/object"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"category"</span><span class="p">,</span> <span class="s2">"payload/payload/category"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"variant"</span><span class="p">,</span> <span class="s2">"payload/payload/variant"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"details"</span><span class="p">,</span> <span class="s2">"payload/payload/details"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"sentiment"</span><span class="p">,</span> <span class="s2">"payload/payload/sentiment"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">IntegerType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"reason"</span><span class="p">,</span> <span class="s2">"payload/payload/reason"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"adBlocker"</span><span class="p">,</span> <span class="s2">"payload/payload/adBlocker"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"addons"</span><span class="p">,</span> <span class="s2">"payload/payload/addons"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">ArrayType</span><span class="p">(</span><span class="n">StringType</span><span class="p">())),</span>
<span class="p">(</span><span class="s2">"channel"</span><span class="p">,</span> <span class="s2">"payload/payload/channel"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"hostname"</span><span class="p">,</span> <span class="s2">"payload/payload/hostname"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"language"</span><span class="p">,</span> <span class="s2">"payload/payload/language"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"openTabs"</span><span class="p">,</span> <span class="s2">"payload/payload/openTabs"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">IntegerType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"openWindows"</span><span class="p">,</span> <span class="s2">"payload/payload/openWindows"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">IntegerType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"platform"</span><span class="p">,</span> <span class="s2">"payload/payload/platform"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"protocol"</span><span class="p">,</span> <span class="s2">"payload/payload/protocol"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"telemetryId"</span><span class="p">,</span> <span class="s2">"payload/payload/telemetryId"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"timerContentLoaded"</span><span class="p">,</span> <span class="s2">"payload/payload/timerContentLoaded"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"timerFirstInteraction"</span><span class="p">,</span> <span class="s2">"payload/payload/timerFirstInteraction"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"timerFirstPaint"</span><span class="p">,</span> <span class="s2">"payload/payload/timerFirstPaint"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"timerWindowLoad"</span><span class="p">,</span> <span class="s2">"payload/payload/timerWindowLoad"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"inner_timestamp"</span><span class="p">,</span> <span class="s2">"payload/payload/timestamp"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"fx_version"</span><span class="p">,</span> <span class="s2">"payload/payload/fx_version"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"creation_date"</span><span class="p">,</span> <span class="s2">"creationDate"</span><span class="p">,</span> <span class="n">dateutil</span><span class="o">.</span><span class="n">parser</span><span class="o">.</span><span class="n">parse</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"test"</span><span class="p">,</span> <span class="s2">"payload/test"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"variants"</span><span class="p">,</span> <span class="s2">"payload/variants"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"timestamp"</span><span class="p">,</span> <span class="s2">"payload/timestamp"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"version"</span><span class="p">,</span> <span class="s2">"payload/version"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">())</span>
<span class="p">]))</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="s2">"test = 'pulse@mozilla.com'"</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">submission_date</span> <span class="o">=</span> <span class="p">(</span><span class="n">date</span><span class="o">.</span><span class="n">today</span><span class="p">()</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">tpt</span> <span class="o">=</span> <span class="n">__main__</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sqlContext</span><span class="p">,</span> <span class="n">submission_date</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">tpt</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s1">'s3://telemetry-parquet/testpilot/txp_pulse/v1/submission_date={}'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">submission_date</span><span class="p">))</span>
</pre></div>
</div>
</div>
</div>
</div>
</div></body></html>
<!-- Empty placeholder containers: nothing is rendered inside them on this page. -->
<div>
</div>
<br>
<div class="row">
  <div class="col-md-12">
  </div>
</div>
<script type="text/javascript">
$("#searchbar")[0].setSelectionRange(1000, 1000);
$('#searchbar').typeahead({
hint: false,
highlight: true,
minLength: 1
},
{
name: 'knowledge_posts',
limit: 10,
display: function (item) {
return item.title + " - " + item.author;
},
templates: {
empty: Handlebars.compile(
'<div class="tt-not-found">' +
'Unable to find any posts that match the current query' +
'</div>'
),
suggestion: function(data) {
return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + data.title + '</strong> – ' + data.author + '</p>';
}
},
source: function(q, sync, async) {
$.ajax('/ajax/index/typeahead?search=' + q,
{
success: function(data,status){ async(JSON.parse(data)); }
})
}
});
$('#searchbar').bind('typeahead:select', function(obj, datum, name) {
window.location = '/post/'+encodeURIComponent(datum.path);
});
$('#searchbar').keypress(function(event){
var keycode = (event.keyCode ? event.keyCode : event.which);
if(keycode == '13'){
var path = document.location.pathname;
window.location = '/feed?filters=' + $('#searchbar').val()
}
});
var padding = $('.tt-menu').outerWidth()
$('.tt-menu').width($('#searchbar').width() + padding + "px")
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
$("document").ready(function(){
var is_webeditor = false;
var post_id = "33";
var id = "None";
var post_path = "etl/testpilot/pulse.kp"
var data_repo_github_root = ""
tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);
$(".btn-rendered").on("click", function(){
document.location.href = "/post/" + encodeURI(post_path);
})
$(".btn-raw").on("click", function(){
document.location.href = "/post/" + encodeURI(post_path) + "?render=raw";
})
$(".btn-webeditor").on("click", function(){
document.location.href = "/edit/" + encodeURI(post_path);
})
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration block: calls MathJax.Hub.Config to set up TeX input
// with HTML-CSS output.
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math is delimited by $...$ or \(...\).
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Display math is delimited by $$...$$ or \[...\].
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// Per the MathJax tex2jax documentation, this lets \$ stand for a literal dollar sign.
processEscapes: true
},
// Only the "TeX" font set is listed as available for the HTML-CSS output.
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in 2017; load a pinned MathJax 2.x release from cdnjs instead. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js" type="text/javascript">
</script>
<script>
// Post-page setup that waits for the DOM: marks the rendered-view button active,
// linkifies headers, and wires the comment post/delete buttons and the
// tag-subscription buttons.
// Registered with .ready() rather than .on('ready', ...): binding the "ready"
// event is deprecated since jQuery 1.8 and removed in jQuery 3.0, where such a
// handler is never called.
$(document).ready(function(){
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");
  // Initialize headers
  helpersJx.linkifyHeaders();
  // Initialize comments
  var post_path = 'etl/testpilot/pulse.kp';
  $("#post_comment_btn").on('click', function(){
    // Assigned without var, as before, so these stay global bindings.
    comment_author = 'knowledge_default';
    post_author = 'sunahsuh';
    post_title = 'TxP Pulse ETL';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    location.reload();
  });
  // Every element whose id starts with "delete_comment" is a delete button; the
  // comment id is the part of the element id after the "__" separator.
  all_comment_delete_buttons = $("[id^=delete_comment]");
  $.each(all_comment_delete_buttons, function(i,v){
    $(v).on("click", function(){
      var id = v.id;
      var comment_id = id.split("__")[1];
      if(comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });
  // Delegated on body so buttons created later (inside popovers) are handled too.
  $(document.body).on('click',"button[id^=tag-subscription]",function () {
    tagsJx.addTagSubscriptionListener($(this)[0]);
  });
});
// Wrap the contents of every h2-h6 in a same-page link. The link target is the
// header's markup with everything except letters, hyphens and spaces removed,
// lower-cased, and with spaces turned into hyphens. h1 is the post title and is
// left untouched.
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")];
all_headers.forEach(function (headers_at_level) {
  headers_at_level.each(function () {
    var original_markup = this.innerHTML;
    // Assigned without var, as before, so the global binding is preserved.
    inner_html_no_special = original_markup.replace(/[^a-zA-Z\- ]/g, "");
    var slug = inner_html_no_special.toLowerCase().split(" ").join("-");
    this.innerHTML = "<a href='#" + slug + "' class=link-reset>" + original_markup + "</a>";
  });
});
// Turn all the tags into links, similar to what's done on the feed page.
// For each entry of tags_list an <a> pointing at /tag_pages?tag=<tag> is appended to
// the first ".tags" element inside "#renderedMarkdown .metadata". Each link carries
// popover data-* attributes whose content is an Unsubscribe button when the tag is in
// subscriptions_list, and a Subscribe button otherwise.
// NOTE(review): the names assigned inside the loop without var (ahref, e_tag, f_tag,
// tag_name, tag_subscription_button_id_name, colon, comma) become globals.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['testpilot', 'etl']
var subscriptions_list = []
$.each(tags_list, function(i,tag){
ahref = document.createElement("a")
e_tag = encodeURIComponent(tag)
// With a string pattern, replace() substitutes only the first "/" in the tag.
f_tag = tag.replace("/", "__")
tag_name = "#" + tag
// Button id format: "tag-subscription-<index>__<f_tag>".
tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
ahref.setAttribute("data-container", "body")
ahref.setAttribute("data-toggle", "popover")
ahref.setAttribute("data-placement", "bottom")
ahref.setAttribute("data-html", "true")
ahref.setAttribute("data-tag-name", f_tag)
if (subscriptions_list.indexOf(tag) >= 0) {
ahref.setAttribute("class", "label label-subscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-primary btn-unsubscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
" </button> " +
" </div>")
} else {
ahref.setAttribute("class", "label label-unsubscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-default btn-subscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
" </button> " +
" </div>")
}
ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
ahref.setAttribute("style", "font-weight:normal")
// The first tag is preceded by a bold "Tags: " label.
if (i == 0){
ahref.innerHTML = " "
// NOTE(review): "text" is not a standard HTML element name.
colon = document.createElement("text")
colon.innerHTML = "<b>Tags</b>: "
tags.appendChild(colon)
}
ahref.innerHTML = ahref.innerHTML + tag_name
tags.appendChild(ahref)
// Every tag except the last is followed by a ", " separator.
if (i != tags_list.length - 1){
comma = document.createElement("text")
comma.innerText = ", "
tags.appendChild(comma)
}
})
// Removes whatever node directly follows the tags element.
// NOTE(review): this throws if the tags element has no next sibling.
tags.nextSibling.remove()
// Appends the edit icon by re-assigning innerHTML, which re-creates the child elements built above.
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
// Hover-driven popovers for every ".pop" element. The popover is opened manually
// on mouseenter and closed either when the pointer leaves the popover itself or,
// 300 ms after leaving the trigger, if no popover is being hovered.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on("mouseenter", function () {
    var trigger = $(this);
    trigger.popover("show");
    $(".popover").on("mouseleave", function () {
      trigger.popover("hide");
    });
  })
  .on("mouseleave", function () {
    var trigger = $(this);
    setTimeout(function () {
      if ($(".popover:hover").length === 0) {
        trigger.popover("hide");
      }
    }, 300);
  });
// Click handler for the edit-tags icon: replaces the tag links with an inline form
// (text input + save icon) so the tag list can be edited and saved.
// NOTE(review): this handler contains two overlapping implementations. The first
// builds the form with DOM calls and POSTs to /tag_list itself; the second, further
// down, rebuilds the form with jQuery and delegates saving to tagsJx.changeAndSaveTags.
// Because the second is registered from inside this handler, every click on the edit
// icon registers its handlers again. Confirm which implementation is intended.
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// The assignments below without var (previousSibling, tags_string, form, input,
// tags_text, icon_class) write to globals.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Assigning textContent replaces all existing children of the tags element.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate the comma-separated tags, POST them, then reload the page.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// Overwrites the global tags_list with the raw, untrimmed pieces.
tags_list = tags_string.split(",")
// Allowed characters in a trimmed tag: letters, digits, "-", "_", ":" and "/".
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The length check runs before trimming, so it only catches a completely empty piece.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so the reload below runs after it completes.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/testpilot/pulse.kp',
async: false
});
location.reload()
}
})
// Removes whatever node directly follows the tags element.
// NOTE(review): this throws if the tags element has no next sibling.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Second implementation, registered on each run of the enclosing handler.
// NOTE(review): presumably the icon inserted above carries id "tooltip-edit_tags" - confirm in icons.js.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this sets a plain property on the wrapper
// rather than changing the element's markup.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter in the input saves the tags; returning false cancels the default key handling.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/testpilot/pulse.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/testpilot/pulse.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Bound to every <form> on the page, not only the one created above; returning
// false cancels the native form submission.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/testpilot/pulse.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,662 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Page-specific title so tabs, bookmarks and history entries identify the post. -->
  <title>TxP Pulse ETL - Knowledge</title>
  <!-- JS is included at the top because JS embedded in posts was colliding with it. -->
  <script src="/static/modules/jquery/jquery.min.js"></script>
  <script src="/static/modules/tether/js/tether.min.js"></script>
  <script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
  <script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
  <script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
  <script src="/static/modules/handlebars/js/handlebars.js"></script>
  <script src="/static/js/helpers.js"></script>
  <script src="/static/modules/select2/js/select2.min.js"></script>
  <script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
  <script src="/static/modules/marked.js/marked.js"></script>
  <!-- require.js is used for plotly, but has a bunch of collisions with other JS packages;
       make sure it is the last JS package imported. -->
  <script src="/static/modules/require.js/require.min.js"></script>
  <!--[if lt IE 9]>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
  <![endif]-->
  <link rel="stylesheet" href="/static/modules/bootstrap/css/bootstrap.min.css">
  <link rel="stylesheet" href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css">
  <link rel="stylesheet" href="/static/css/custom.css">
  <link rel="stylesheet" href="/static/modules/select2/css/select2.min.css">
  <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700">
  <link rel="shortcut icon" href="/static/images/favicon.png">
  <link rel="stylesheet" href="/static/css/codehilite-friendly.css">
  <style>
    /* Centres the fixed-position spinner in the viewport. */
    .spinner {
      position: fixed;
      top: 50%;
      left: 50%;
      margin-left: -50px; /* half width of the spinner gif */
      margin-top: -50px; /* half height of the spinner gif */
      text-align: center;
      z-index: 1234;
      overflow: auto;
      width: 100px; /* width of the spinner gif */
      height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
    }
    .table {
      font-size: 14px;
    }
    .modal-content {
      max-width: 1024px;
    }
  </style>
</head>
<body>
<!-- Top navigation bar: logo, section links, "Write a Post!" button and the search box. -->
<div class="navbar navbar-knowledge" role="navigation">
  <div class="container page-container">
    <a href="/" aria-selected="false" class="logo-image navbar-text" style="margin-top:16px; margin-left: -7px">
      <!-- img is a void element (no end tag); the alt text names the link destination. -->
      <img width="125" src="/static/images/logo-white.svg" alt="Knowledge home">
    </a>
    <a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style="margin-top:18px">
      Home
    </a>
    <a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style="margin-top:18px">
      Favorites
    </a>
    <a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style="margin-top:18px">
      About
    </a>
    <a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style="margin-top:18px">
      Stats
    </a>
    <a id="webposts_tab" href="/create"
       class="btn btn-primary navbar-text pull-right"
       style="border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699">
      Write a Post!
    </a>
    <div class="pull-right">
      <div class="form-group">
        <!-- The placeholder alone is not an accessible name, so aria-label supplies one. -->
        <input class="form-control" id="searchbar" placeholder="Search for Knowledge" aria-label="Search for Knowledge" style="text-align:right">
      </div>
    </div>
  </div>
</div>
<div class="container page-container">
<br>
<!-- Toolbar above the post: view-mode buttons on the left, page-view statistics
     and the "like" icon on the right. -->
<div class="container-fluid">
  <div class="row">
    <div class="col-md-6">
      <div class="btn-group" role="group">
        <button class="btn btn-default btn-rendered" type="button">
          View Post
        </button>
        <button class="btn btn-default btn-raw" type="button">
          View Raw Markdown
        </button>
      </div>
    </div>
    <div class="col-md-2">
    </div>
    <div class="col-md-4 text-right">
      <!-- NOTE(review): the icon and the div below share id "pageview_stats"; ids should
           be unique. Confirm which element the scripts target before renaming either. -->
      <i class="glyphicon glyphicon-eye-open" id="pageview_stats" style="color: #9CA299"></i>
      <div id="pageview_stats" style="display: inline-block">
        Viewed 1 times by 1 different users
      </div>
      <i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop"
         id="tooltip-like"
         style="font-size:16pt"
         data-placement="bottom"
         data-trigger="#tooltip-like"
         data-container="body"
         data-toggle="popover"
         data-content="<div>Like This Post</div>"></i>
    </div>
  </div>
  <div class="row col-md-12">
  </div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
  <!-- Post header: title, author link, dates and summary. The empty "tags" span is
       presumably filled in by the page's tag script - confirm before restructuring. -->
  <h1>TxP Pulse ETL</h1>
  <span class="authors"><a href="/feed?authors=sunahsuh">sunahsuh</a></span>
  <span class="date_created">February 17, 2017</span>
  <span class="date_updated">(Last Updated: February 21, 2017)</span>
  <span class="tldr"><p>This notebook transforms pings from the Pulse testpilot test to a parquet dataset. Docs at https://github.com/mozilla/pulse/blob/master/docs/metrics.md</p></span>
  <span class="tags"></span>
</div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">import</span> <span class="nn">dateutil.parser</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings_properties</span>
<span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="k">class</span> <span class="nc">ColumnConfig</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">path</span><span class="p">,</span> <span class="n">cleaning_func</span><span class="p">,</span> <span class="n">struct_type</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span>
<span class="bp">self</span><span class="o">.</span><span class="n">path</span> <span class="o">=</span> <span class="n">path</span>
<span class="bp">self</span><span class="o">.</span><span class="n">cleaning_func</span> <span class="o">=</span> <span class="n">cleaning_func</span>
<span class="bp">self</span><span class="o">.</span><span class="n">struct_type</span> <span class="o">=</span> <span class="n">struct_type</span>
<span class="k">class</span> <span class="nc">DataFrameConfig</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col_configs</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">ColumnConfig</span><span class="p">(</span><span class="o">*</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">col_configs</span><span class="p">]</span>
<span class="k">def</span> <span class="nf">toStructType</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="n">StructType</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">StructField</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">col</span><span class="o">.</span><span class="n">struct_type</span><span class="p">,</span> <span class="bp">True</span><span class="p">),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">get_names</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">col</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">get_paths</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">col</span><span class="o">.</span><span class="n">path</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">pings_to_df</span><span class="p">(</span><span class="n">sqlContext</span><span class="p">,</span> <span class="n">pings</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Performs simple data pipelining on raw pings</span>
<span class="sd"> Arguments:</span>
<span class="sd"> data_frame_config: a list of tuples of the form:</span>
<span class="sd"> (name, path, cleaning_func, column_type)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">build_cell</span><span class="p">(</span><span class="n">ping</span><span class="p">,</span> <span class="n">column_config</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Takes a json ping and a column config and returns a cleaned cell&quot;&quot;&quot;</span>
<span class="n">raw_value</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="n">column_config</span><span class="o">.</span><span class="n">path</span><span class="p">]</span>
<span class="n">func</span> <span class="o">=</span> <span class="n">column_config</span><span class="o">.</span><span class="n">cleaning_func</span>
<span class="k">if</span> <span class="n">func</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">func</span><span class="p">(</span><span class="n">raw_value</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">raw_value</span>
<span class="k">def</span> <span class="nf">ping_to_row</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="k">return</span> <span class="p">[</span><span class="n">build_cell</span><span class="p">(</span><span class="n">ping</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">columns</span><span class="p">]</span>
<span class="n">filtered_pings</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">get_paths</span><span class="p">())</span>
<span class="k">return</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="n">filtered_pings</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">ping_to_row</span><span class="p">),</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">toStructType</span><span class="p">())</span>
<span class="k">def</span> <span class="nf">__main__</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sqlContext</span><span class="p">,</span> <span class="n">submission_date</span><span class="p">):</span>
<span class="k">if</span> <span class="n">submission_date</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">submission_date</span> <span class="o">=</span> <span class="p">(</span><span class="n">date</span><span class="o">.</span><span class="n">today</span><span class="p">()</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">get_doctype_pings</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">docType</span><span class="p">:</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">&quot;telemetry&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="n">docType</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">submissionDate</span><span class="o">=</span><span class="n">submission_date</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">appName</span><span class="o">=</span><span class="s2">&quot;Firefox&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
<span class="k">return</span> <span class="n">pings_to_df</span><span class="p">(</span>
<span class="n">sqlContext</span><span class="p">,</span>
<span class="n">get_doctype_pings</span><span class="p">(</span><span class="s2">&quot;testpilottest&quot;</span><span class="p">),</span>
<span class="n">DataFrameConfig</span><span class="p">([</span>
<span class="p">(</span><span class="s2">&quot;method&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/method&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/id&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;type&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/type&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;object&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/object&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;category&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/category&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;variant&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/variant&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;details&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/details&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;sentiment&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/sentiment&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">IntegerType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;reason&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/reason&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;adBlocker&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/adBlocker&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;addons&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/addons&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">ArrayType</span><span class="p">(</span><span class="n">StringType</span><span class="p">())),</span>
<span class="p">(</span><span class="s2">&quot;channel&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/channel&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;hostname&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/hostname&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;language&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/language&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;openTabs&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/openTabs&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">IntegerType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;openWindows&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/openWindows&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">IntegerType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;platform&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/platform&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;protocol&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/protocol&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;telemetryId&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/telemetryId&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;timerContentLoaded&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/timerContentLoaded&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;timerFirstInteraction&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/timerFirstInteraction&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;timerFirstPaint&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/timerFirstPaint&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;timerWindowLoad&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/timerWindowLoad&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;inner_timestamp&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/timestamp&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;fx_version&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/fx_version&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;creation_date&quot;</span><span class="p">,</span> <span class="s2">&quot;creationDate&quot;</span><span class="p">,</span> <span class="n">dateutil</span><span class="o">.</span><span class="n">parser</span><span class="o">.</span><span class="n">parse</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;test&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/test&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;variants&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/variants&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;timestamp&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/timestamp&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;version&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/version&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">())</span>
<span class="p">]))</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="s2">&quot;test = &#39;pulse@mozilla.com&#39;&quot;</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">submission_date</span> <span class="o">=</span> <span class="p">(</span><span class="n">date</span><span class="o">.</span><span class="n">today</span><span class="p">()</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">tpt</span> <span class="o">=</span> <span class="n">__main__</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sqlContext</span><span class="p">,</span> <span class="n">submission_date</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">tpt</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s1">&#39;s3://telemetry-parquet/testpilot/txp_pulse/v1/submission_date={}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">submission_date</span><span class="p">))</span>
</pre></div>
</div>
</div>
</div>
</div>
<!-- Comment section: heading, comment input and the "Post Comment" button.
     The ids comment-text and post_comment_btn are kept unchanged because page
     scripts look them up by id. -->
<div class="container-fluid">
  <div class="row">
    <div class="col-md-12">
      <h2> 0 Comments </h2>
      <br>
      <div>
        <div class="row">
          <div class="col-md-12">
            <!-- The placeholder is not an accessible name, so aria-label supplies one. -->
            <textarea class="form-control" id="comment-text" style="height:87px;" placeholder="Leave a comment..." aria-label="Comment"></textarea>
          </div>
        </div>
      </div>
      <br>
      <div>
        <button class="btn btn-primary" id="post_comment_btn" type="button">Post Comment</button>
      </div>
    </div>
  </div>
  <br>
  <div class="row">
    <div class="col-md-12">
    </div>
    <br>
  </div>
</div>
</div>
</div>
<!-- Page footer: Knowledge Repo attribution with a link to the 0.7.4 release.
     The relative times ("4 minutes ago", "14 hours ago") are literal text in
     this file; nothing in this block recomputes them. -->
<div class="footer">
Served with <span class="glyphicon glyphicon-heart"></span> by <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a> <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br />
<i title="Last checked for updates: 4 minutes ago">Last indexed: 14 hours ago</i>
</div>
<script type='text/javascript'>
// Search box wiring: typeahead suggestions, jump-to-post when a suggestion is
// selected, and Enter-to-search on the feed page.
//
// Everything is skipped when the page has no #searchbar element: in that case
// $("#searchbar")[0] is undefined and the first call would throw, aborting the
// rest of this script.
if ($('#searchbar').length) {
  // Collapse the selection at offset 1000, i.e. at the end of any value
  // shorter than that.
  $("#searchbar")[0].setSelectionRange(1000, 1000);

  $('#searchbar').typeahead({
    hint: false,
    highlight: true,
    minLength: 1
  },
  {
    name: 'knowledge_posts',
    limit: 10,
    // Text placed in the input when a suggestion is chosen.
    display: function (item) {
      return item.title + " - " + item.author;
    },
    templates: {
      empty: Handlebars.compile(
        '<div class="tt-not-found">' +
        'Unable to find any posts that match the current query' +
        '</div>'
      ),
      // Title and author come from the server response; escape them before
      // concatenating into markup so they cannot inject HTML.
      suggestion: function(data) {
        var escapeHtml = Handlebars.Utils.escapeExpression;
        return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + escapeHtml(data.title) + '</strong> – ' + escapeHtml(data.author) + '</p>';
      }
    },
    // Suggestions are fetched asynchronously; the response body is a JSON
    // string that is parsed before being handed to typeahead.
    source: function(q, sync, async) {
      // Encode the query so characters such as "&", "#" or "+" stay part of
      // the search term instead of altering the URL.
      $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
      {
        success: function(data, status){ async(JSON.parse(data)); }
      });
    }
  });

  // Selecting a suggestion navigates to that post.
  $('#searchbar').bind('typeahead:select', function(obj, datum, name) {
    window.location = '/post/' + encodeURIComponent(datum.path);
  });

  // Pressing Enter (key code 13) searches the feed for the typed text.
  $('#searchbar').keypress(function(event){
    var keycode = (event.keyCode ? event.keyCode : event.which);
    if (keycode == 13) {
      window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
    }
  });

  // Size the suggestion menu from the search box width plus the menu's own
  // outer width.
  var padding = $('.tt-menu').outerWidth();
  $('.tt-menu').width($('#searchbar').width() + padding + "px");
}
</script>
<script src="/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Once the DOM is ready: initialise tooltips for this post and make the
// rendered / raw / web-editor buttons navigate to the matching view.
$(document).ready(function () {
  var isWebeditor = false;
  var postId = "33";
  var commentId = "None";
  var dataRepoGithubRoot = "";
  // The post path is constant, so it is encoded once and reused by all three
  // click handlers.
  var encodedPostPath = encodeURI("etl/testpilot/pulse.kp");

  tooltipsJx.initializeTooltips(isWebeditor, postId, commentId, dataRepoGithubRoot);

  // Button selector -> destination URL for this post.
  var destinations = {
    ".btn-rendered": "/post/" + encodedPostPath,
    ".btn-raw": "/post/" + encodedPostPath + "?render=raw",
    ".btn-webeditor": "/edit/" + encodedPostPath
  };
  $.each(destinations, function (selector, url) {
    $(selector).on("click", function () {
      document.location.href = url;
    });
  });
});
</script>
<!-- Page-behaviour scripts, loaded before the inline scripts that follow.
     These use root-relative /static/js/ URLs, so they resolve against
     whichever origin serves this file.
     NOTE(review): presumably these define the helpersJx, tagsJx, iconsJx and
     commentsJx objects used by the inline scripts below; confirm.
     NOTE(review): the page head template also appears to request helpers.js
     through an absolute URL; confirm whether this second include is needed. -->
<script src="/static/js/helpers.js"></script>
<script src="/static/js/tags.js" type="text/javascript"></script>
<script src="/static/js/icons.js" type="text/javascript"></script>
<script src="/static/js/comments.js" type="text/javascript"></script>
<!-- MathJax configuration: TeX input, HTML-CSS output, with $...$ and \(...\)
     as inline delimiters and $$...$$ and \[...\] as display delimiters.
     processEscapes lets \$ produce a literal dollar sign. -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- The cdn.mathjax.org host was retired in April 2017. MathJax is loaded
     from cdnjs instead, pinned to a 2.x release because the configuration
     above uses the MathJax 2 API (MathJax.Hub.Config). -->
<script type="text/javascript"
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js">
</script>
<script>
// DOM-ready setup for the post page: marks the "rendered" button active,
// linkifies headers, and wires the comment and tag-subscription buttons.
//
// Bound with .ready() rather than .on('ready', ...): the "ready" event binding
// was removed in jQuery 3.0, and even in older versions it never fires for a
// handler attached after the DOM has already loaded.
$(document).ready(function(){
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'etl/testpilot/pulse.kp';

  $("#post_comment_btn").on('click', function(){
    // Declared locally; these were previously implicit globals.
    var comment_author = 'knowledge_default';
    var post_author = 'sunahsuh';
    var post_title = 'TxP Pulse ETL';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): the page reloads immediately after postComment returns.
    // If postComment sends its request asynchronously the reload could cut it
    // off; confirm against comments.js.
    location.reload();
  });

  // Delete buttons carry ids of the form "delete_comment...__<comment id>";
  // the part after "__" is passed to deleteComment.
  var all_comment_delete_buttons = $("[id^=delete_comment]");
  all_comment_delete_buttons.on("click", function(){
    var comment_id = this.id.split("__")[1];
    if (comment_id) {
      commentsJx.deleteComment(post_path, comment_id);
      location.reload();
    }
  });

  // Delegated on <body> so buttons created later (the subscribe/unsubscribe
  // buttons live inside popover markup) are handled too.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener($(this)[0]);
  });
});
// Wrap the content of every h2-h6 heading in a link to "#<slug>".
// h1 is the post title and is deliberately left alone.
//
// The slug is the heading's innerHTML with every character other than ASCII
// letters, hyphens and spaces removed (digits are dropped too), lower-cased,
// with spaces replaced by hyphens.
// NOTE(review): nothing in this block assigns matching id attributes to the
// headings; confirm the link targets exist.
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")];
$.each(all_headers, function(index, headers){
  headers.each(function(){
    var inner_html = this.innerHTML;
    // Declared locally; this was previously an implicit global.
    var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "");
    var inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-");
    this.innerHTML = "<a href='" + inner_link + "' class=link-reset>" + inner_html + "</a>";
  });
});
// Turn each of the post's tags into a link to its tag page, similar to what is
// done on the feed page, and append them to the metadata "tags" element.
// Each link carries Bootstrap popover data-* attributes whose content is a
// subscribe/unsubscribe button.
//
// `tags` is the first element matching the selector. If nothing matches it is
// undefined, and the appendChild / nextSibling calls below will throw.
var tags = $("#renderedMarkdown .metadata .tags")[0]
// Tags of this post.
var tags_list = ['testpilot', 'etl']
// Tags treated as already subscribed. It is empty here, so every tag takes the
// "unsubscribed" branch below.
var subscriptions_list = []
$.each(tags_list, function(i,tag){
// NOTE: ahref, e_tag, f_tag, tag_name, tag_subscription_button_id_name, colon
// and comma are assigned without var, so they become globals.
ahref = document.createElement("a")
// URL-safe form of the tag, used in the link's query string.
e_tag = encodeURIComponent(tag)
// Form used in ids and data attributes. A string pattern replaces only the
// first "/" in the tag.
f_tag = tag.replace("/", "__")
tag_name = "#" + tag
// Button id of the form "tag-subscription-<index>__<f_tag>"; it matches the
// button[id^=tag-subscription] selector used by the delegated click handler.
tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
ahref.setAttribute("data-container", "body")
ahref.setAttribute("data-toggle", "popover")
ahref.setAttribute("data-placement", "bottom")
ahref.setAttribute("data-html", "true")
ahref.setAttribute("data-tag-name", f_tag)
// The "pop" class is what the hover-popover code selects with $(".pop").
if (subscriptions_list.indexOf(tag) >= 0) {
// Subscribed: the popover offers an Unsubscribe button.
ahref.setAttribute("class", "label label-subscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-primary btn-unsubscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
" </button> " +
" </div>")
} else {
// Not subscribed: the popover offers a Subscribe button.
ahref.setAttribute("class", "label label-unsubscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-default btn-subscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
" </button> " +
" </div>")
}
ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
ahref.setAttribute("style", "font-weight:normal")
// Before the first tag, emit the bold "Tags:" label and give the first link a
// leading space.
if (i == 0){
ahref.innerHTML = " "
colon = document.createElement("text")
colon.innerHTML = "<b>Tags</b>: "
tags.appendChild(colon)
}
ahref.innerHTML = ahref.innerHTML + tag_name
tags.appendChild(ahref)
// Separate tags with ", " except after the last one.
if (i != tags_list.length - 1){
comma = document.createElement("text")
comma.innerText = ", "
tags.appendChild(comma)
}
})
// Remove the node that immediately follows the tags element.
// NOTE(review): presumably a leftover text node or placeholder from the
// rendered metadata; confirm what this node is.
tags.nextSibling.remove()
// Append the edit-tags icon. Its id, tooltip-edit_tags, is what the tag editor
// click handler binds to.
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
// Hover-driven popovers for every ".pop" element. The popover is triggered
// manually so it can stay open while the pointer moves from the trigger onto
// the popover itself.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on("mouseenter", function () {
    var trigger = $(this);
    trigger.popover("show");
    // Leaving the popover body closes it.
    $(".popover").on("mouseleave", function () {
      trigger.popover("hide");
    });
  })
  .on("mouseleave", function () {
    var trigger = $(this);
    // Wait 300 ms, then close only if the pointer is not over a popover.
    setTimeout(function () {
      if ($(".popover:hover").length === 0) {
        trigger.popover("hide");
      }
    }, 300);
  });
// Tag editor. Clicking the edit icon replaces the tag links with a one-line
// form holding the comma-separated tag list.
//
// NOTE(review): this handler contains two implementations nested in one
// another. The first builds the form with raw DOM calls and saves via a
// synchronous POST. The second is registered from inside this handler (so it
// is added again on every click of the original icon), builds the form with
// jQuery and saves through tagsJx.changeAndSaveTags. This looks like an old
// and a new version of the same feature merged together; confirm which one is
// intended before changing behaviour.
$('#tooltip-edit_tags').click(function(){
// Hide the edit icon while editing.
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// NOTE: previousSibling, tags_string, form, input, tags_text and icon_class
// are assigned without var, so they become globals. previousSibling is not
// read again in this block.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
// Save icon; the id tooltip-save_tags is what the save click handler binds to.
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Setting textContent removes every child of the tags element (the tag links
// and the edit icon) before the form is inserted.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// First save implementation: validate the tags, POST them, reload the page.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// Reassigns the outer tags_list. Entries are not trimmed here, so they may
// keep leading or trailing spaces.
tags_list = tags_string.split(",")
// Allowed characters: letters, digits, "-", "_", ":" and "/", case-insensitive.
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The length check runs before trimming, so a whitespace-only entry is not
// caught here; it fails the pattern test below instead.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
// The raw input string, not the split list, is sent as JSON {"tags": "..."}.
var postContent = {}
postContent['tags'] = tags_string
// async: false makes the request synchronous, so the reload below only runs
// after the request has finished.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/testpilot/pulse.kp',
async: false
});
location.reload()
}
})
// Remove the node that immediately follows the tags element.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Second implementation. The original icon was removed when textContent was
// reset above.
// NOTE(review): presumably the icon from createEditTagsIcon carries the
// tooltip-edit_tags id this selector needs; confirm against icons.js.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this sets a plain property on the
// wrapper and does not change the element's markup; the label text was
// already set by .html() above.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter (key code 13) in the input saves; returning false stops the default
// form submission.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/testpilot/pulse.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/testpilot/pulse.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Submitting saves as well. The selector matches every form on the page, not
// only the one created above.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/testpilot/pulse.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>
</body>
</html>

Просмотреть файл

@ -0,0 +1,13 @@
{
"title": "TxP Pulse ETL",
"authors": [
"sunahsuh"
],
"tags": [
"testpilot",
"etl"
],
"publish_date": "2017-02-17",
"updated_at": "2017-02-17",
"tldr": "This notebook transforms pings from the Pulse testpilot test to a parquet dataset. Docs at https://github.com/mozilla/pulse/blob/master/docs/metrics.md"
}

Просмотреть файл

@ -0,0 +1,553 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- JS includes are placed at the top to avoid collisions with JS embedded in posts -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require.js is used for plotly, but it collides with several other JS packages;
keep it as the last JS package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
/* Loading spinner, centred in the viewport: the negative margins pull it back
   by half of its own size. */
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif + 2px to fix an IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>TxP Snoozetabs ETL</h1>
<span class="authors"><a href="/feed?authors=sunahsuh">sunahsuh</a></span>
<span class="date_created">February 17, 2017</span>
<span class="date_updated">(Last Updated: March 20, 2017)</span>
<span class="tldr"><p>This notebook transforms pings from the SnoozeTabs testpilot test to a parquet dataset. Docs at https://github.com/bwinton/SnoozeTabs/blob/master/docs/metrics.md</p></span>
</div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">import</span> <span class="nn">dateutil.parser</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">import</span> <span class="nn">boto3</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings_properties</span>
<span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="k">class</span> <span class="nc">ColumnConfig</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">path</span><span class="p">,</span> <span class="n">cleaning_func</span><span class="p">,</span> <span class="n">struct_type</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span>
<span class="bp">self</span><span class="o">.</span><span class="n">path</span> <span class="o">=</span> <span class="n">path</span>
<span class="bp">self</span><span class="o">.</span><span class="n">cleaning_func</span> <span class="o">=</span> <span class="n">cleaning_func</span>
<span class="bp">self</span><span class="o">.</span><span class="n">struct_type</span> <span class="o">=</span> <span class="n">struct_type</span>
<span class="k">class</span> <span class="nc">DataFrameConfig</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col_configs</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">ColumnConfig</span><span class="p">(</span><span class="o">*</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">col_configs</span><span class="p">]</span>
<span class="k">def</span> <span class="nf">toStructType</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="n">StructType</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">StructField</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">col</span><span class="o">.</span><span class="n">struct_type</span><span class="p">,</span> <span class="bp">True</span><span class="p">),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">get_names</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">col</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">get_paths</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">col</span><span class="o">.</span><span class="n">path</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">pings_to_df</span><span class="p">(</span><span class="n">sqlContext</span><span class="p">,</span> <span class="n">pings</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="p">):</span>
<span class="sd">"""Performs simple data pipelining on raw pings</span>
<span class="sd"> Arguments:</span>
<span class="sd"> data_frame_config: a list of tuples of the form:</span>
<span class="sd"> (name, path, cleaning_func, column_type)</span>
<span class="sd"> """</span>
<span class="k">def</span> <span class="nf">build_cell</span><span class="p">(</span><span class="n">ping</span><span class="p">,</span> <span class="n">column_config</span><span class="p">):</span>
<span class="sd">"""Takes a json ping and a column config and returns a cleaned cell"""</span>
<span class="n">raw_value</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="n">column_config</span><span class="o">.</span><span class="n">path</span><span class="p">]</span>
<span class="n">func</span> <span class="o">=</span> <span class="n">column_config</span><span class="o">.</span><span class="n">cleaning_func</span>
<span class="k">if</span> <span class="n">func</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">func</span><span class="p">(</span><span class="n">raw_value</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">raw_value</span>
<span class="k">def</span> <span class="nf">ping_to_row</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="k">return</span> <span class="p">[</span><span class="n">build_cell</span><span class="p">(</span><span class="n">ping</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">columns</span><span class="p">]</span>
<span class="n">filtered_pings</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">get_paths</span><span class="p">())</span>
<span class="k">return</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="n">filtered_pings</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">ping_to_row</span><span class="p">),</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">toStructType</span><span class="p">())</span>
<span class="k">def</span> <span class="nf">save_df</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">date_partition</span><span class="p">,</span> <span class="n">partitions</span><span class="o">=</span><span class="mi">1</span><span class="p">):</span>
<span class="k">if</span> <span class="n">date_partition</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">partition_str</span> <span class="o">=</span> <span class="s2">"/submission={day}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">day</span><span class="o">=</span><span class="n">date_partition</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">partition_str</span><span class="o">=</span><span class="s2">""</span>
<span class="n">path_fmt</span> <span class="o">=</span> <span class="s2">"s3n://telemetry-parquet/harter/cliqz_{name}/v1{partition_str}"</span>
<span class="n">path</span> <span class="o">=</span> <span class="n">path_fmt</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span> <span class="n">partition_str</span><span class="o">=</span><span class="n">partition_str</span><span class="p">)</span>
<span class="n">df</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">partitions</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="s2">"overwrite"</span><span class="p">)</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">__main__</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sqlContext</span><span class="p">,</span> <span class="n">submission_date</span><span class="p">):</span>
<span class="k">if</span> <span class="n">submission_date</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">submission_date</span> <span class="o">=</span> <span class="p">(</span><span class="n">date</span><span class="o">.</span><span class="n">today</span><span class="p">()</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">"%Y%m</span><span class="si">%d</span><span class="s2">"</span><span class="p">)</span>
<span class="n">get_doctype_pings</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">docType</span><span class="p">:</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">"telemetry"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="n">docType</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">submissionDate</span><span class="o">=</span><span class="n">submission_date</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">appName</span><span class="o">=</span><span class="s2">"Firefox"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
<span class="n">old_st</span> <span class="o">=</span> <span class="n">pings_to_df</span><span class="p">(</span>
<span class="n">sqlContext</span><span class="p">,</span>
<span class="n">get_doctype_pings</span><span class="p">(</span><span class="s2">"testpilottest"</span><span class="p">),</span>
<span class="n">DataFrameConfig</span><span class="p">([</span>
<span class="p">(</span><span class="s2">"client_id"</span><span class="p">,</span> <span class="s2">"clientId"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"event"</span><span class="p">,</span> <span class="s2">"payload/payload/testpilotPingData/event"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"snooze_time"</span><span class="p">,</span> <span class="s2">"payload/payload/testpilotPingData/snooze_time"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"snooze_time_type"</span><span class="p">,</span> <span class="s2">"payload/payload/testpilotPingData/snooze_time_type"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"creation_date"</span><span class="p">,</span> <span class="s2">"creationDate"</span><span class="p">,</span> <span class="n">dateutil</span><span class="o">.</span><span class="n">parser</span><span class="o">.</span><span class="n">parse</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"test"</span><span class="p">,</span> <span class="s2">"payload/test"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"variants"</span><span class="p">,</span> <span class="s2">"payload/variants"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"timestamp"</span><span class="p">,</span> <span class="s2">"payload/timestamp"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"version"</span><span class="p">,</span> <span class="s2">"payload/version"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">())</span>
<span class="p">]))</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="s2">"event IS NOT NULL"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="s2">"test = 'snoozetabs@mozilla.com'"</span><span class="p">)</span>
<span class="n">new_st</span> <span class="o">=</span> <span class="n">pings_to_df</span><span class="p">(</span>
<span class="n">sqlContext</span><span class="p">,</span>
<span class="n">get_doctype_pings</span><span class="p">(</span><span class="s2">"testpilottest"</span><span class="p">),</span>
<span class="n">DataFrameConfig</span><span class="p">([</span>
<span class="p">(</span><span class="s2">"client_id"</span><span class="p">,</span> <span class="s2">"clientId"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"event"</span><span class="p">,</span> <span class="s2">"payload/payload/event"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"snooze_time"</span><span class="p">,</span> <span class="s2">"payload/payload/snooze_time"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"snooze_time_type"</span><span class="p">,</span> <span class="s2">"payload/payload/snooze_time_type"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"creation_date"</span><span class="p">,</span> <span class="s2">"creationDate"</span><span class="p">,</span> <span class="n">dateutil</span><span class="o">.</span><span class="n">parser</span><span class="o">.</span><span class="n">parse</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"test"</span><span class="p">,</span> <span class="s2">"payload/test"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"variants"</span><span class="p">,</span> <span class="s2">"payload/variants"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"timestamp"</span><span class="p">,</span> <span class="s2">"payload/timestamp"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">"version"</span><span class="p">,</span> <span class="s2">"payload/version"</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">())</span>
<span class="p">]))</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="s2">"event IS NOT NULL"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="s2">"test = 'snoozetabs@mozilla.com'"</span><span class="p">)</span>
<span class="k">return</span> <span class="n">old_st</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="n">new_st</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>Unable to parse whitelist (/mnt/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable.
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">tpt</span> <span class="o">=</span> <span class="n">__main__</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sqlContext</span><span class="p">,</span> <span class="n">submission_date</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">tpt</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s1">'s3://telemetry-parquet/testpilot/txp_snoozetabs/v2/submission_date={}'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">submission_date</span><span class="p">))</span>
</pre></div>
</div>
</div>
</div>
</div>
</div></body></html>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
// Search box wiring: typeahead suggestions, jump-to-post on selection, and
// Enter-to-search.

// Put the caret at the end of whatever text the box already holds (offsets
// past the end are clamped). Guarded so that a page without a search box does
// not abort the rest of this script with a TypeError.
if ($("#searchbar").length) {
  $("#searchbar")[0].setSelectionRange(1000, 1000);
}

$("#searchbar").typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: "knowledge_posts",
  limit: 10,
  // Text placed in the input when a suggestion is chosen.
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
      'Unable to find any posts that match the current query' +
      '</div>'
    ),
    // The suggestion is returned as an HTML string, so title and author are
    // escaped before being spliced into the markup.
    suggestion: function (data) {
      var escapeHtml = Handlebars.Utils.escapeExpression;
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + escapeHtml(data.title) + '</strong> – ' + escapeHtml(data.author) + '</p>';
    }
  },
  // Suggestions are fetched from the server; only the asynchronous callback
  // is used. The query is URL-encoded so characters such as "&" or "#"
  // cannot truncate or alter the request.
  source: function (q, sync, async) {
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
      {
        success: function (data, status) { async(JSON.parse(data)); }
      });
  }
});

// Choosing a suggestion navigates straight to that post.
$("#searchbar").on("typeahead:select", function (obj, datum, name) {
  window.location = "/post/" + encodeURIComponent(datum.path);
});

// Pressing Enter (key code 13) runs a feed search for the typed text.
$("#searchbar").on("keypress", function (event) {
  var keycode = (event.keyCode ? event.keyCode : event.which);
  if (keycode == 13) {
    window.location = "/feed?filters=" + encodeURIComponent($("#searchbar").val());
  }
});

// Widen the suggestion menu to the search box width plus the menu's own
// outer width.
var padding = $(".tt-menu").outerWidth();
$(".tt-menu").width($("#searchbar").width() + padding + "px");
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Once the DOM is ready, initialise the tooltip helpers and wire the buttons
// that switch between views of this post.
// $(document).ready() replaces $("document").ready(): the string-selector
// form is deprecated as of jQuery 3.0.
$(document).ready(function () {
  var is_webeditor = false;
  var post_id = "20";
  var id = "None";
  var post_path = "etl/testpilot/snoozetabs.kp";
  var data_repo_github_root = "";
  tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);

  // Rendered view of the post.
  $(".btn-rendered").on("click", function () {
    document.location.href = "/post/" + encodeURI(post_path);
  });
  // Same post, requested with render=raw.
  $(".btn-raw").on("click", function () {
    document.location.href = "/post/" + encodeURI(post_path) + "?render=raw";
  });
  // Editing page for the post.
  $(".btn-webeditor").on("click", function () {
    document.location.href = "/edit/" + encodeURI(post_path);
  });
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration; it must appear before the MathJax.js loader below.
MathJax.Hub.Config({
  // TeX input found by the tex2jax preprocessor, rendered as HTML-CSS.
  extensions: ["tex2jax.js"],
  jax: ["input/TeX", "output/HTML-CSS"],
  tex2jax: {
    // Inline math: $...$ or \(...\). Display math: $$...$$ or \[...\].
    inlineMath: [ ['$','$'], ["\\(","\\)"] ],
    displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
    // Lets \$ stand for a literal dollar sign instead of opening math.
    processEscapes: true
  },
  "HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in 2017; load a pinned MathJax 2.x release from cdnjs instead. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js" type="text/javascript">
</script>
<script>
// Page initialisation that has to wait for the DOM.
// Registered with .ready() rather than .on('ready'): the "ready" pseudo-event
// was deprecated in jQuery 1.8 and removed in 3.0, where a handler bound
// through .on() is never called.
$(document).ready(function () {
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'etl/testpilot/snoozetabs.kp';
  $("#post_comment_btn").on('click', function () {
    // Declared locally; the original assignments leaked these as globals.
    var comment_author = 'knowledge_default';
    var post_author = 'sunahsuh';
    var post_title = 'TxP Snoozetabs ETL';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    location.reload();
  });

  // Every element whose id starts with "delete_comment" is a delete button;
  // the comment id is the part of the element id after the first "__".
  $("[id^=delete_comment]").each(function (i, v) {
    $(v).on("click", function () {
      var id = v.id;
      var comment_id = id.split("__")[1];
      if (comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });

  // Delegated from <body>, so subscription buttons added to the DOM after
  // this point are handled as well.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener($(this)[0]);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")];
$.each(all_headers, function (index, headers) {
  $.each(headers, function (i, header) {
    var inner_html = header.innerHTML;
    // Link target: keep only letters, hyphens and spaces from the header's
    // markup, lower-case it, and turn spaces into hyphens. Held in a local
    // variable; the original leaked it as an implicit global.
    var slug = inner_html.replace(/[^a-zA-Z\- ]/g, "").toLowerCase().split(" ").join("-");
    var inner_link = "#" + slug;
    header.innerHTML = "<a href='" + inner_link + "' class=link-reset>" + inner_html + "</a>";
  });
});
//turn all the tags into links, similar to what's done on the feed page
// `tags` is the first element matching the selector below; the rest of this
// block appends one anchor per entry of `tags_list` to it.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['testpilot', 'etl']
// Empty on this page, so every tag takes the "unsubscribed" branch below.
var subscriptions_list = []
// NOTE(review): the variables assigned inside this callback have no `var`,
// so they become globals; some of the same names are assigned again later in
// this script.
$.each(tags_list, function(i,tag){
ahref = document.createElement("a")
// URL-encoded form of the tag, used in the link target.
e_tag = encodeURIComponent(tag)
// String.replace with a string pattern rewrites only the FIRST "/".
f_tag = tag.replace("/", "__")
tag_name = "#" + tag
// Button id: "tag-subscription-<index>__<f_tag>".
tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
// data-* attributes describing the popover shown for this tag; the HTML in
// data-content holds the subscribe/unsubscribe button.
ahref.setAttribute("data-container", "body")
ahref.setAttribute("data-toggle", "popover")
ahref.setAttribute("data-placement", "bottom")
ahref.setAttribute("data-html", "true")
ahref.setAttribute("data-tag-name", f_tag)
if (subscriptions_list.indexOf(tag) >= 0) {
// Already subscribed: the popover offers an Unsubscribe button.
ahref.setAttribute("class", "label label-subscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-primary btn-unsubscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
" </button> " +
" </div>")
} else {
// Not subscribed: the popover offers a Subscribe button.
ahref.setAttribute("class", "label label-unsubscribed pop")
ahref.setAttribute("data-content", "<div class='content'>" +
" <button class='btn btn-small btn-default btn-subscribe'" +
" title='' " +
" id='" + tag_subscription_button_id_name + "'> " +
" <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
" </button> " +
" </div>")
}
ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
ahref.setAttribute("style", "font-weight:normal")
// First tag only: emit the "Tags: " label, and start the link text with a
// space so it is separated from the label.
if (i == 0){
ahref.innerHTML = " "
// NOTE(review): "text" is not a standard HTML element name; it is used here
// as a generic inline wrapper.
colon = document.createElement("text")
colon.innerHTML = "<b>Tags</b>: "
tags.appendChild(colon)
}
ahref.innerHTML = ahref.innerHTML + tag_name
tags.appendChild(ahref)
// Comma separator after every tag except the last.
if (i != tags_list.length - 1){
comma = document.createElement("text")
comma.innerText = ", "
tags.appendChild(comma)
}
})
// Removes the node immediately after the tags element.
// NOTE(review): throws if `tags` is undefined or has no next sibling.
tags.nextSibling.remove()
// Append the edit icon inside the tags element. `innerHTML +=` re-parses the
// element's existing content, so the anchors above are rebuilt from markup.
tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
// Hover behaviour for every ".pop" element. Popovers use a manual trigger,
// so showing and hiding is driven entirely by the two handlers below.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on("mouseenter", function () {
    // Show on hover, then hide again once the pointer leaves the popover.
    var trigger = this;
    $(trigger).popover("show");
    $(".popover").on("mouseleave", function () {
      $(trigger).popover("hide");
    });
  })
  .on("mouseleave", function () {
    // Wait 300 ms before hiding, and keep the popover open if the pointer
    // has moved onto it in the meantime.
    var trigger = this;
    setTimeout(function () {
      var pointerOnPopover = $(".popover:hover").length > 0;
      if (pointerOnPopover) {
        return;
      }
      $(trigger).popover("hide");
    }, 300);
  });
// Clicking the edit icon swaps the tag links for an inline form holding a
// comma-separated tag list.
// NOTE(review): this handler appears to contain two generations of the tag
// editor. The first part builds the form with raw DOM calls and saves via
// $.ajax; the second part (from "Allow user to edit tags") is nested inside
// it and uses iconsJx/tagsJx. Confirm the nesting is intended.
$('#tooltip-edit_tags').click(function(){
// Hide the edit icon while the form is shown.
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// The names assigned without `var` below become globals.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
// Save icon; its click handler is attached further down by id.
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
// Text input pre-filled with the current tags.
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Setting textContent discards every child of the tags element before the
// form is appended in its place.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save: validate each comma-separated entry, POST the raw string, reload.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// Overwrites the page-level tags_list with the untrimmed pieces.
tags_list = tags_string.split(",")
// Accepts letters, digits and the characters - _ : / (case-insensitive).
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
// NOTE(review): the message below says "alphanumeric", but the pattern also
// allows - _ : and /.
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so the reload below runs only after
// the POST has returned.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/testpilot/snoozetabs.kp',
async: false
});
location.reload()
}
})
// Still inside the outer click handler: drop the node after the tags element.
// NOTE(review): throws if there is no next sibling.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Bound every time the outer handler runs, to whatever element currently has
// id "tooltip-edit_tags" - presumably the icon inserted just above; confirm.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
// Local jQuery-wrapped elements; these shadow the globals of the same names
// created by the outer handler.
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this sets a property on the wrapper
// rather than on the element; the label text was already set via .html().
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter in the input saves through tagsJx; returning false is jQuery's
// shorthand for preventDefault + stopPropagation.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/testpilot/snoozetabs.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon performs the same save.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/testpilot/snoozetabs.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Attached to every <form> on the page, not only the tag form; a submit is
// turned into a tagsJx save and the native submission is cancelled.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/testpilot/snoozetabs.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,671 @@
<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>TxP Snoozetabs ETL - Knowledge</title>
  <!-- Scripts are loaded in the head, synchronously and in this order: inline
       scripts later in the page use them as soon as they are parsed, and
       JavaScript embedded in posts can otherwise collide with them. -->
  <script src="/static/modules/jquery/jquery.min.js"></script>
  <script src="/static/modules/tether/js/tether.min.js"></script>
  <script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
  <script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
  <script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
  <script src="/static/modules/handlebars/js/handlebars.js"></script>
  <script src="/static/js/helpers.js"></script>
  <script src="/static/modules/select2/js/select2.min.js"></script>
  <script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
  <script src="/static/modules/marked.js/marked.js"></script>
  <!-- require.js is used for plotly, but collides with several other JS packages;
       keep it as the last JS package imported. -->
  <script src="/static/modules/require.js/require.min.js"></script>
  <!--[if lt IE 9]>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
  <![endif]-->
  <link rel="stylesheet" href="/static/modules/bootstrap/css/bootstrap.min.css">
  <link rel="stylesheet" href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css">
  <link rel="stylesheet" href="/static/css/custom.css">
  <link rel="stylesheet" href="/static/modules/select2/css/select2.min.css">
  <!-- "|" between font families is percent-encoded (%7C) so the URL is valid. -->
  <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Lato:400,900%7CPlayfair+Display%7CSource+Serif+Pro:400,700">
  <link rel="shortcut icon" href="/static/images/favicon.png">
  <link rel="stylesheet" href="/static/css/codehilite-friendly.css">
  <style>
    /* Loading spinner, centred in the viewport. */
    .spinner {
      position: fixed;
      top: 50%;
      left: 50%;
      margin-left: -50px; /* half width of the spinner gif */
      margin-top: -50px; /* half height of the spinner gif */
      text-align:center;
      z-index:1234;
      overflow: auto;
      width: 100px; /* width of the spinner gif */
      height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
    }
    .table {
      font-size: 14px;
    }
    .modal-content {
      max-width: 1024px;
    }
  </style>
</head>
<body>
<!-- Site navigation bar: logo, section links, "Write a Post!" button, search box. -->
<!-- NOTE(review): aria-selected is only defined for roles such as tab/option, not
     for plain links; it is kept because styles or scripts may key off it - confirm. -->
<div class="navbar navbar-knowledge" role="navigation">
  <div class="container page-container">
    <a href="/" aria-selected="false" class="logo-image navbar-text" style="margin-top:16px; margin-left: -7px">
      <!-- alt names the link destination; <img> is a void element and takes no closing tag. -->
      <img width="125" src="/static/images/logo-white.svg" alt="Knowledge home">
    </a>
    <a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style="margin-top:18px">
      Home
    </a>
    <a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style="margin-top:18px">
      Favorites
    </a>
    <a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style="margin-top:18px">
      About
    </a>
    <a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style="margin-top:18px">
      Stats
    </a>
    <a id="webposts_tab" href="/create"
       class="btn btn-primary navbar-text pull-right"
       style="border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699">
      Write a Post!
    </a>
    <div class="pull-right">
      <div class="form-group">
        <!-- The placeholder is not an accessible name, so aria-label supplies one. -->
        <input class="form-control" id="searchbar" placeholder="Search for Knowledge" style="text-align:right" aria-label="Search for Knowledge">
      </div>
    </div>
  </div>
</div>
<div class="container page-container">
<br>
<!-- Post toolbar: view-mode buttons on the left, page-view count and like icon on the right. -->
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
<!-- View-mode buttons; the btn-rendered / btn-raw classes are presumably the hooks
     used by the page's click handlers - confirm. -->
<div class="btn-group" role="group">
<button type="button" class="btn btn-default btn-rendered">
View Post
</button>
<button type="button" class="btn btn-default btn-raw">
View Raw Markdown
</button>
</div>
</div>
<div class="col-md-2">
</div>
<div class="col-md-4 text-right">
<!-- NOTE(review): the icon and the text container below share id="pageview_stats".
     Ids must be unique, and an id lookup returns only the first match (the icon).
     Confirm which element scripts expect before renaming either one. -->
<i class="glyphicon glyphicon-eye-open" id="pageview_stats" style='color: #9CA299'></i>
<div id="pageview_stats" style="display: inline-block">
Viewed 1 times by 1 different users
</div>
<!-- Like icon; its data-* attributes describe a popover reading "Like This Post". -->
<i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop" style="font-size:16pt" id="tooltip-like" data-placement="bottom"
data-trigger="#tooltip-like"
data-container="body"
data-toggle="popover"
data-content='<div>Like This Post</div>'></i>
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<!-- Post metadata: title, author link, created/updated dates and the TL;DR summary.
     The .tags span is empty in the markup; presumably it is filled in by a script
     that selects "#renderedMarkdown .metadata .tags" - confirm. -->
<div class='metadata'>
<h1>TxP Snoozetabs ETL</h1>
<span class='authors'><a href='/feed?authors=sunahsuh'>sunahsuh</a></span>
<span class='date_created'>February 17, 2017</span>
<span class='date_updated'>(Last Updated: March 20, 2017)</span>
<span class='tldr'><p>This notebook transforms pings from the SnoozeTabs testpilot test to a parquet dataset. Docs at https://github.com/bwinton/SnoozeTabs/blob/master/docs/metrics.md</p></span>
<span class='tags'></span>
</div>
<div class="codehilite"><pre><span></span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">import</span> <span class="nn">dateutil.parser</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">import</span> <span class="nn">boto3</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings_properties</span>
<span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="k">class</span> <span class="nc">ColumnConfig</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">path</span><span class="p">,</span> <span class="n">cleaning_func</span><span class="p">,</span> <span class="n">struct_type</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span>
<span class="bp">self</span><span class="o">.</span><span class="n">path</span> <span class="o">=</span> <span class="n">path</span>
<span class="bp">self</span><span class="o">.</span><span class="n">cleaning_func</span> <span class="o">=</span> <span class="n">cleaning_func</span>
<span class="bp">self</span><span class="o">.</span><span class="n">struct_type</span> <span class="o">=</span> <span class="n">struct_type</span>
<span class="k">class</span> <span class="nc">DataFrameConfig</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col_configs</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">ColumnConfig</span><span class="p">(</span><span class="o">*</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">col_configs</span><span class="p">]</span>
<span class="k">def</span> <span class="nf">toStructType</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="n">StructType</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">StructField</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">col</span><span class="o">.</span><span class="n">struct_type</span><span class="p">,</span> <span class="bp">True</span><span class="p">),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">get_names</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">col</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">get_paths</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">col</span><span class="o">.</span><span class="n">path</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">pings_to_df</span><span class="p">(</span><span class="n">sqlContext</span><span class="p">,</span> <span class="n">pings</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Performs simple data pipelining on raw pings</span>
<span class="sd"> Arguments:</span>
<span class="sd"> data_frame_config: a list of tuples of the form:</span>
<span class="sd"> (name, path, cleaning_func, column_type)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">build_cell</span><span class="p">(</span><span class="n">ping</span><span class="p">,</span> <span class="n">column_config</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Takes a json ping and a column config and returns a cleaned cell&quot;&quot;&quot;</span>
<span class="n">raw_value</span> <span class="o">=</span> <span class="n">ping</span><span class="p">[</span><span class="n">column_config</span><span class="o">.</span><span class="n">path</span><span class="p">]</span>
<span class="n">func</span> <span class="o">=</span> <span class="n">column_config</span><span class="o">.</span><span class="n">cleaning_func</span>
<span class="k">if</span> <span class="n">func</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">func</span><span class="p">(</span><span class="n">raw_value</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">raw_value</span>
<span class="k">def</span> <span class="nf">ping_to_row</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="k">return</span> <span class="p">[</span><span class="n">build_cell</span><span class="p">(</span><span class="n">ping</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">columns</span><span class="p">]</span>
<span class="n">filtered_pings</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">get_paths</span><span class="p">())</span>
<span class="k">return</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="n">filtered_pings</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">ping_to_row</span><span class="p">),</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">data_frame_config</span><span class="o">.</span><span class="n">toStructType</span><span class="p">())</span>
<span class="k">def</span> <span class="nf">save_df</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">date_partition</span><span class="p">,</span> <span class="n">partitions</span><span class="o">=</span><span class="mi">1</span><span class="p">):</span>
<span class="k">if</span> <span class="n">date_partition</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">partition_str</span> <span class="o">=</span> <span class="s2">&quot;/submission={day}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">day</span><span class="o">=</span><span class="n">date_partition</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">partition_str</span><span class="o">=</span><span class="s2">&quot;&quot;</span>
<span class="n">path_fmt</span> <span class="o">=</span> <span class="s2">&quot;s3n://telemetry-parquet/harter/cliqz_{name}/v1{partition_str}&quot;</span>
<span class="n">path</span> <span class="o">=</span> <span class="n">path_fmt</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span> <span class="n">partition_str</span><span class="o">=</span><span class="n">partition_str</span><span class="p">)</span>
<span class="n">df</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">partitions</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="s2">&quot;overwrite&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">__main__</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sqlContext</span><span class="p">,</span> <span class="n">submission_date</span><span class="p">):</span>
<span class="k">if</span> <span class="n">submission_date</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">submission_date</span> <span class="o">=</span> <span class="p">(</span><span class="n">date</span><span class="o">.</span><span class="n">today</span><span class="p">()</span> <span class="o">-</span> <span class="n">timedelta</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y%m</span><span class="si">%d</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">get_doctype_pings</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">docType</span><span class="p">:</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">&quot;telemetry&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="n">docType</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">submissionDate</span><span class="o">=</span><span class="n">submission_date</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">appName</span><span class="o">=</span><span class="s2">&quot;Firefox&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
<span class="n">old_st</span> <span class="o">=</span> <span class="n">pings_to_df</span><span class="p">(</span>
<span class="n">sqlContext</span><span class="p">,</span>
<span class="n">get_doctype_pings</span><span class="p">(</span><span class="s2">&quot;testpilottest&quot;</span><span class="p">),</span>
<span class="n">DataFrameConfig</span><span class="p">([</span>
<span class="p">(</span><span class="s2">&quot;client_id&quot;</span><span class="p">,</span> <span class="s2">&quot;clientId&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;event&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/testpilotPingData/event&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;snooze_time&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/testpilotPingData/snooze_time&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;snooze_time_type&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/testpilotPingData/snooze_time_type&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;creation_date&quot;</span><span class="p">,</span> <span class="s2">&quot;creationDate&quot;</span><span class="p">,</span> <span class="n">dateutil</span><span class="o">.</span><span class="n">parser</span><span class="o">.</span><span class="n">parse</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;test&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/test&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;variants&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/variants&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;timestamp&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/timestamp&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;version&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/version&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">())</span>
<span class="p">]))</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="s2">&quot;event IS NOT NULL&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="s2">&quot;test = &#39;snoozetabs@mozilla.com&#39;&quot;</span><span class="p">)</span>
<span class="n">new_st</span> <span class="o">=</span> <span class="n">pings_to_df</span><span class="p">(</span>
<span class="n">sqlContext</span><span class="p">,</span>
<span class="n">get_doctype_pings</span><span class="p">(</span><span class="s2">&quot;testpilottest&quot;</span><span class="p">),</span>
<span class="n">DataFrameConfig</span><span class="p">([</span>
<span class="p">(</span><span class="s2">&quot;client_id&quot;</span><span class="p">,</span> <span class="s2">&quot;clientId&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;event&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/event&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;snooze_time&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/snooze_time&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;snooze_time_type&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/payload/snooze_time_type&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;creation_date&quot;</span><span class="p">,</span> <span class="s2">&quot;creationDate&quot;</span><span class="p">,</span> <span class="n">dateutil</span><span class="o">.</span><span class="n">parser</span><span class="o">.</span><span class="n">parse</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;test&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/test&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;variants&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/variants&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;timestamp&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/timestamp&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">LongType</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;version&quot;</span><span class="p">,</span> <span class="s2">&quot;payload/version&quot;</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="n">StringType</span><span class="p">())</span>
<span class="p">]))</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="s2">&quot;event IS NOT NULL&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="s2">&quot;test = &#39;snoozetabs@mozilla.com&#39;&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">old_st</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="n">new_st</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>Unable to parse whitelist (/mnt/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable.
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">tpt</span> <span class="o">=</span> <span class="n">__main__</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sqlContext</span><span class="p">,</span> <span class="n">submission_date</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">tpt</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s1">&#39;s3://telemetry-parquet/testpilot/txp_snoozetabs/v2/submission_date={}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">submission_date</span><span class="p">))</span>
</pre></div>
</div>
</div>
</div>
</div>
<!-- Comment section: comment-count heading, comment box and submit button. -->
<div class="container-fluid">
  <div class="row">
    <div class="col-md-12">
      <h2> 0 Comments </h2>
      <!-- "</br>" is not a valid tag; parsers recover by treating it as <br>, so it is spelled that way here. -->
      <br>
      <div>
        <div class="row">
          <div class="col-md-12">
            <!-- <textarea> has no "type" attribute. aria-label supplies an accessible name, since a placeholder is not a label. -->
            <textarea class="form-control" id="comment-text" style="height:87px;" placeholder="Leave a comment..." aria-label="Leave a comment"></textarea>
          </div>
        </div>
      </div>
      <br>
      <div>
        <!-- Explicit type="button": the click is handled by script (#post_comment_btn), it is not a form submit. -->
        <button class="btn btn-primary" id="post_comment_btn" type="button">Post Comment</button>
      </div>
    </div>
  </div>
  <br>
  <div class="row">
    <div class="col-md-12">
    </div>
    <br>
  </div>
</div>
</div>
</div>
<!-- Page footer: Knowledge Repo attribution and index freshness. -->
<div class="footer">
  Served with <span class="glyphicon glyphicon-heart"></span> by
  <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a>
  <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br>
  <i title="Last checked for updates: 4 minutes ago">Last indexed: 14 hours ago</i>
</div>
<script type='text/javascript'>
$("#searchbar")[0].setSelectionRange(1000, 1000);
$('#searchbar').typeahead({
hint: false,
highlight: true,
minLength: 1
},
{
name: 'knowledge_posts',
limit: 10,
display: function (item) {
return item.title + " - " + item.author;
},
templates: {
empty: Handlebars.compile(
'<div class="tt-not-found">' +
'Unable to find any posts that match the current query' +
'</div>'
),
suggestion: function(data) {
return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + data.title + '</strong> – ' + data.author + '</p>';
}
},
source: function(q, sync, async) {
$.ajax('/ajax/index/typeahead?search=' + q,
{
success: function(data,status){ async(JSON.parse(data)); }
})
}
});
$('#searchbar').bind('typeahead:select', function(obj, datum, name) {
window.location = '/post/'+encodeURIComponent(datum.path);
});
$('#searchbar').keypress(function(event){
var keycode = (event.keyCode ? event.keyCode : event.which);
if(keycode == '13'){
var path = document.location.pathname;
window.location = '/feed?filters=' + $('#searchbar').val()
}
});
var padding = $('.tt-menu').outerWidth()
$('.tt-menu').width($('#searchbar').width() + padding + "px")
</script>
<script src="/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
$("document").ready(function(){
var is_webeditor = false;
var post_id = "20";
var id = "None";
var post_path = "etl/testpilot/snoozetabs.kp"
var data_repo_github_root = ""
tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);
$(".btn-rendered").on("click", function(){
document.location.href = "/post/" + encodeURI(post_path);
})
$(".btn-raw").on("click", function(){
document.location.href = "/post/" + encodeURI(post_path) + "?render=raw";
})
$(".btn-webeditor").on("click", function(){
document.location.href = "/edit/" + encodeURI(post_path);
})
});
</script>
<script src="/static/js/helpers.js"></script>
<script src="/static/js/tags.js" type="text/javascript"></script>
<script src="/static/js/icons.js" type="text/javascript"></script>
<script src="/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax (v2-style) configuration passed to MathJax.Hub.Config.
MathJax.Hub.Config({
// Load the tex2jax preprocessor, which looks for TeX delimiters in the page text.
extensions: ["tex2jax.js"],
// TeX input, HTML-CSS output.
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math is delimited by $...$ or \(...\).
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Display math is delimited by $$...$$ or \[...\].
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// Per the MathJax tex2jax documentation, this lets \$ stand for a literal dollar sign.
processEscapes: true
},
// Restrict the HTML-CSS renderer's font list to "TeX".
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in 2017; load a MathJax 2.x build (matching the v2 config used on this page) from cdnjs instead. -->
<script type="text/javascript"
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js">
</script>
<script>
// Post-page setup, run once the DOM is ready.
// Registered with .ready() rather than .on('ready', ...): binding the 'ready'
// event is deprecated since jQuery 1.8, removed in 3.0, and never fires when
// it is attached after the DOM is already ready.
$(document).ready(function(){
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'etl/testpilot/snoozetabs.kp';

  $("#post_comment_btn").on('click', function(){
    // Declared with var so they no longer leak as implicit globals.
    var comment_author = 'knowledge_default';
    var post_author = 'sunahsuh';
    var post_title = 'TxP Snoozetabs ETL';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): the page reloads immediately after postComment returns;
    // confirm postComment completes its request before returning.
    location.reload();
  });

  // Delete buttons carry ids starting with "delete_comment"; the comment id is
  // the part of the element id after the "__" separator.
  var all_comment_delete_buttons = $("[id^=delete_comment]")
  $.each(all_comment_delete_buttons, function(i,v){
    $(v).on("click", function(){
      var id = v.id;
      var comment_id = id.split("__")[1];
      if(comment_id) {
        commentsJx.deleteComment(post_path, comment_id)
        location.reload();
      }
    });
  });

  // Delegated on <body>, so it also covers subscription buttons whose ids
  // start with "tag-subscription" that are added to the page later.
  $(document.body).on('click',"button[id^=tag-subscription]",function () {
    tagsJx.addTagSubscriptionListener($(this)[0]);
  });
})
//Turn all the headers to be links
//h1 = Title, don't want that
// Each h2-h6 has its content wrapped in an anchor pointing at the heading's
// own fragment. The fragment is the heading's existing id when it has one;
// otherwise it is a slug built from the heading's visible text (letters,
// hyphens and spaces kept, lower-cased, spaces turned into hyphens). The text
// is used rather than innerHTML so that tag and entity names inside the
// heading do not leak into the slug.
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function(index, value){
  $.each(value, function(i, v){
    var inner_html = v.innerHTML
    var slug = v.id
    if (!slug) {
      var text_no_special = $(v).text().replace(/[^a-zA-Z\- ]/g, "")
      slug = text_no_special.toLowerCase().split(" ").join("-")
    }
    var inner_link = "#" + slug
    v.innerHTML = "<a href='" + inner_link + "' class='link-reset'>" + inner_html + "</a>"
  })
})
//turn all the tags into links, similar to what's done on the feed page
// `tags` is the container element for the tag list (undefined when the page's
// metadata block has no .tags element), `tags_list` the post's tags and
// `subscriptions_list` the tags the viewer is subscribed to. All three stay at
// script scope because later code in this script reads them.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['testpilot', 'etl']
var subscriptions_list = []
// Guarded so that a page without a tag container skips this step instead of
// throwing on tags.appendChild and aborting the rest of the script.
if (tags) {
  $.each(tags_list, function(i,tag){
    // Per-tag working values are declared locally so they do not leak as globals.
    var ahref = document.createElement("a")
    var e_tag = encodeURIComponent(tag)
    // NOTE(review): with a string pattern, replace() swaps only the first "/";
    // confirm whether tags containing several slashes are expected.
    var f_tag = tag.replace("/", "__")
    var tag_name = "#" + tag
    var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
    // Bootstrap popover data attributes; the popover body holds the
    // subscribe / unsubscribe button for this tag.
    ahref.setAttribute("data-container", "body")
    ahref.setAttribute("data-toggle", "popover")
    ahref.setAttribute("data-placement", "bottom")
    ahref.setAttribute("data-html", "true")
    ahref.setAttribute("data-tag-name", f_tag)
    if (subscriptions_list.indexOf(tag) >= 0) {
      ahref.setAttribute("class", "label label-subscribed pop")
      ahref.setAttribute("data-content", "<div class='content'>" +
        " <button class='btn btn-small btn-primary btn-unsubscribe'" +
        " title='' " +
        " id='" + tag_subscription_button_id_name + "'> " +
        " <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
        " </button> " +
        " </div>")
    } else {
      ahref.setAttribute("class", "label label-unsubscribed pop")
      ahref.setAttribute("data-content", "<div class='content'>" +
        " <button class='btn btn-small btn-default btn-subscribe'" +
        " title='' " +
        " id='" + tag_subscription_button_id_name + "'> " +
        " <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
        " </button> " +
        " </div>")
    }
    ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
    ahref.setAttribute("style", "font-weight:normal")
    // The first tag is preceded by the "Tags: " caption.
    if (i == 0){
      ahref.innerHTML = " "
      var colon = document.createElement("text")
      colon.innerHTML = "<b>Tags</b>: "
      tags.appendChild(colon)
    }
    ahref.innerHTML = ahref.innerHTML + tag_name
    tags.appendChild(ahref)
    // Comma separator after every tag except the last.
    if (i != tags_list.length - 1){
      var comma = document.createElement("text")
      comma.innerText = ", "
      tags.appendChild(comma)
    }
  })
  // Remove the node that follows the tag container, when there is one
  // (nextSibling is null if the container is the last child).
  if (tags.nextSibling) {
    tags.nextSibling.remove()
  }
  // Append the pencil icon that opens the tag editor.
  tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
}
// Hover-driven popovers for the tag labels (.pop). The popover is triggered
// manually: shown when the pointer enters a label, hidden when the pointer
// leaves the popover itself, or 300 ms after leaving the label if the pointer
// is not over a popover by then.
$(".pop")
  .popover({
    trigger: "manual",
    html: true,
    animation: false,
    delay: 100
  })
  .on({
    mouseenter: function () {
      var label = this;
      $(label).popover("show");
      $(".popover").on("mouseleave", function () {
        $(label).popover('hide');
      });
    },
    mouseleave: function () {
      var label = this;
      setTimeout(function () {
        if ($(".popover:hover").length === 0) {
          $(label).popover("hide");
        }
      }, 300);
    }
  });
// Click handler for the edit-tags icon (#tooltip-edit_tags): hides the icon and
// replaces the tag list with an inline form made of a "Tags: " caption, a text
// input pre-filled with the current tags, and a save icon.
// NOTE(review): this handler contains two editor implementations back to back
// (a DOM-API one, then a jQuery / iconsJx / tagsJx one registered from inside
// it) - looks like two generations of the same feature merged; confirm which
// one is meant to be live.
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// The names below are assigned without var, so they become globals.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Clears everything currently inside the tag container before the form is inserted.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate the comma-separated tags, POST them, then reload the page.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
tags_list = tags_string.split(",")
// A tag may contain only letters, digits, "-", "_", ":" and "/" (case-insensitive).
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// An empty piece between commas is rejected before trimming.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
// Surrounding whitespace is ignored for the character check only;
// the untrimmed tags_string is what gets posted below.
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so the reload below runs after it returns.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=etl/testpilot/snoozetabs.kp',
async: false
});
location.reload()
}
})
// Removes the node that follows the tag container.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Registered every time the outer handler runs; it takes effect on a later
// click of whatever element matches #tooltip-edit_tags at this point.
// NOTE(review): presumably that is the icon from iconsJx.createEditTagsIcon() - confirm.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this sets a plain property on the
// wrapper rather than the element's markup; the caption was already set by
// .html("Tags: ") above.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter (key code 13) in the input saves the tags; returning false cancels
// the key's default action.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/testpilot/snoozetabs.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves the tags.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/testpilot/snoozetabs.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Bound to every <form> on the page (selector "form"): submitting saves the
// tags and returns false to cancel the native submit.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "etl/testpilot/snoozetabs.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>
</body>
</html>

Просмотреть файл

@ -0,0 +1,13 @@
{
"title": "TxP Snoozetabs ETL",
"authors": [
"sunahsuh"
],
"tags": [
"testpilot",
"etl"
],
"publish_date": "2017-02-17",
"updated_at": "2017-03-20",
"tldr": "This notebook transforms pings from the SnoozeTabs testpilot test to a parquet dataset. Docs at https://github.com/bwinton/SnoozeTabs/blob/master/docs/metrics.md"
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,15 @@
{
"title": "This is a Knowledge Template Header",
"authors": [
"sally_smarts",
"wesley_wisdom"
],
"tags": [
"startup",
"firefox",
"example"
],
"publish_date": "2016-06-29",
"updated_at": "2016-06-30",
"tldr": "This is short description of the content and findings of the post."
}

Просмотреть файл

@ -0,0 +1,485 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
/* Spinner container: fixed at the centre of the viewport. The negative
   margins pull it back by half the gif's size so the image itself is centred. */
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
/* Font size for elements with the .table class. */
.table {
font-size: 14px;
}
/* Cap the width of modal content. */
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>What, if anything, Useful do we get from Addons Histograms?</h1>
<span class="authors"><a href="/feed?authors=chutten">chutten</a></span>
<span class="date_created">April 04, 2017</span>
<span class="date_updated">(Last Updated: April 04, 2017)</span>
<span class="tldr"><p>We don't get a lot of call for addonHistograms anymore. Maybe we should ditch 'em.</p></span>
</div>
<h3 id="motivation">Motivation</h3>
<p>Can we get rid of addonHistograms?</p>
<h3 id="what-if-anything-useful-do-we-get-from-addons-histograms">What, if anything, Useful do we get from Addons Histograms?</h3>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">matplotlib</span>
<span class="kn">from</span> <span class="nn">matplotlib</span> <span class="kn">import</span> <span class="n">pyplot</span> <span class="k">as</span> <span class="n">plt</span>
<span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings_properties</span><span class="p">,</span> <span class="n">get_one_ping_per_client</span>
</pre></div>
<div class="codehilite"><pre><span></span>Unable to parse whitelist (/mnt/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable.
</pre></div>
<p>Let's just look at a non-representative 10% of main pings gathered on a recent Tuesday.</p>
<div class="codehilite"><pre><span></span><span class="n">pings</span> <span class="o">=</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">"telemetry"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="s1">'main'</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">submissionDate</span><span class="o">=</span><span class="s2">"20170328"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sample</span><span class="o">=</span><span class="mf">0.1</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">subset</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="p">[</span><span class="s2">"payload/addonHistograms"</span><span class="p">])</span>
</pre></div>
<h4 id="how-many-pings-even-have-addonhistograms">How many pings even have addonHistograms?</h4>
<div class="codehilite"><pre><span></span><span class="n">full_count</span> <span class="o">=</span> <span class="n">subset</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="n">full_count</span>
</pre></div>
<div class="codehilite"><pre><span></span>37815981
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">filtered</span> <span class="o">=</span> <span class="n">subset</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="n">p</span><span class="p">[</span><span class="s2">"payload/addonHistograms"</span><span class="p">]</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">)</span>
<span class="n">filtered_count</span> <span class="o">=</span> <span class="n">filtered</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="n">filtered_count</span>
</pre></div>
<div class="codehilite"><pre><span></span>25794
</pre></div>
<div class="codehilite"><pre><span></span><span class="mf">1.0</span> <span class="o">*</span> <span class="n">filtered_count</span> <span class="o">/</span> <span class="n">full_count</span>
</pre></div>
<div class="codehilite"><pre><span></span>0.0006820925787962502
</pre></div>
<h4 id="so-not-many-which-addons-are-they-from">So, not many. Which addons are they from?</h4>
<div class="codehilite"><pre><span></span><span class="n">addons</span> <span class="o">=</span> <span class="n">filtered</span><span class="o">.</span><span class="n">flatMap</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="n">p</span><span class="p">[</span><span class="s1">'payload/addonHistograms'</span><span class="p">]</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">key</span><span class="p">:</span> <span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">addons</span><span class="o">.</span><span class="n">countByKey</span><span class="p">()</span>
</pre></div>
<div class="codehilite"><pre><span></span>defaultdict(int,
{u'Firebug': 92,
u'shumway@research.mozilla.org': 15,
u'uriloader@pdf.js': 4})
</pre></div>
<p>Wow, so most of those addonHistograms sections are empty.</p>
<p>…And those that aren't are from defunct data collection sources. Looks like we can remove this without too many complaints. Excellent.</p>
</div>
</div>
</div>
</div>
</div></body></html>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
// Search box behaviour: typeahead suggestions fetched from
// /ajax/index/typeahead, navigation to the selected post, and
// Enter-to-search on the feed page.

// Move the caret to the end of whatever text is already in the search box.
$("#searchbar")[0].setSelectionRange(1000, 1000);

$('#searchbar').typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: 'knowledge_posts',
  limit: 10,
  // Text placed in the input when a suggestion is chosen.
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
      'Unable to find any posts that match the current query' +
      '</div>'
    ),
    suggestion: function(data) {
      // Title and author are data returned by the server; escape them so they
      // are shown as text instead of being parsed as markup.
      var escapeHtml = function(value) {
        return $('<div>').text(String(value)).html();
      };
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + escapeHtml(data.title) + '</strong> – ' + escapeHtml(data.author) + '</p>';
    }
  },
  source: function(q, sync, async) {
    // Encode the query so characters such as "&", "#" or "+" remain part of
    // the search term instead of altering the URL.
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
      {
        // NOTE(review): assumes the response body arrives as a JSON string - confirm.
        success: function(data, status){ async(JSON.parse(data)); }
      });
  }
});

// Choosing a suggestion opens that post.
$('#searchbar').on('typeahead:select', function(obj, datum, name) {
  window.location = '/post/' + encodeURIComponent(datum.path);
});

// Pressing Enter (key code 13) runs the typed text as a feed filter.
$('#searchbar').keypress(function(event){
  var keycode = (event.keyCode ? event.keyCode : event.which);
  if (keycode === 13) {
    window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
  }
});

// Widen the suggestion menu to the search box width plus the menu's own outer width.
var padding = $('.tt-menu').outerWidth();
$('.tt-menu').width($('#searchbar').width() + padding + "px");
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Once the DOM is ready: initialise the tooltip helpers for this post and make
// the view / raw / web-editor buttons navigate to the matching URL.
$(function () {
  var webEditorFlag = false;
  var postId = "9";
  var idValue = "None";
  var postPath = "projects/addons_histograms.kp";
  var githubRoot = "";

  tooltipsJx.initializeTooltips(webEditorFlag, postId, idValue, githubRoot);

  // Button selector -> [URL prefix, URL suffix]; the encoded post path goes in between.
  var destinations = {
    ".btn-rendered": ["/post/", ""],
    ".btn-raw": ["/post/", "?render=raw"],
    ".btn-webeditor": ["/edit/", ""]
  };

  $.each(destinations, function (selector, parts) {
    $(selector).on("click", function () {
      document.location.href = parts[0] + encodeURI(postPath) + parts[1];
    });
  });
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration for TeX input rendered as HTML-CSS output.
MathJax.Hub.Config({
  extensions: ["tex2jax.js"],
  jax: ["input/TeX", "output/HTML-CSS"],
  tex2jax: {
    // Inline math is delimited by $...$ or \(...\); display math by $$...$$ or \[...\].
    inlineMath: [ ['$','$'], ["\\(","\\)"] ],
    displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
    processEscapes: true
  },
  "HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- The cdn.mathjax.org host has been retired; load a pinned MathJax 2.x build from cdnjs instead. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js" type="text/javascript">
</script>
<script>
// Page initialisation, run when the DOM is ready. Uses .ready() rather than
// the "ready" event binding, which is deprecated and removed in jQuery 3.
$(document).ready(function(){
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'projects/addons_histograms.kp';

  // Post a new comment, then reload the page.
  // NOTE(review): the reload follows immediately; this presumably relies on
  // postComment finishing its request before returning - confirm in comments.js.
  $("#post_comment_btn").on('click', function(){
    var comment_author = 'knowledge_default';
    var post_author = 'chutten';
    var post_title = 'What, if anything, Useful do we get from Addons Histograms?';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    location.reload();
  });

  // Delete buttons carry ids of the form "delete_comment...__<comment id>".
  $("[id^=delete_comment]").each(function(){
    var button = this;
    $(button).on("click", function(){
      var comment_id = button.id.split("__")[1];
      if (comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });

  // Delegated handler: the subscribe/unsubscribe buttons live inside popovers
  // that are created after this code runs.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener(this);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")];
$.each(all_headers, function(index, value){
  $.each(value, function(i, v){
    var inner_html = v.innerHTML;
    var inner_link;
    if (v.id) {
      // Link to the header's real id, so the anchor always resolves.
      inner_link = "#" + v.id;
    } else {
      // No id on the header: fall back to a slug made from its content
      // (letters, hyphens and spaces kept; spaces become hyphens).
      var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "");
      inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-");
    }
    // Build the anchor through the DOM so the target cannot break out of the href attribute.
    var anchor = document.createElement("a");
    anchor.setAttribute("href", inner_link);
    anchor.setAttribute("class", "link-reset");
    anchor.innerHTML = inner_html;
    v.innerHTML = "";
    v.appendChild(anchor);
  });
});
//turn all the tags into links, similar to what's done on the feed page
// `tags` is the metadata element that receives the links; `tags_list` holds this
// post's tags and `subscriptions_list` the tags rendered in the subscribed state.
var tags = $("#renderedMarkdown .metadata .tags")[0];
var tags_list = ['addons', 'firefox', 'telemetry'];
var subscriptions_list = [];
// Skip rendering when the page has no tags container, rather than throwing
// and aborting the rest of this script.
if (tags) {
  $.each(tags_list, function(i, tag){
    var ahref = document.createElement("a");
    var e_tag = encodeURIComponent(tag);
    // Replace every "/" (a string pattern passed to replace() only swaps the first one).
    var f_tag = tag.split("/").join("__");
    var tag_name = "#" + tag;
    var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag;
    ahref.setAttribute("data-container", "body");
    ahref.setAttribute("data-toggle", "popover");
    ahref.setAttribute("data-placement", "bottom");
    ahref.setAttribute("data-html", "true");
    ahref.setAttribute("data-tag-name", f_tag);
    // The popover content is a single subscribe or unsubscribe button.
    if (subscriptions_list.indexOf(tag) >= 0) {
      ahref.setAttribute("class", "label label-subscribed pop");
      ahref.setAttribute("data-content", "<div class='content'>" +
        " <button class='btn btn-small btn-primary btn-unsubscribe'" +
        " title='' " +
        " id='" + tag_subscription_button_id_name + "'> " +
        " <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
        " </button> " +
        " </div>");
    } else {
      ahref.setAttribute("class", "label label-unsubscribed pop");
      ahref.setAttribute("data-content", "<div class='content'>" +
        " <button class='btn btn-small btn-default btn-subscribe'" +
        " title='' " +
        " id='" + tag_subscription_button_id_name + "'> " +
        " <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
        " </button> " +
        " </div>");
    }
    ahref.setAttribute("href", "/tag_pages?tag=" + e_tag);
    ahref.setAttribute("style", "font-weight:normal");
    if (i == 0){
      // The first tag is preceded by the "Tags:" label.
      ahref.innerHTML = " ";
      var colon = document.createElement("text");
      colon.innerHTML = "<b>Tags</b>: ";
      tags.appendChild(colon);
    }
    // Add the tag name as text so it is never parsed as markup.
    ahref.appendChild(document.createTextNode(tag_name));
    tags.appendChild(ahref);
    if (i != tags_list.length - 1){
      var comma = document.createElement("text");
      comma.innerText = ", ";
      tags.appendChild(comma);
    }
  });
  // Drop the node that follows the tags container, if there is one.
  if (tags.nextSibling) {
    tags.nextSibling.remove();
  }
  // Append the edit-tags icon inside the tags container.
  tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>";
}
// Manually triggered popovers on every ".pop" element: shown on hover, and
// kept open while the pointer is over the popover itself.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on({
    mouseenter: function () {
      var trigger = this;
      $(trigger).popover("show");
      // Hide again once the pointer leaves the popover.
      $(".popover").on("mouseleave", function () {
        $(trigger).popover('hide');
      });
    },
    mouseleave: function () {
      var trigger = this;
      // Wait 300 ms, then hide unless the pointer is over a popover.
      setTimeout(function () {
        if ($(".popover:hover").length === 0) {
          $(trigger).popover("hide");
        }
      }, 300);
    }
  });
// Click handler for the edit-tags icon: hides the icon and replaces the content
// of the `tags` container with an inline form (label, text input pre-filled with
// the current tags, and a save icon).
// NOTE(review): names assigned without `var` below (previousSibling, tags_string,
// form, input, tags_text, icon_class, tag, tag_name) become implicit globals.
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// Stored but not read anywhere in this handler.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Clears everything currently inside the tags container before the form is added.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate the comma-separated tags, POST them, then reload the page.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// Overwrites the page-level tags_list with the untrimmed pieces of the input.
tags_list = tags_string.split(",")
// Accepted characters per tag: letters, digits, "-", "_", ":" and "/".
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
// Surrounding whitespace is ignored for validation only; the string sent to
// the server is the raw input value.
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so the reload below runs after it returns.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=projects/addons_histograms.kp',
async: false
});
location.reload()
}
})
// NOTE(review): this throws a TypeError when `tags` has no next sibling, in
// which case none of the statements below run - confirm the expected DOM here.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Registers a further click handler each time the outer handler reaches this point.
// NOTE(review): presumably edit_icon carries id "tooltip-edit_tags" - confirm in icons.js.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this sets a property on the wrapper,
// not on the element; the label text was already set by .html() above.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter in the input saves the tags; returning false suppresses the default action.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/addons_histograms.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/addons_histograms.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Bound to every <form> on the page, not only the one created above.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/addons_histograms.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,623 @@
<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <title> Knowledge </title>

  <!-- Keep the JS includes at the top: JS embedded in posts was colliding with them. -->
  <script src="/static/modules/jquery/jquery.min.js"></script>
  <script src="/static/modules/tether/js/tether.min.js"></script>
  <script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
  <script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
  <script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
  <script src="/static/modules/handlebars/js/handlebars.js"></script>
  <script src="/static/js/helpers.js"></script>
  <script src="/static/modules/select2/js/select2.min.js"></script>
  <script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
  <script src="/static/modules/marked.js/marked.js"></script>
  <!-- require.js is needed for plotly but collides with several other JS packages,
       so it must stay the last JS package imported. -->
  <script src="/static/modules/require.js/require.min.js"></script>

  <!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->

  <!-- Stylesheets, in cascade order. -->
  <link href="/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet">
  <link href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet">
  <link href="/static/css/custom.css" rel="stylesheet">
  <link href="/static/modules/select2/css/select2.min.css" rel="stylesheet">
  <link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet">
  <link href="/static/images/favicon.png" rel="shortcut icon">
  <link href="/static/css/codehilite-friendly.css" rel="stylesheet">

  <style>
    /* Fixed box centred in the viewport for the spinner gif. */
    .spinner {
      position: fixed;
      top: 50%;
      left: 50%;
      margin-left: -50px; /* half the width of the spinner gif */
      margin-top: -50px; /* half the height of the spinner gif */
      text-align: center;
      z-index: 1234;
      overflow: auto;
      width: 100px; /* width of the spinner gif */
      height: 102px; /* height of the spinner gif, +2px to fix an IE8 issue */
    }

    .table {
      font-size: 14px;
    }

    .modal-content {
      max-width: 1024px;
    }
  </style>
</head>
<body>
<!-- Top navigation bar: logo, section links, "Write a Post!" button and the search box. -->
<div class="navbar navbar-knowledge" role="navigation">
  <div class="container page-container">
    <a href="/" aria-selected="false" class="logo-image navbar-text" style="margin-top:16px; margin-left: -7px">
      <!-- img is a void element (no end tag); the alt text names the link target. -->
      <img width="125" src="/static/images/logo-white.svg" alt="Knowledge Repo home">
    </a>
    <a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style="margin-top:18px">
      Home
    </a>
    <a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style="margin-top:18px">
      Favorites
    </a>
    <a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style="margin-top:18px">
      About
    </a>
    <a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style="margin-top:18px">
      Stats
    </a>
    <a id="webposts_tab" href="/create"
       class="btn btn-primary navbar-text pull-right"
       style="border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699">
      Write a Post!
    </a>
    <div class="pull-right">
      <div class="form-group">
        <!-- A placeholder is not an accessible name, so the input also carries aria-label. -->
        <input class="form-control" id="searchbar" placeholder="Search for Knowledge" style="text-align:right" aria-label="Search for Knowledge">
      </div>
    </div>
  </div>
</div>
<div class="container page-container">
<br>
<!-- Post toolbar: view-mode buttons on the left; page-view text and the like icon on the right. -->
<div class="container-fluid">
  <div class="row">
    <div class="col-md-6">
      <div class="btn-group" role="group">
        <button class="btn btn-default btn-rendered" type="button">
          View Post
        </button>
        <button class="btn btn-default btn-raw" type="button">
          View Raw Markdown
        </button>
      </div>
    </div>
    <div class="col-md-2">
    </div>
    <div class="col-md-4 text-right">
      <!-- NOTE(review): the icon and the text container below share id="pageview_stats".
           Ids should be unique; confirm which element scripts and styles rely on before renaming. -->
      <i class="glyphicon glyphicon-eye-open" id="pageview_stats" style="color: #9CA299"></i>
      <div id="pageview_stats" style="display: inline-block">
        Viewed 1 times by 1 different users
      </div>
      <i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop"
         id="tooltip-like"
         style="font-size:16pt"
         data-placement="bottom"
         data-trigger="#tooltip-like"
         data-container="body"
         data-toggle="popover"
         data-content="<div>Like This Post</div>"></i>
    </div>
  </div>
  <div class="row col-md-12">
  </div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class='metadata'>
<h1>What, if anything, Useful do we get from Addons Histograms?</h1>
<span class='authors'><a href='/feed?authors=chutten'>chutten</a></span>
<span class='date_created'>April 04, 2017</span>
<span class='date_updated'>(Last Updated: April 04, 2017)</span>
<span class='tldr'><p>We don&rsquo;t get a lot of call for addonHistograms anymore. Maybe we should ditch &lsquo;em.</p></span>
<span class='tags'></span>
</div>
<h3 id="motivation">Motivation</h3>
<p>Can we get rid of addonHistograms?</p>
<h3 id="what-if-anything-useful-do-we-get-from-addons-histograms">What, if anything, Useful do we get from Addons Histograms?</h3>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">matplotlib</span>
<span class="kn">from</span> <span class="nn">matplotlib</span> <span class="kn">import</span> <span class="n">pyplot</span> <span class="k">as</span> <span class="n">plt</span>
<span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings_properties</span><span class="p">,</span> <span class="n">get_one_ping_per_client</span>
</pre></div>
<div class="codehilite"><pre><span></span>Unable to parse whitelist (/mnt/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable.
</pre></div>
<p>Let&rsquo;s just look at a non-representative 10% of main pings gathered on a recent Tuesday.</p>
<div class="codehilite"><pre><span></span><span class="n">pings</span> <span class="o">=</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">&quot;telemetry&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="s1">&#39;main&#39;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">submissionDate</span><span class="o">=</span><span class="s2">&quot;20170328&quot;</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sample</span><span class="o">=</span><span class="mf">0.1</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">subset</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;payload/addonHistograms&quot;</span><span class="p">])</span>
</pre></div>
<h4 id="how-many-pings-even-have-addonhistograms">How many pings even have addonHistograms?</h4>
<div class="codehilite"><pre><span></span><span class="n">full_count</span> <span class="o">=</span> <span class="n">subset</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="n">full_count</span>
</pre></div>
<div class="codehilite"><pre><span></span>37815981
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">filtered</span> <span class="o">=</span> <span class="n">subset</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="n">p</span><span class="p">[</span><span class="s2">&quot;payload/addonHistograms&quot;</span><span class="p">]</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">)</span>
<span class="n">filtered_count</span> <span class="o">=</span> <span class="n">filtered</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="n">filtered_count</span>
</pre></div>
<div class="codehilite"><pre><span></span>25794
</pre></div>
<div class="codehilite"><pre><span></span><span class="mf">1.0</span> <span class="o">*</span> <span class="n">filtered_count</span> <span class="o">/</span> <span class="n">full_count</span>
</pre></div>
<div class="codehilite"><pre><span></span>0.0006820925787962502
</pre></div>
<h4 id="so-not-many-which-addons-are-they-from">So, not many. Which addons are they from?</h4>
<div class="codehilite"><pre><span></span><span class="n">addons</span> <span class="o">=</span> <span class="n">filtered</span><span class="o">.</span><span class="n">flatMap</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="n">p</span><span class="p">[</span><span class="s1">&#39;payload/addonHistograms&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">key</span><span class="p">:</span> <span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">addons</span><span class="o">.</span><span class="n">countByKey</span><span class="p">()</span>
</pre></div>
<div class="codehilite"><pre><span></span>defaultdict(int,
{u&#39;Firebug&#39;: 92,
u&#39;shumway@research.mozilla.org&#39;: 15,
u&#39;uriloader@pdf.js&#39;: 4})
</pre></div>
<p>Wow, so most of those addonHistograms sections are empty.</p>
<p>&hellip;And those that aren&rsquo;t are from defunct data collection sources. Looks like we can remove this without too many complaints. Excellent.</p>
</div>
</div>
</div>
</div>
<!-- Comments: count heading, new-comment box and its submit button.
     Line breaks use the void <br> element (the </br> end tag is invalid markup). -->
<div class="container-fluid">
  <div class="row">
    <div class="col-md-12">
      <h2> 0 Comments </h2>
      <br>
      <div>
        <div class="row">
          <div class="col-md-12">
            <!-- textarea has no type attribute; aria-label gives it an accessible name. -->
            <textarea class="form-control" id="comment-text" style="height:87px;" placeholder="Leave a comment..." aria-label="Leave a comment"></textarea>
          </div>
        </div>
      </div>
      <br>
      <div>
        <button class="btn btn-primary" id="post_comment_btn" type="button">Post Comment</button>
      </div>
    </div>
  </div>
  <br>
  <div class="row">
    <div class="col-md-12">
    </div>
    <br>
  </div>
</div>
</div>
</div>
<!-- Page footer: Knowledge Repo credit with version link, and index freshness. -->
<div class="footer">
  Served with <span class="glyphicon glyphicon-heart"></span> by
  <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a>
  <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br>
  <i title="Last checked for updates: 5 minutes ago">Last indexed: 14 hours ago</i>
</div>
<script type="text/javascript">
// Search box behaviour: typeahead suggestions fetched from
// /ajax/index/typeahead, navigation to the selected post, and
// Enter-to-search on the feed page.

// Move the caret to the end of whatever text is already in the search box.
$("#searchbar")[0].setSelectionRange(1000, 1000);

$('#searchbar').typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: 'knowledge_posts',
  limit: 10,
  // Text placed in the input when a suggestion is chosen.
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
      'Unable to find any posts that match the current query' +
      '</div>'
    ),
    suggestion: function(data) {
      // Title and author are data returned by the server; escape them so they
      // are shown as text instead of being parsed as markup.
      var escapeHtml = function(value) {
        return $('<div>').text(String(value)).html();
      };
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + escapeHtml(data.title) + '</strong> – ' + escapeHtml(data.author) + '</p>';
    }
  },
  source: function(q, sync, async) {
    // Encode the query so characters such as "&", "#" or "+" remain part of
    // the search term instead of altering the URL.
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
      {
        // NOTE(review): assumes the response body arrives as a JSON string - confirm.
        success: function(data, status){ async(JSON.parse(data)); }
      });
  }
});

// Choosing a suggestion opens that post.
$('#searchbar').on('typeahead:select', function(obj, datum, name) {
  window.location = '/post/' + encodeURIComponent(datum.path);
});

// Pressing Enter (key code 13) runs the typed text as a feed filter.
$('#searchbar').keypress(function(event){
  var keycode = (event.keyCode ? event.keyCode : event.which);
  if (keycode === 13) {
    window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
  }
});

// Widen the suggestion menu to the search box width plus the menu's own outer width.
var padding = $('.tt-menu').outerWidth();
$('.tt-menu').width($('#searchbar').width() + padding + "px");
</script>
<script src="/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Once the DOM is ready: initialise the tooltip helpers for this post and make
// the view / raw / web-editor buttons navigate to the matching URL.
$(function () {
  var webEditorFlag = false;
  var postId = "9";
  var idValue = "None";
  var postPath = "projects/addons_histograms.kp";
  var githubRoot = "";

  tooltipsJx.initializeTooltips(webEditorFlag, postId, idValue, githubRoot);

  // Button selector -> [URL prefix, URL suffix]; the encoded post path goes in between.
  var destinations = {
    ".btn-rendered": ["/post/", ""],
    ".btn-raw": ["/post/", "?render=raw"],
    ".btn-webeditor": ["/edit/", ""]
  };

  $.each(destinations, function (selector, parts) {
    $(selector).on("click", function () {
      document.location.href = parts[0] + encodeURI(postPath) + parts[1];
    });
  });
});
</script>
<script src="/static/js/helpers.js"></script>
<script src="/static/js/tags.js" type="text/javascript"></script>
<script src="/static/js/icons.js" type="text/javascript"></script>
<script src="/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration for TeX input rendered as HTML-CSS output.
MathJax.Hub.Config({
  extensions: ["tex2jax.js"],
  jax: ["input/TeX", "output/HTML-CSS"],
  tex2jax: {
    // Inline math is delimited by $...$ or \(...\); display math by $$...$$ or \[...\].
    inlineMath: [ ['$','$'], ["\\(","\\)"] ],
    displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
    processEscapes: true
  },
  "HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- The cdn.mathjax.org host has been retired; load a pinned MathJax 2.x build from cdnjs instead. -->
<script type="text/javascript"
        src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js">
</script>
<script>
// Page initialisation, run when the DOM is ready. Uses .ready() rather than
// the "ready" event binding, which is deprecated and removed in jQuery 3.
$(document).ready(function(){
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'projects/addons_histograms.kp';

  // Post a new comment, then reload the page.
  // NOTE(review): the reload follows immediately; this presumably relies on
  // postComment finishing its request before returning - confirm in comments.js.
  $("#post_comment_btn").on('click', function(){
    var comment_author = 'knowledge_default';
    var post_author = 'chutten';
    var post_title = 'What, if anything, Useful do we get from Addons Histograms?';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    location.reload();
  });

  // Delete buttons carry ids of the form "delete_comment...__<comment id>".
  $("[id^=delete_comment]").each(function(){
    var button = this;
    $(button).on("click", function(){
      var comment_id = button.id.split("__")[1];
      if (comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });

  // Delegated handler: the subscribe/unsubscribe buttons live inside popovers
  // that are created after this code runs.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener(this);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")];
$.each(all_headers, function(index, value){
  $.each(value, function(i, v){
    var inner_html = v.innerHTML;
    var inner_link;
    if (v.id) {
      // Link to the header's real id, so the anchor always resolves.
      inner_link = "#" + v.id;
    } else {
      // No id on the header: fall back to a slug made from its content
      // (letters, hyphens and spaces kept; spaces become hyphens).
      var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "");
      inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-");
    }
    // Build the anchor through the DOM so the target cannot break out of the href attribute.
    var anchor = document.createElement("a");
    anchor.setAttribute("href", inner_link);
    anchor.setAttribute("class", "link-reset");
    anchor.innerHTML = inner_html;
    v.innerHTML = "";
    v.appendChild(anchor);
  });
});
//turn all the tags into links, similar to what's done on the feed page
// `tags` is the metadata element that receives the links; `tags_list` holds this
// post's tags and `subscriptions_list` the tags rendered in the subscribed state.
var tags = $("#renderedMarkdown .metadata .tags")[0];
var tags_list = ['addons', 'firefox', 'telemetry'];
var subscriptions_list = [];
// Skip rendering when the page has no tags container, rather than throwing
// and aborting the rest of this script.
if (tags) {
  $.each(tags_list, function(i, tag){
    var ahref = document.createElement("a");
    var e_tag = encodeURIComponent(tag);
    // Replace every "/" (a string pattern passed to replace() only swaps the first one).
    var f_tag = tag.split("/").join("__");
    var tag_name = "#" + tag;
    var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag;
    ahref.setAttribute("data-container", "body");
    ahref.setAttribute("data-toggle", "popover");
    ahref.setAttribute("data-placement", "bottom");
    ahref.setAttribute("data-html", "true");
    ahref.setAttribute("data-tag-name", f_tag);
    // The popover content is a single subscribe or unsubscribe button.
    if (subscriptions_list.indexOf(tag) >= 0) {
      ahref.setAttribute("class", "label label-subscribed pop");
      ahref.setAttribute("data-content", "<div class='content'>" +
        " <button class='btn btn-small btn-primary btn-unsubscribe'" +
        " title='' " +
        " id='" + tag_subscription_button_id_name + "'> " +
        " <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
        " </button> " +
        " </div>");
    } else {
      ahref.setAttribute("class", "label label-unsubscribed pop");
      ahref.setAttribute("data-content", "<div class='content'>" +
        " <button class='btn btn-small btn-default btn-subscribe'" +
        " title='' " +
        " id='" + tag_subscription_button_id_name + "'> " +
        " <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
        " </button> " +
        " </div>");
    }
    ahref.setAttribute("href", "/tag_pages?tag=" + e_tag);
    ahref.setAttribute("style", "font-weight:normal");
    if (i == 0){
      // The first tag is preceded by the "Tags:" label.
      ahref.innerHTML = " ";
      var colon = document.createElement("text");
      colon.innerHTML = "<b>Tags</b>: ";
      tags.appendChild(colon);
    }
    // Add the tag name as text so it is never parsed as markup.
    ahref.appendChild(document.createTextNode(tag_name));
    tags.appendChild(ahref);
    if (i != tags_list.length - 1){
      var comma = document.createElement("text");
      comma.innerText = ", ";
      tags.appendChild(comma);
    }
  });
  // Drop the node that follows the tags container, if there is one.
  if (tags.nextSibling) {
    tags.nextSibling.remove();
  }
  // Append the edit-tags icon inside the tags container.
  tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>";
}
// Manually triggered popovers on every ".pop" element: shown on hover, and
// kept open while the pointer is over the popover itself.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on({
    mouseenter: function () {
      var trigger = this;
      $(trigger).popover("show");
      // Hide again once the pointer leaves the popover.
      $(".popover").on("mouseleave", function () {
        $(trigger).popover('hide');
      });
    },
    mouseleave: function () {
      var trigger = this;
      // Wait 300 ms, then hide unless the pointer is over a popover.
      setTimeout(function () {
        if ($(".popover:hover").length === 0) {
          $(trigger).popover("hide");
        }
      }, 300);
    }
  });
// Click handler for the edit-tags icon: hides the icon and replaces the content
// of the `tags` container with an inline form (label, text input pre-filled with
// the current tags, and a save icon).
// NOTE(review): names assigned without `var` below (previousSibling, tags_string,
// form, input, tags_text, icon_class, tag, tag_name) become implicit globals.
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// Stored but not read anywhere in this handler.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Clears everything currently inside the tags container before the form is added.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate the comma-separated tags, POST them, then reload the page.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// Overwrites the page-level tags_list with the untrimmed pieces of the input.
tags_list = tags_string.split(",")
// Accepted characters per tag: letters, digits, "-", "_", ":" and "/".
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
// Surrounding whitespace is ignored for validation only; the string sent to
// the server is the raw input value.
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so the reload below runs after it returns.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=projects/addons_histograms.kp',
async: false
});
location.reload()
}
})
// NOTE(review): this throws a TypeError when `tags` has no next sibling, in
// which case none of the statements below run - confirm the expected DOM here.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Registers a further click handler each time the outer handler reaches this point.
// NOTE(review): presumably edit_icon carries id "tooltip-edit_tags" - confirm in icons.js.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this sets a property on the wrapper,
// not on the element; the label text was already set by .html() above.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter in the input saves the tags; returning false suppresses the default action.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/addons_histograms.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/addons_histograms.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Bound to every <form> on the page, not only the one created above.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/addons_histograms.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>
</body>
</html>

Просмотреть файл

@ -0,0 +1,14 @@
{
"title": "What, if anything, Useful do we get from Addons Histograms?",
"authors": [
"chutten"
],
"tags": [
"addons",
"firefox",
"telemetry"
],
"publish_date": "2017-04-04",
"updated_at": "2017-04-04",
"tldr": "We don't get a lot of call for addonHistograms anymore. Maybe we should ditch 'em."
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,13 @@
{
"title": "Firefox Application Update Out Of Date dashboard",
"authors": [
"rstrong"
],
"tags": [
"firefox",
"app_update"
],
"publish_date": "2017-02-16",
"updated_at": "2017-02-16",
"tldr": "Creates the JSON data files used by the Firefox Application Update Out Of Date dashboard."
}

Просмотреть файл

@ -0,0 +1,545 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>Prefer repartition to coalesce in Spark</h1>
<span class="authors"><a href="/feed?authors=Ryan+Harter+%28%3Aharter%29">Ryan Harter (:harter)</a></span>
<span class="date_created">March 02, 2017</span>
<span class="date_updated">(Last Updated: March 02, 2017)</span>
<span class="tldr"><p>When saving data to parquet in Spark/ATMO, avoid using coalesce.</p></span>
</div>
<h1 id="introduction">Introduction</h1>
<p>I ran into some Spark weirdness when working on some ETL.
Specifically, when repartitioning a parquet file with <code>coalesce()</code>, the parallelism for the entire job (including upstream tasks) was constrained by the number of coalesce partitions.
Instead, I expected the upstream jobs to use all available cores.
We should be limited by the number of file partitions only when it's time to actually write the file.</p>
<p>It's probably easier if I demonstrate.
Below I'll create a small example dataframe containing 10 rows.
I'll map a slow function over the example dataframe in a few different ways.
I'd expect these calculations to take a fixed amount of time, since they're happening in parallel.
However, for one example, <strong>execution time will increase linearly with the number of rows</strong>.</p>
<h2 id="setup">Setup</h2>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">time</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">LongType</span>
<span class="n">path</span> <span class="o">=</span> <span class="s2">"~/tmp.parquet"</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">sc</span><span class="o">.</span><span class="n">defaultParallelism</span>
</pre></div>
<div class="codehilite"><pre><span></span>32
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">slow_func</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="sd">"""Identity function that takes 1s to return"""</span>
<span class="n">time</span><span class="o">.</span><span class="n">sleep</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="k">return</span><span class="p">(</span><span class="n">ping</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">timer</span><span class="p">(</span><span class="n">func</span><span class="p">):</span>
<span class="sd">"""Times the execution of a function"""</span>
<span class="n">start_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span>
<span class="n">func</span><span class="p">()</span>
<span class="k">return</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span> <span class="o">-</span> <span class="n">start_time</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1"># Example usage:</span>
<span class="n">timer</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">slow_func</span><span class="p">(</span><span class="mi">10</span><span class="p">))</span>
</pre></div>
<div class="codehilite"><pre><span></span>1.001082181930542
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">create_frame</span><span class="p">(</span><span class="n">rdd</span><span class="p">):</span>
<span class="k">return</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">rdd</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">LongType</span><span class="p">())</span>
</pre></div>
<h2 id="simple-rdd">Simple RDD</h2>
<p>First, let's look at a simple RDD. Everything seems to work as expected here. Execution time levels off to ~3.7 as the dataset increases:</p>
<div class="codehilite"><pre><span></span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">timer</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="n">x</span><span class="p">))</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">slow_func</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="n">x</span><span class="p">)),</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">))</span>
</pre></div>
<div class="codehilite"><pre><span></span>[0.07758498191833496,
118.664391040802,
2.453991174697876,
2.390385866165161,
2.3567309379577637,
2.3262758255004883,
2.3200111389160156,
3.3115720748901367,
3.3115429878234863,
3.274951934814453]
</pre></div>
<h2 id="spark-dataframe">Spark DataFrame</h2>
<p>Let's create a Spark DataFrame and write the contents to parquet without any modification. Again, things seem to be behaving here. Execution time is fairly flat.</p>
<div class="codehilite"><pre><span></span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">timer</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">create_frame</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="n">x</span><span class="p">)))</span>\
<span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="s2">"overwrite"</span><span class="p">)</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">path</span><span class="p">)),</span>
<span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">))</span>
</pre></div>
<div class="codehilite"><pre><span></span>[5.700469017028809,
1.5091090202331543,
1.4622771739959717,
1.448883056640625,
1.4437789916992188,
1.4351229667663574,
1.4368910789489746,
1.4349958896636963,
1.4199819564819336,
1.4395389556884766]
</pre></div>
<h2 id="offending-example">Offending Example</h2>
<p>Now, let's map the slow function over the DataFrame before saving. This should increase execution time by one second for every dataset. However, it looks like <strong>execution time is increasing by one second for each row</strong>.</p>
<div class="codehilite"><pre><span></span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">timer</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">create_frame</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="n">x</span><span class="p">))</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">slow_func</span><span class="p">))</span>\
<span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="s2">"overwrite"</span><span class="p">)</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">path</span><span class="p">)),</span>
<span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">))</span>
</pre></div>
<div class="codehilite"><pre><span></span>[1.42529296875,
2.436065912246704,
3.3423829078674316,
4.332568883895874,
5.268526077270508,
6.280202865600586,
7.169728994369507,
8.18229603767395,
9.098582029342651,
10.119444131851196]
</pre></div>
<h2 id="repartition-fixes-the-issue">Repartition fixes the issue</h2>
<p>Using <code>repartition</code> instead of <code>coalesce</code> fixes the issue.</p>
<div class="codehilite"><pre><span></span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">timer</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">create_frame</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="n">x</span><span class="p">))</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">slow_func</span><span class="p">))</span>\
<span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="s2">"overwrite"</span><span class="p">)</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">path</span><span class="p">)),</span>
<span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">))</span>
</pre></div>
<div class="codehilite"><pre><span></span>[0.8304200172424316,
1.276075839996338,
1.2515549659729004,
1.2429919242858887,
1.2587580680847168,
1.2490499019622803,
1.6439399719238281,
1.229665994644165,
1.2340660095214844,
1.2454640865325928]
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">sc</span><span class="o">.</span><span class="n">cancelAllJobs</span><span class="p">()</span>
</pre></div>
</div>
</div>
</div>
</div>
</div></body></html>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
// Search box wiring: typeahead suggestions fetched from /ajax/index/typeahead,
// navigation to the selected post, and Enter-to-search on the feed page.
var $searchbar = $('#searchbar');
// Pages rendered without the navigation bar have no #searchbar. The unguarded
// $("#searchbar")[0] access used to throw a TypeError there and abort the rest
// of this script, so all of the wiring is skipped when the input is absent.
if ($searchbar.length) {
  // Escape server-supplied text before it is interpolated into suggestion markup.
  var escapeSuggestionText = function (text) {
    return $('<div>').text(text == null ? '' : String(text)).html();
  };

  // Put the caret at the end of any pre-filled query.
  $searchbar[0].setSelectionRange(1000, 1000);

  $searchbar.typeahead({
    hint: false,
    highlight: true,
    minLength: 1
  },
  {
    name: 'knowledge_posts',
    limit: 10,
    // Text placed in the input when a suggestion is chosen.
    display: function (item) {
      return item.title + " - " + item.author;
    },
    templates: {
      empty: Handlebars.compile(
        '<div class="tt-not-found">' +
        'Unable to find any posts that match the current query' +
        '</div>'
      ),
      suggestion: function (data) {
        return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' +
          escapeSuggestionText(data.title) + '</strong> – ' +
          escapeSuggestionText(data.author) + '</p>';
      }
    },
    source: function (q, sync, async) {
      // Encode the query so characters such as "&", "#" and "+" reach the
      // server as part of the search term instead of altering the URL.
      $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
        {
          success: function (data, status) { async(JSON.parse(data)); }
        });
    }
  });

  // Selecting a suggestion opens the corresponding post.
  $searchbar.bind('typeahead:select', function (obj, datum, name) {
    window.location = '/post/' + encodeURIComponent(datum.path);
  });

  // Pressing Enter (key code 13) runs a feed search for the typed text.
  $searchbar.keypress(function (event) {
    var keycode = (event.keyCode ? event.keyCode : event.which);
    if (keycode == 13) {
      window.location = '/feed?filters=' + encodeURIComponent($searchbar.val());
    }
  });

  // Widen the suggestion menu to the input's width plus the menu's own outer width.
  var padding = $('.tt-menu').outerWidth();
  $('.tt-menu').width($searchbar.width() + padding + "px");
}
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Toolbar wiring for this post, run once the DOM is ready: initialises the
// shared tooltips and points the view-switching buttons at their routes.
$(document).ready(function () {
  var postPath = "projects/avoid_coalesce.kp";
  var encodedPath = encodeURI(postPath);

  // Arguments, in order: is_webeditor, post_id, id, data_repo_github_root.
  tooltipsJx.initializeTooltips(false, "34", "None", "");

  // Button selector -> URL opened when that button is clicked.
  var destinations = {
    ".btn-rendered": "/post/" + encodedPath,
    ".btn-raw": "/post/" + encodedPath + "?render=raw",
    ".btn-webeditor": "/edit/" + encodedPath
  };
  $.each(destinations, function (selector, url) {
    $(selector).on("click", function () {
      document.location.href = url;
    });
  });
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax setup: TeX input rendered through the HTML-CSS output jax using the
// TeX fonts. Inline math is delimited by $...$ or \(...\), display math by
// $$...$$ or \[...\]; processEscapes is enabled for the tex2jax preprocessor.
MathJax.Hub.Config({
  "HTML-CSS": { availableFonts: ["TeX"] },
  jax: ["input/TeX", "output/HTML-CSS"],
  extensions: ["tex2jax.js"],
  tex2jax: {
    processEscapes: true,
    inlineMath: [["$", "$"], ["\\(", "\\)"]],
    displayMath: [["$$", "$$"], ["\\[", "\\]"]]
  }
});
</script>
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js" type="text/javascript">
</script>
<script>
// Page initialisation, run once the DOM is ready: marks the "rendered" view
// button active, linkifies headers, and wires the comment and tag-subscription
// buttons.
// Registered with .ready() instead of .on('ready', ...): the 'ready' event
// binding was deprecated in jQuery 1.8 and removed in 3.0, and unlike .ready()
// it never fires when bound after the DOM is already ready.
$(document).ready(function () {
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");
  // Initialize headers
  helpersJx.linkifyHeaders();
  // Initialize comments
  var post_path = 'projects/avoid_coalesce.kp';
  $("#post_comment_btn").on('click', function () {
    // Declared with var so they no longer leak as implicit globals.
    var comment_author = 'knowledge_default';
    var post_author = 'Ryan Harter (:harter)';
    var post_title = 'Prefer repartition to coalesce in Spark';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): assumes postComment completes before the reload - confirm
    // it is synchronous, otherwise the request may be cancelled.
    location.reload();
  });
  // Delete buttons carry ids of the form "delete_comment...__<comment id>".
  $("[id^=delete_comment]").each(function () {
    var button = this;
    $(button).on("click", function () {
      var comment_id = button.id.split("__")[1];
      if (comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });
  // Delegated so that subscription buttons created later (inside popovers)
  // are handled too.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener(this);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function (index, value) {
  $.each(value, function (i, v) {
    var inner_html = v.innerHTML
    // Prefer the header's own id so the link resolves to a real anchor even
    // when the heading contains digits or inline markup. Fall back to the
    // slug derived from the heading content when there is no id.
    var anchor = v.id
    if (!anchor) {
      var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "")
      anchor = inner_html_no_special.toLowerCase().split(" ").join("-")
    }
    v.innerHTML = "<a href='#" + anchor + "' class='link-reset'>" + inner_html + "</a>"
  })
})
//turn all the tags into links, similar to what's done on the feed page
// `tags`, `tags_list` and `subscriptions_list` stay top-level variables: the
// edit-tags click handler later in this script reads `tags` and `tags_list`.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['Spark', 'ATMO']
var subscriptions_list = []
// Pages rendered without a `.tags` span leave `tags` undefined. Rendering is
// skipped in that case instead of throwing a TypeError, which used to abort
// the rest of this script (including the popover setup below).
if (tags) {
  $.each(tags_list, function (i, tag) {
    // All per-tag values are locals; they previously leaked as implicit globals.
    var ahref = document.createElement("a")
    var e_tag = encodeURIComponent(tag)
    // NOTE(review): String.replace with a string pattern only replaces the
    // first "/" - confirm whether tags with several slashes are expected.
    var f_tag = tag.replace("/", "__")
    var tag_name = "#" + tag
    var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
    var subscribed = subscriptions_list.indexOf(tag) >= 0
    ahref.setAttribute("data-container", "body")
    ahref.setAttribute("data-toggle", "popover")
    ahref.setAttribute("data-placement", "bottom")
    ahref.setAttribute("data-html", "true")
    ahref.setAttribute("data-tag-name", f_tag)
    // The popover holds a single button: "Unsubscribe" for tags found in
    // subscriptions_list, "Subscribe" for every other tag.
    ahref.setAttribute("class", subscribed ? "label label-subscribed pop" : "label label-unsubscribed pop")
    ahref.setAttribute("data-content", "<div class='content'>" +
      " <button class='btn btn-small " + (subscribed ? "btn-primary btn-unsubscribe" : "btn-default btn-subscribe") + "'" +
      " title='' " +
      " id='" + tag_subscription_button_id_name + "'> " +
      " <i class='glyphicon " + (subscribed ? "glyphicon-remove-sign glyphicon-white" : "glyphicon-ok-sign glyphicon-filled") + "'></i>" +
      (subscribed ? "Unsubscribe" : "Subscribe") + " " +
      " </button> " +
      " </div>")
    ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
    ahref.setAttribute("style", "font-weight:normal")
    if (i == 0) {
      // The first tag is preceded by the "Tags: " label.
      ahref.innerHTML = " "
      var colon = document.createElement("text")
      colon.innerHTML = "<b>Tags</b>: "
      tags.appendChild(colon)
    }
    ahref.innerHTML = ahref.innerHTML + tag_name
    tags.appendChild(ahref)
    if (i != tags_list.length - 1) {
      // Comma separator between tags, omitted after the last one.
      var comma = document.createElement("text")
      comma.innerText = ", "
      tags.appendChild(comma)
    }
  })
  // Drop the node that directly follows the tags element, when there is one.
  if (tags.nextSibling) {
    tags.nextSibling.remove()
  }
  // Edit icon; its click handler is registered further down in this script.
  tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
}
// Hover-driven popovers: shown on mouseenter, and hidden once the pointer has
// left both the trigger and the popover itself (checked after 300 ms).
$(".pop").popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on("mouseenter", function () {
    var _this = this;
    $(this).popover("show");
    $(".popover").on("mouseleave", function () {
      $(_this).popover('hide');
    });
  }).on("mouseleave", function () {
    var _this = this;
    setTimeout(function () {
      if (!$(".popover:hover").length) {
        $(_this).popover("hide");
      }
    }, 300);
  });
// Click handler for the edit-tags icon. It hides the icon, replaces the
// contents of `tags` with a form (label, text input pre-filled with the
// comma-joined `tags_list`, save icon) and registers the save logic.
// NOTE(review): this handler appears to contain two generations of the edit
// UI - a DOM-API version with inline validation, followed by a jQuery version
// that delegates to iconsJx/tagsJx and is registered from inside this handler.
// Confirm which one is intended before changing anything here.
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// NOTE(review): the names assigned below have no `var`, so they become
// implicit globals; `previousSibling` is not read again in this handler.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Clears everything currently inside the tags element before the form is added.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save icon: validate the edited tag string, POST it, then reload the page.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// Overwrites the page-level tags_list with the raw (untrimmed) pieces.
tags_list = tags_string.split(",")
// Accepted characters: letters, digits, "-", "_", ":" and "/".
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The length check runs before trimming, so a whitespace-only piece passes
// it and is then rejected by the regex test on the trimmed value below.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
// The unmodified input string is what gets sent, not the trimmed tags.
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so the reload below runs only after
// the POST has returned.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=projects/avoid_coalesce.kp',
async: false
});
location.reload()
}
})
// NOTE(review): throws if `tags` has no next sibling at this point - confirm.
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Second edit handler, bound to whatever currently matches #tooltip-edit_tags
// (presumably the icon created just above; verify against icons.js). It
// rebuilds the form with jQuery and saves through tagsJx.changeAndSaveTags.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this only sets a property on the
// wrapper; the visible label comes from the .html("Tags: ") call above.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Three ways to save: Enter in the input (key code 13), clicking the save
// icon, or submitting the form. Returning false cancels the default action.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/avoid_coalesce.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/avoid_coalesce.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// The "form" selector matches every form on the page, not only the one
// created above.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/avoid_coalesce.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,689 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="/static/modules/jquery/jquery.min.js"></script>
<script src="/static/modules/tether/js/tether.min.js"></script>
<script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="/static/modules/handlebars/js/handlebars.js"></script>
<script src="/static/js/helpers.js"></script>
<script src="/static/modules/select2/js/select2.min.js"></script>
<script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link rel="stylesheet" href="/static/modules/bootstrap/css/bootstrap.min.css">
<link rel="stylesheet" href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css">
<link rel="stylesheet" type='text/css' href="/static/css/custom.css">
<link rel="stylesheet" type='text/css' href="/static/modules/select2/css/select2.min.css">
<link href='https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700' rel='stylesheet' type='text/css'>
<link rel="shortcut icon" href="/static/images/favicon.png">
<link rel="stylesheet" href="/static/css/codehilite-friendly.css">
<style>
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="navbar navbar-knowledge" role="navigation">
<div class="container page-container">
<a href="/" aria-selected="false" class="logo-image navbar-text" style='margin-top:16px; margin-left: -7px'>
<img width="125" src='/static/images/logo-white.svg' alt="Knowledge home">
</a>
<a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Home
</a>
<a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Favorites
</a>
<a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style='margin-top:18px'>
About
</a>
<a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style='margin-top:18px'>
Stats
</a>
<a id="webposts_tab" href="/create"
class="btn btn-primary navbar-text pull-right"
style='border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699'>
Write a Post!
</a>
<div class="pull-right">
<div class="form-group">
<input class="form-control" id="searchbar" placeholder="Search for Knowledge" style="text-align:right">
</div>
</div>
</div>
</div>
<div class="container page-container">
<br>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
<div class="btn-group" role="group">
<button type="button" class="btn btn-default btn-rendered">
View Post
</button>
<button type="button" class="btn btn-default btn-raw">
View Raw Markdown
</button>
</div>
</div>
<div class="col-md-2">
</div>
<div class="col-md-4 text-right">
<i class="glyphicon glyphicon-eye-open" id="pageview_stats" style='color: #9CA299'></i>
<div id="pageview_stats" style="display: inline-block">
Viewed 2 times by 1 different users
</div>
<i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop" style="font-size:16pt" id="tooltip-like" data-placement="bottom"
data-trigger="#tooltip-like"
data-container="body"
data-toggle="popover"
data-content='<div>Like This Post</div>'></i>
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class='metadata'>
<h1>Prefer repartition to coalesce in Spark</h1>
<span class='authors'><a href='/feed?authors=Ryan+Harter+%28%3Aharter%29'>Ryan Harter (:harter)</a></span>
<span class='date_created'>March 02, 2017</span>
<span class='date_updated'>(Last Updated: March 02, 2017)</span>
<span class='tldr'><p>When saving data to parquet in Spark/ATMO, avoid using coalesce.</p></span>
<span class='tags'></span>
</div>
<h1 id="introduction">Introduction</h1>
<p>I ran into some Spark weirdness when working on some ETL.
Specifically, when repartitioning a parquet file with <code>coalesce()</code>, the parallelism for the entire job (including upstream tasks) was constrained by the number of coalesce partitions.
Instead, I expected the upstream jobs to use all available cores.
We should be limited by the number of file partitions only when it&rsquo;s time to actually write the file.</p>
<p>It&rsquo;s probably easier if I demonstrate.
Below I&rsquo;ll create a small example dataframe containing 10 rows.
I&rsquo;ll map a slow function over the example dataframe in a few different ways.
I&rsquo;d expect these calculations to take a fixed amount of time, since they&rsquo;re happening in parallel.
However, for one example, <strong>execution time will increase linearly with the number of rows</strong>.</p>
<h2 id="setup">Setup</h2>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">time</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">LongType</span>
<span class="n">path</span> <span class="o">=</span> <span class="s2">&quot;~/tmp.parquet&quot;</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">sc</span><span class="o">.</span><span class="n">defaultParallelism</span>
</pre></div>
<div class="codehilite"><pre><span></span>32
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">slow_func</span><span class="p">(</span><span class="n">ping</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Identity function that takes 1s to return&quot;&quot;&quot;</span>
<span class="n">time</span><span class="o">.</span><span class="n">sleep</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="k">return</span><span class="p">(</span><span class="n">ping</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">timer</span><span class="p">(</span><span class="n">func</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Times the execution of a function&quot;&quot;&quot;</span>
<span class="n">start_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span>
<span class="n">func</span><span class="p">()</span>
<span class="k">return</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span> <span class="o">-</span> <span class="n">start_time</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="c1"># Example usage:</span>
<span class="n">timer</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">slow_func</span><span class="p">(</span><span class="mi">10</span><span class="p">))</span>
</pre></div>
<div class="codehilite"><pre><span></span>1.001082181930542
</pre></div>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">create_frame</span><span class="p">(</span><span class="n">rdd</span><span class="p">):</span>
<span class="k">return</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">rdd</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">LongType</span><span class="p">())</span>
</pre></div>
<h2 id="simple-rdd">Simple RDD</h2>
<p>First, let&rsquo;s look at a simple RDD. Everything seems to work as expected here. Execution time levels off to ~3.7 as the dataset increases:</p>
<div class="codehilite"><pre><span></span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">timer</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="n">x</span><span class="p">))</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">slow_func</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="n">x</span><span class="p">)),</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">))</span>
</pre></div>
<div class="codehilite"><pre><span></span>[0.07758498191833496,
118.664391040802,
2.453991174697876,
2.390385866165161,
2.3567309379577637,
2.3262758255004883,
2.3200111389160156,
3.3115720748901367,
3.3115429878234863,
3.274951934814453]
</pre></div>
<h2 id="spark-dataframe">Spark DataFrame</h2>
<p>Let&rsquo;s create a Spark DataFrame and write the contents to parquet without any modification. Again, things seem to be behaving here. Execution time is fairly flat.</p>
<div class="codehilite"><pre><span></span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">timer</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">create_frame</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="n">x</span><span class="p">)))</span>\
<span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="s2">&quot;overwrite&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">path</span><span class="p">)),</span>
<span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">))</span>
</pre></div>
<div class="codehilite"><pre><span></span>[5.700469017028809,
1.5091090202331543,
1.4622771739959717,
1.448883056640625,
1.4437789916992188,
1.4351229667663574,
1.4368910789489746,
1.4349958896636963,
1.4199819564819336,
1.4395389556884766]
</pre></div>
<h2 id="offending-example">Offending Example</h2>
<p>Now, let&rsquo;s map the slow function over the DataFrame before saving. This should increase execution time by one second for every dataset. However, it looks like <strong>execution time is increasing by one second for each row</strong>.</p>
<div class="codehilite"><pre><span></span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">timer</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">create_frame</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="n">x</span><span class="p">))</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">slow_func</span><span class="p">))</span>\
<span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="s2">&quot;overwrite&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">path</span><span class="p">)),</span>
<span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">))</span>
</pre></div>
<div class="codehilite"><pre><span></span>[1.42529296875,
2.436065912246704,
3.3423829078674316,
4.332568883895874,
5.268526077270508,
6.280202865600586,
7.169728994369507,
8.18229603767395,
9.098582029342651,
10.119444131851196]
</pre></div>
<h2 id="repartition-fixes-the-issue">Repartition fixes the issue</h2>
<p>Using <code>repartition</code> instead of <code>coalesce</code> fixes the issue.</p>
<div class="codehilite"><pre><span></span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">timer</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">create_frame</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="n">x</span><span class="p">))</span>\
<span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">slow_func</span><span class="p">))</span>\
<span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="s2">&quot;overwrite&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="n">path</span><span class="p">)),</span>
<span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">))</span>
</pre></div>
<div class="codehilite"><pre><span></span>[0.8304200172424316,
1.276075839996338,
1.2515549659729004,
1.2429919242858887,
1.2587580680847168,
1.2490499019622803,
1.6439399719238281,
1.229665994644165,
1.2340660095214844,
1.2454640865325928]
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">sc</span><span class="o">.</span><span class="n">cancelAllJobs</span><span class="p">()</span>
</pre></div>
</div>
</div>
</div>
</div>
<!-- Comment section: heading with the comment count, a box for writing a new
     comment, and the button that posts it. The id "post_comment_btn" is bound by
     the page script; "comment-text" is presumably read by the comment-posting
     code, so both ids are kept unchanged. -->
<div class="container-fluid">
  <div class="row">
    <div class="col-md-12">
      <h2> 0 Comments </h2>
      <br>
      <div>
        <div class="row">
          <div class="col-md-12">
            <!-- aria-label gives the field an accessible name; the placeholder alone is not a label. -->
            <textarea class="form-control" id="comment-text" style="height:87px;" placeholder="Leave a comment..." aria-label="Comment"></textarea>
          </div>
        </div>
      </div>
      <br>
      <div>
        <button class="btn btn-primary" id="post_comment_btn" type="button">Post Comment</button>
      </div>
    </div>
  </div>
  <br>
  <div class="row">
    <div class="col-md-12">
    </div>
    <br>
  </div>
</div>
</div>
</div>
<!-- Page footer: credits Knowledge Repo and links to the 0.7.4 release.
     NOTE(review): the "Last indexed" / "Last checked for updates" times look like
     values frozen when this static copy was produced; confirm they should stay. -->
<div class="footer">
Served with <span class="glyphicon glyphicon-heart"></span> by <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a> <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br />
<i title="Last checked for updates: 47 seconds ago">Last indexed: 17 hours ago</i>
</div>
<script type='text/javascript'>
// Search box behaviour: typeahead suggestions, jump-to-post on selection, and a
// full search on Enter. Wrapped in a function so its helpers do not become globals.
(function ($searchbar) {
  // Move the caret to the end of any pre-filled query. Guarded: on a page without
  // a #searchbar element the unguarded call throws and aborts this whole script.
  if ($searchbar.length) {
    $searchbar[0].setSelectionRange(1000, 1000);
  }

  // Titles and author names come from the server and are placed into markup below,
  // so they are HTML-escaped first.
  var escapeHtml = Handlebars.Utils.escapeExpression;

  $searchbar.typeahead({
    hint: false,
    highlight: true,
    minLength: 1
  },
  {
    name: 'knowledge_posts',
    limit: 10,
    // Text shown in the input for a chosen suggestion.
    display: function (item) {
      return item.title + " - " + item.author;
    },
    templates: {
      empty: Handlebars.compile(
        '<div class="tt-not-found">' +
        'Unable to find any posts that match the current query' +
        '</div>'
      ),
      suggestion: function (data) {
        return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + escapeHtml(data.title) + '</strong> – ' + escapeHtml(data.author) + '</p>';
      }
    },
    // Suggestions are delivered through the asynchronous callback only.
    source: function (q, sync, async) {
      // The query is user input: encode it so characters such as "&", "#" and "+"
      // stay part of the search term instead of altering the URL.
      $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q), {
        success: function (data, status) {
          // Accept both a JSON string and a response jQuery has already parsed.
          async(typeof data === 'string' ? JSON.parse(data) : data);
        }
      });
    }
  });

  // Choosing a suggestion opens that post.
  $searchbar.on('typeahead:select', function (obj, datum, name) {
    window.location = '/post/' + encodeURIComponent(datum.path);
  });

  // Enter (key code 13) runs a full search on the feed page.
  $searchbar.keypress(function (event) {
    var keycode = (event.keyCode ? event.keyCode : event.which);
    if (keycode == '13') {
      window.location = '/feed?filters=' + encodeURIComponent($searchbar.val());
    }
  });
})($('#searchbar'));

// Set the suggestion menu's width to the search box width plus the menu's own outer width.
var padding = $('.tt-menu').outerWidth();
$('.tt-menu').width($('#searchbar').width() + padding + "px");
</script>
<!-- Absolute URL so the script still loads when this static copy is served from a host other than the one that generated it. -->
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Once the DOM is ready, set up the tooltips and point the view-mode buttons
// (rendered / raw / web editor) at the matching URL for this post.
$("document").ready(function () {
  var postPath = "projects/avoid_coalesce.kp";
  var encodedPath = encodeURI(postPath);

  // Arguments, in order: is_webeditor, post_id, id, data_repo_github_root.
  tooltipsJx.initializeTooltips(false, "34", "None", "");

  // Button selector -> URL the button navigates to.
  var destinations = {
    ".btn-rendered": "/post/" + encodedPath,
    ".btn-raw": "/post/" + encodedPath + "?render=raw",
    ".btn-webeditor": "/edit/" + encodedPath
  };
  $.each(destinations, function (selector, url) {
    $(selector).on("click", function () {
      document.location.href = url;
    });
  });
});
</script>
<!-- Helper scripts used by the inline code below (helpersJx, tagsJx, iconsJx, commentsJx).
     Absolute URLs so they still load when this static copy is served from a host
     other than the one that generated it. -->
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration for this page: TeX input, HTML-CSS output.
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math is delimited by $...$ or \(...\).
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Display math is delimited by $$...$$ or \[...\].
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// NOTE(review): per the MathJax v2 tex2jax docs this lets \$ stand for a literal dollar sign; confirm.
processEscapes: true
},
// Only the "TeX" fonts are listed as available for the HTML-CSS output.
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- MathJax loader. The cdn.mathjax.org host was retired in 2017, so the library is
     loaded from cdnjs with a pinned 2.x version instead of "latest". The inline
     text/x-mathjax-config block above supplies the configuration. -->
<script type="text/javascript"
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js">
</script>
<script>
// Page setup that runs once the DOM is ready.
// $(handler) replaces $(document).on('ready', handler): the event form was
// deprecated in jQuery 1.8 and removed in jQuery 3.0, where it never fires.
$(function () {
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'projects/avoid_coalesce.kp';

  $("#post_comment_btn").on('click', function () {
    // Declared with var so they no longer leak into the global scope.
    var comment_author = 'knowledge_default';
    var post_author = 'Ryan Harter (:harter)';
    var post_title = 'Prefer repartition to coalesce in Spark';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): reloading here assumes postComment has completed by the time it returns; confirm.
    location.reload();
  });

  // Each delete button carries the comment id after "__" in its element id.
  $("[id^=delete_comment]").each(function () {
    var button = this;
    $(button).on("click", function () {
      var comment_id = button.id.split("__")[1];
      if (comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });

  // Delegated on <body> so it also covers tag-subscription buttons added after this runs.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener(this);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function (index, headers) {
  headers.each(function () {
    var header = this;
    var inner_html = header.innerHTML;
    // Link to the id the header already has, since that is the anchor that exists
    // in the page. The slug derived from the content is only a fallback: it drops
    // digits and punctuation and picks up tag names from nested markup, so it does
    // not always match the real id.
    var anchor = header.id;
    if (!anchor) {
      anchor = inner_html.replace(/[^a-zA-Z\- ]/g, "").toLowerCase().split(" ").join("-");
    }
    // Build the link as an element so the anchor value never needs quoting.
    var link = document.createElement("a");
    link.setAttribute("href", "#" + anchor);
    link.className = "link-reset";
    link.innerHTML = inner_html;
    header.innerHTML = "";
    header.appendChild(link);
  });
});
//turn all the tags into links, similar to what's done on the feed page
// tags, tags_list and subscriptions_list stay top-level: the tag editor code further
// down in this script reads them.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['Spark', 'ATMO']
var subscriptions_list = []

// If the page has no ".tags" element, `tags` is undefined. Without this guard the
// first appendChild throws and nothing after this point in the script runs.
if (tags) {
  $.each(tags_list, function (i, tag) {
    // Loop variables are declared with var so they no longer leak as globals.
    var ahref = document.createElement("a");
    var e_tag = encodeURIComponent(tag);
    // NOTE(review): a string pattern replaces only the first "/"; confirm that is intended.
    var f_tag = tag.replace("/", "__");
    var tag_name = "#" + tag;
    var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag;

    // Popover settings read from the data-* attributes.
    ahref.setAttribute("data-container", "body");
    ahref.setAttribute("data-toggle", "popover");
    ahref.setAttribute("data-placement", "bottom");
    ahref.setAttribute("data-html", "true");
    ahref.setAttribute("data-tag-name", f_tag);

    // The popover body is a single button: Unsubscribe for subscribed tags, Subscribe otherwise.
    if (subscriptions_list.indexOf(tag) >= 0) {
      ahref.setAttribute("class", "label label-subscribed pop");
      ahref.setAttribute("data-content", "<div class='content'>" +
        " <button class='btn btn-small btn-primary btn-unsubscribe'" +
        " title='' " +
        " id='" + tag_subscription_button_id_name + "'> " +
        " <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
        " </button> " +
        " </div>");
    } else {
      ahref.setAttribute("class", "label label-unsubscribed pop");
      ahref.setAttribute("data-content", "<div class='content'>" +
        " <button class='btn btn-small btn-default btn-subscribe'" +
        " title='' " +
        " id='" + tag_subscription_button_id_name + "'> " +
        " <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
        " </button> " +
        " </div>");
    }
    ahref.setAttribute("href", "/tag_pages?tag=" + e_tag);
    ahref.setAttribute("style", "font-weight:normal");

    // The first tag is preceded by the "Tags: " caption and starts with a space.
    if (i == 0) {
      var colon = document.createElement("text");
      colon.innerHTML = "<b>Tags</b>: ";
      tags.appendChild(colon);
    }
    // textContent rather than innerHTML: the tag name is text, not markup.
    ahref.textContent = (i == 0 ? " " : "") + tag_name;
    tags.appendChild(ahref);

    // Comma between tags, none after the last one.
    if (i != tags_list.length - 1) {
      var comma = document.createElement("text");
      comma.innerText = ", ";
      tags.appendChild(comma);
    }
  });

  // Drop the node that follows the tags element, when there is one.
  if (tags.nextSibling) {
    tags.nextSibling.remove();
  }
  // Edit icon that the tag editor below attaches its click handler to.
  tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>";
}
// Hover-driven popovers for the tag labels. The trigger is "manual" so showing
// and hiding is controlled entirely by the two handlers below.
$(".pop")
  .popover({ animation: false, delay: 100, html: true, trigger: "manual" })
  .on({
    // Pointer enters a label: show its popover, and hide it again once the
    // pointer leaves the popover itself.
    mouseenter: function () {
      var label = this;
      $(label).popover("show");
      $(".popover").on("mouseleave", function () {
        $(label).popover('hide');
      });
    },
    // Pointer leaves a label: wait 300 ms, then hide unless the pointer is now
    // over a popover.
    mouseleave: function () {
      var label = this;
      setTimeout(function () {
        var overPopover = $(".popover:hover").length > 0;
        if (!overPopover) {
          $(label).popover("hide");
        }
      }, 300);
    }
  });
// Tag editor: clicking the edit icon replaces the tag list with a small form
// (text input pre-filled with the comma-joined tags, plus a save icon).
// NOTE(review): this handler contains two implementations of the editor. The
// second one (from "Allow user to edit tags" down) is registered from inside the
// first, so it is re-registered on every click. This looks like a merge leftover;
// confirm which version is meant to stay.
$('#tooltip-edit_tags').click(function(){
// Hide the edit icon while the form is shown.
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// previousSibling is assigned here but not read again in this block.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
// The names assigned below without var become globals.
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Clears everything currently inside the tags element before the form is added.
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// First save implementation: validate the tags, POST them, reload the page.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
tags_list = tags_string.split(",")
// Allowed characters: letters, digits, "-", "_", ":" and "/" (case-insensitive).
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The length is checked before trimming, so a whitespace-only entry is
// rejected by the pattern test below rather than by this branch.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
// The message says "alphanumeric", but the pattern also accepts - _ : and /.
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false): the reload below runs only after it returns.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=projects/avoid_coalesce.kp',
async: false
});
location.reload()
}
})
// Still inside the outer click handler: removes the node that follows the tags element.
tags.nextSibling.remove()
// Allow user to edit tags
// NOTE(review): presumably createEditTagsIcon returns a new edit-icon element; verify in icons.js.
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
// Second implementation of the editor, built with jQuery and saved through tagsJx.
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
// tags_text is a jQuery object here, so this sets a property on the wrapper,
// not on the element; the caption was already set by .html() above.
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter (key code 13) in the input saves the tags; returning false cancels the default action.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/avoid_coalesce.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves the tags.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/avoid_coalesce.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Bound to every <form> on the page: saves the tags and cancels the native submit.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/avoid_coalesce.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>
</body>
</html>

Просмотреть файл

@ -0,0 +1,13 @@
{
"title": "Prefer repartition to coalesce in Spark",
"authors": [
"Ryan Harter (:harter)"
],
"tags": [
"Spark",
"ATMO"
],
"publish_date": "2017-03-02",
"updated_at": "2017-03-02",
"tldr": "When saving data to parquet in Spark/ATMO, avoid using coalesce."
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,13 @@
{
"title": "Crash Ping Submission and Recording Delays by Channel",
"authors": [
"chutten"
],
"tags": [
"main ping",
"delay"
],
"publish_date": "2017-01-27",
"updated_at": "2017-01-27",
"tldr": "How long does it take before we get crash pings from users in each channel?"
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,14 @@
{
"title": "Crash Ping Submission and Recording Delays - pingSender",
"authors": [
"chutten"
],
"tags": [
"crash ping",
"delay",
"pingSender"
],
"publish_date": "2017-03-07",
"updated_at": "2017-03-07",
"tldr": "How long does it take before we get crash pings from users that have pingSender vs users who don't?"
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,14 @@
{
"title": "Duplicate Crash Pings",
"authors": [
"chutten"
],
"tags": [
"duplicate",
"dedupe",
"crash"
],
"publish_date": "2017-04-07",
"updated_at": "2017-04-07",
"tldr": "When the patches landed to dedupe crash pings (bug 1354468 has the list), did they work?"
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,12 @@
{
"title": "BHR vs Input Lag Analysis",
"authors": [
"dthayer"
],
"tags": [
"bhr"
],
"publish_date": "2017-07-20",
"updated_at": "2017-07-20",
"tldr": "Analysis of the correlation between BHR hangs and \"Input Lag\" hangs."
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,12 @@
{
"title": "Health ping data analysis (Nightly)",
"authors": [
"Kate Ustiuzhanina"
],
"tags": [
"firefox, telemetry, health"
],
"publish_date": "2017-08-24",
"updated_at": "2017-08-24",
"tldr": "Validate incoming data for the new health ping and look at how clients behave."
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,12 @@
{
"title": "Health ping data analysis (Beta)",
"authors": [
"Kate Ustiuzhanina"
],
"tags": [
"firefox, telemetry, health"
],
"publish_date": "2017-08-24",
"updated_at": "2017-08-24",
"tldr": "Validate incoming data for the new health ping and look at how clients behave."
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,14 @@
{
"title": "Main Ping Submission Delay - pingSender",
"authors": [
"dexter"
],
"tags": [
"main ping",
"delay",
"pingSender"
],
"publish_date": "2017-05-02",
"updated_at": "2017-05-02",
"tldr": "How long does it take before we get main pings from users that have pingSender vs users who don't?"
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,14 @@
{
"title": "Main Ping Submission Delay (Beta Channel) - pingSender",
"authors": [
"dexter"
],
"tags": [
"main ping",
"delay",
"pingSender"
],
"publish_date": "2017-06-22",
"updated_at": "2017-06-22",
"tldr": "How long does it take before we get main pings from users that have pingSender vs users who don't, in the Beta channel?"
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,14 @@
{
"title": "Main Ping Submission Delay (Beta Channel)",
"authors": [
"dexter"
],
"tags": [
"main ping",
"delay",
"pingSender"
],
"publish_date": "2017-07-11",
"updated_at": "2017-07-11",
"tldr": "How long does it take before we get main pings (all reasons) from users that have pingSender vs users who don't, in the Beta channel?"
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,15 @@
{
"title": "new-profile ping validation on Beta",
"authors": [
"dexter"
],
"tags": [
"new-profile",
"latency",
"telemetry",
"spark"
],
"publish_date": "2017-07-04",
"updated_at": "2017-07-04",
"tldr": "This notebook verifies that the 'new-profile' ping behaves as expected on the Beta channel."
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,15 @@
{
"title": "new-profile ping validation on Nightly",
"authors": [
"dexter"
],
"tags": [
"tutorial",
"examples",
"telemetry",
"spark"
],
"publish_date": "2017-06-07",
"updated_at": "2017-06-07",
"tldr": "This notebook verifies that the 'new-profile' ping behaves as expected on the Nightly channel."
}

Просмотреть файл

@ -0,0 +1,467 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
/* Spinner box: fixed at the centre of the viewport (50% offsets pulled back by
   half of the box size via negative margins). */
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif +2px to fix IE8 issue */
}
.table {
font-size: 14px;
}
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>Linux User Counts are Easy to Overestimate</h1>
<span class="authors"><a href="/feed?authors=Ryan+Harter+%28%3Aharter%29">Ryan Harter (:harter)</a></span>
<span class="date_created">February 14, 2017</span>
<span class="date_updated">(Last Updated: February 14, 2017)</span>
<span class="tldr"><p>The longitudinal, main_summary, and cross_sectional datasets can yield misleading Linux user counts over time</p></span>
</div>
<h1 id="linux-user-counts-are-easy-to-overestimate">Linux User Counts are Easy to Overestimate</h1>
<p>This is primarily a summary of <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1333960">Bug 1333960</a> for the public repo.</p>
<h2 id="table-of-contents">Table of Contents</h2>
<div class="toc">
<ul>
<li><a href="#linux-user-counts-are-easy-to-overestimate">Linux User Counts are Easy to Overestimate</a><ul>
<li><a href="#table-of-contents">Table of Contents</a></li>
<li><a href="#problem">Problem</a></li>
<li><a href="#solution">Solution</a></li>
<li><a href="#tldr">TLDR</a></li>
</ul>
</li>
</ul>
</div>
<h2 id="problem">Problem</h2>
<p>I ran into some strangeness when trying to count users for major OSs.
Specifically, my queries consistently showed more Linux users than Mac users
(<a href="https://sql.telemetry.mozilla.org/queries/2374/source#table">example query</a>).
However, if we take the exact same data and look at users per day we show the opposite trend:
more Mac than Linux users every day (<a href="https://sql.telemetry.mozilla.org/queries/2400/source">query</a>).</p>
<h2 id="solution">Solution</h2>
<p>It turns out the root of this problem is <code>client_id</code> churn.
The queries showing more users on Linux than Darwin
state that we&rsquo;ve seen more Linux <code>client_id</code>s than we have Darwin <code>client_id</code>s over time.
But, what if a large portion of those Linux <code>client_id</code>s haven&rsquo;t been active for months? </p>
<p>Consider <a href="https://sql.telemetry.mozilla.org/queries/2399/source#4430">this graph</a> showing the most recent ping for each Linux and Mac <code>client_id</code>.
There are many more stale Linux <code>client_id</code>s.
If it&rsquo;s hard to see, look at <a href="https://bug1333960.bmoattachments.org/attachment.cgi?id=8830740&amp;t=62USxvVHZrR5w3yO8bLvEH">this graph</a> for a clearer image based off of the same data.</p>
<h2 id="tldr">TLDR</h2>
<p>In short, consider your time window when trying to count users with <code>client_id</code>s.
<code>client_id</code> churn is a growing problem as you expand your window.</p>
</div>
</div>
</div>
</div>
</div>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
// Search box behaviour: typeahead suggestions, jump-to-post on selection, and a
// full search on Enter. Wrapped in a function so its helpers do not become globals.
(function ($searchbar) {
  // Move the caret to the end of any pre-filled query. Guarded: on a page without
  // a #searchbar element the unguarded call throws and aborts this whole script.
  if ($searchbar.length) {
    $searchbar[0].setSelectionRange(1000, 1000);
  }

  // Titles and author names come from the server and are placed into markup below,
  // so they are HTML-escaped first.
  var escapeHtml = Handlebars.Utils.escapeExpression;

  $searchbar.typeahead({
    hint: false,
    highlight: true,
    minLength: 1
  },
  {
    name: 'knowledge_posts',
    limit: 10,
    // Text shown in the input for a chosen suggestion.
    display: function (item) {
      return item.title + " - " + item.author;
    },
    templates: {
      empty: Handlebars.compile(
        '<div class="tt-not-found">' +
        'Unable to find any posts that match the current query' +
        '</div>'
      ),
      suggestion: function (data) {
        return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + escapeHtml(data.title) + '</strong> – ' + escapeHtml(data.author) + '</p>';
      }
    },
    // Suggestions are delivered through the asynchronous callback only.
    source: function (q, sync, async) {
      // The query is user input: encode it so characters such as "&", "#" and "+"
      // stay part of the search term instead of altering the URL.
      $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q), {
        success: function (data, status) {
          // Accept both a JSON string and a response jQuery has already parsed.
          async(typeof data === 'string' ? JSON.parse(data) : data);
        }
      });
    }
  });

  // Choosing a suggestion opens that post.
  $searchbar.on('typeahead:select', function (obj, datum, name) {
    window.location = '/post/' + encodeURIComponent(datum.path);
  });

  // Enter (key code 13) runs a full search on the feed page.
  $searchbar.keypress(function (event) {
    var keycode = (event.keyCode ? event.keyCode : event.which);
    if (keycode == '13') {
      window.location = '/feed?filters=' + encodeURIComponent($searchbar.val());
    }
  });
})($('#searchbar'));

// Set the suggestion menu's width to the search box width plus the menu's own outer width.
var padding = $('.tt-menu').outerWidth();
$('.tt-menu').width($('#searchbar').width() + padding + "px");
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// Once the DOM is ready, set up the tooltips and point the view-mode buttons
// (rendered / raw / web editor) at the matching URL for this post.
$("document").ready(function () {
  var postPath = "projects/os_churn_md.kp";
  var encodedPath = encodeURI(postPath);

  // Arguments, in order: is_webeditor, post_id, id, data_repo_github_root.
  tooltipsJx.initializeTooltips(false, "41", "None", "");

  // Button selector -> URL the button navigates to.
  var destinations = {
    ".btn-rendered": "/post/" + encodedPath,
    ".btn-raw": "/post/" + encodedPath + "?render=raw",
    ".btn-webeditor": "/edit/" + encodedPath
  };
  $.each(destinations, function (selector, url) {
    $(selector).on("click", function () {
      document.location.href = url;
    });
  });
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration for this page: TeX input, HTML-CSS output.
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math is delimited by $...$ or \(...\).
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Display math is delimited by $$...$$ or \[...\].
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// NOTE(review): per the MathJax v2 tex2jax docs this lets \$ stand for a literal dollar sign; confirm.
processEscapes: true
},
// Only the "TeX" fonts are listed as available for the HTML-CSS output.
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- MathJax loader. The cdn.mathjax.org host was retired in 2017, so the library is
     loaded from cdnjs with a pinned 2.x version instead of "latest". The inline
     text/x-mathjax-config block above supplies the configuration. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" type="text/javascript">
</script>
<script>
// Page setup that runs once the DOM is ready.
// $(handler) replaces $(document).on('ready', handler): the event form was
// deprecated in jQuery 1.8 and removed in jQuery 3.0, where it never fires.
$(function () {
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  var post_path = 'projects/os_churn_md.kp';

  $("#post_comment_btn").on('click', function () {
    // Declared with var so they no longer leak into the global scope.
    var comment_author = 'knowledge_default';
    var post_author = 'Ryan Harter (:harter)';
    var post_title = 'Linux User Counts are Easy to Overestimate';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    // NOTE(review): reloading here assumes postComment has completed by the time it returns; confirm.
    location.reload();
  });

  // Each delete button carries the comment id after "__" in its element id.
  $("[id^=delete_comment]").each(function () {
    var button = this;
    $(button).on("click", function () {
      var comment_id = button.id.split("__")[1];
      if (comment_id) {
        commentsJx.deleteComment(post_path, comment_id);
        location.reload();
      }
    });
  });

  // Delegated on <body> so it also covers tag-subscription buttons added after this runs.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener(this);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function (index, headers) {
  headers.each(function () {
    var header = this;
    var inner_html = header.innerHTML;
    // Link to the id the header already has, since that is the anchor that exists
    // in the page. The slug derived from the content is only a fallback: it drops
    // digits and punctuation and picks up tag names from nested markup, so it does
    // not always match the real id.
    var anchor = header.id;
    if (!anchor) {
      anchor = inner_html.replace(/[^a-zA-Z\- ]/g, "").toLowerCase().split(" ").join("-");
    }
    // Build the link as an element so the anchor value never needs quoting.
    var link = document.createElement("a");
    link.setAttribute("href", "#" + anchor);
    link.className = "link-reset";
    link.innerHTML = inner_html;
    header.innerHTML = "";
    header.appendChild(link);
  });
});
//turn all the tags into links, similar to what's done on the feed page
// tags, tags_list and subscriptions_list stay top-level so that code later in this
// script can keep reading them.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['main ping']
var subscriptions_list = []

// If the page has no ".tags" element, `tags` is undefined. Without this guard the
// first appendChild throws and nothing after this point in the script runs.
// The metadata block of this static page does not appear to contain one.
if (tags) {
  $.each(tags_list, function (i, tag) {
    // Loop variables are declared with var so they no longer leak as globals.
    var ahref = document.createElement("a");
    var e_tag = encodeURIComponent(tag);
    // NOTE(review): a string pattern replaces only the first "/"; confirm that is intended.
    var f_tag = tag.replace("/", "__");
    var tag_name = "#" + tag;
    var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag;

    // Popover settings read from the data-* attributes.
    ahref.setAttribute("data-container", "body");
    ahref.setAttribute("data-toggle", "popover");
    ahref.setAttribute("data-placement", "bottom");
    ahref.setAttribute("data-html", "true");
    ahref.setAttribute("data-tag-name", f_tag);

    // The popover body is a single button: Unsubscribe for subscribed tags, Subscribe otherwise.
    if (subscriptions_list.indexOf(tag) >= 0) {
      ahref.setAttribute("class", "label label-subscribed pop");
      ahref.setAttribute("data-content", "<div class='content'>" +
        " <button class='btn btn-small btn-primary btn-unsubscribe'" +
        " title='' " +
        " id='" + tag_subscription_button_id_name + "'> " +
        " <i class='glyphicon glyphicon-remove-sign glyphicon-white'></i>Unsubscribe " +
        " </button> " +
        " </div>");
    } else {
      ahref.setAttribute("class", "label label-unsubscribed pop");
      ahref.setAttribute("data-content", "<div class='content'>" +
        " <button class='btn btn-small btn-default btn-subscribe'" +
        " title='' " +
        " id='" + tag_subscription_button_id_name + "'> " +
        " <i class='glyphicon glyphicon-ok-sign glyphicon-filled'></i>Subscribe " +
        " </button> " +
        " </div>");
    }
    ahref.setAttribute("href", "/tag_pages?tag=" + e_tag);
    ahref.setAttribute("style", "font-weight:normal");

    // The first tag is preceded by the "Tags: " caption and starts with a space.
    if (i == 0) {
      var colon = document.createElement("text");
      colon.innerHTML = "<b>Tags</b>: ";
      tags.appendChild(colon);
    }
    // textContent rather than innerHTML: the tag name is text, not markup.
    ahref.textContent = (i == 0 ? " " : "") + tag_name;
    tags.appendChild(ahref);

    // Comma between tags, none after the last one.
    if (i != tags_list.length - 1) {
      var comma = document.createElement("text");
      comma.innerText = ", ";
      tags.appendChild(comma);
    }
  });

  // Drop the node that follows the tags element, when there is one.
  if (tags.nextSibling) {
    tags.nextSibling.remove();
  }
  // Edit icon with id "tooltip-edit_tags", appended after the tag links.
  tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>";
}
// Hover-driven popovers for every .pop element. The trigger is "manual", so
// showing and hiding is done here: show on entering the trigger, hide on
// leaving the popover, and hide 300 ms after leaving the trigger unless the
// pointer is over a popover by then.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on("mouseenter", function () {
    var trigger = this;
    $(trigger).popover("show");
    $(".popover").on("mouseleave", function () {
      $(trigger).popover('hide');
    });
  })
  .on("mouseleave", function () {
    var trigger = this;
    setTimeout(function () {
      if ($(".popover:hover").length) {
        return;
      }
      $(trigger).popover("hide");
    }, 300);
  });
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
tags_list = tags_string.split(",")
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=projects/os_churn_md.kp',
async: false
});
location.reload()
}
})
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/os_churn_md.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/os_churn_md.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/os_churn_md.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Просмотреть файл

@ -0,0 +1,579 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="/static/modules/jquery/jquery.min.js"></script>
<script src="/static/modules/tether/js/tether.min.js"></script>
<script src="/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="/static/modules/handlebars/js/handlebars.js"></script>
<script src="/static/js/helpers.js"></script>
<script src="/static/modules/select2/js/select2.min.js"></script>
<script src="/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link rel="stylesheet" href="/static/modules/bootstrap/css/bootstrap.min.css">
<link rel="stylesheet" href="/static/modules/bootstrap-slider/css/bootstrap-slider.min.css">
<link rel="stylesheet" type='text/css' href="/static/css/custom.css">
<link rel="stylesheet" type='text/css' href="/static/modules/select2/css/select2.min.css">
<link href='https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700' rel='stylesheet' type='text/css'>
<link rel="shortcut icon" href="/static/images/favicon.png">
<link rel="stylesheet" href="/static/css/codehilite-friendly.css">
<style>
/* Page-local style overrides. */
/* Spinner box: fixed in the viewport and centred with 50% offsets plus negative half-size margins. */
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif, +2px to fix an IE8 issue */
}
/* Font size for elements with the .table class. */
.table {
font-size: 14px;
}
/* Cap the width of modal content. */
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<!-- Top navigation bar: logo, section links, "Write a Post!" button and the search box. -->
<div class="navbar navbar-knowledge" role="navigation">
<div class="container page-container">
<a href="/" aria-selected="false" class="logo-image navbar-text" style="margin-top:16px; margin-left: -7px">
<!-- img is a void element (no closing tag); alt names the link because the image is its only content -->
<img width="125" src="/static/images/logo-white.svg" alt="Knowledge Repo home">
</a>
<a id="feed_tab" href="/" aria-selected="false" class="navbar-text" style="margin-top:18px">
Home
</a>
<a id="favorites_tab" href="/favorites" aria-selected="false" class="navbar-text" style="margin-top:18px">
Favorites
</a>
<a id="help_tab" href="/about" aria-selected="false" class="navbar-text" style="margin-top:18px">
About
</a>
<a id="stats_tab" href="/stats" aria-selected="false" class="navbar-text" style="margin-top:18px">
Stats
</a>
<a id="webposts_tab" href="/create"
class="btn btn-primary navbar-text pull-right"
style="border-radius:4px; margin-top:10px; margin-right:-10px; background-color: #00a699; color:white; border-color: #00a699">
Write a Post!
</a>
<div class="pull-right">
<div class="form-group">
<!-- the placeholder is not a label; aria-label gives the field an accessible name -->
<input class="form-control" id="searchbar" placeholder="Search for Knowledge" style="text-align:right" aria-label="Search for Knowledge">
</div>
</div>
</div>
</div>
<div class="container page-container">
<br>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
<div class="btn-group" role="group">
<button type="button" class="btn btn-default btn-rendered">
View Post
</button>
<button type="button" class="btn btn-default btn-raw">
View Raw Markdown
</button>
</div>
</div>
<div class="col-md-2">
</div>
<div class="col-md-4 text-right">
<!-- ids must be unique per document: the icon keeps "pageview_stats" (it was the first match for id lookups); the text gets its own id -->
<i class="glyphicon glyphicon-eye-open" id="pageview_stats" style="color: #9CA299"></i>
<div id="pageview_stats_text" style="display: inline-block">
Viewed 1 times by 1 different users
</div>
<i class="glyphicon glyphicon-heart-empty glyphicon-clickable pop" style="font-size:16pt" id="tooltip-like" data-placement="bottom"
data-trigger="#tooltip-like"
data-container="body"
data-toggle="popover"
data-content='<div>Like This Post</div>'></i>
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class='container-fluid'>
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class='metadata'>
<h1>Linux User Counts are Easy to Overestimate</h1>
<span class='authors'><a href='/feed?authors=Ryan+Harter+%28%3Aharter%29'>Ryan Harter (:harter)</a></span>
<span class='date_created'>February 14, 2017</span>
<span class='date_updated'>(Last Updated: February 14, 2017)</span>
<span class='tldr'><p>The longitudinal, main_summary, and cross_sectional datasets can yield misleading Linux user counts over time</p></span>
<span class='tags'></span>
</div>
<h1 id="linux-user-counts-are-easy-to-overestimate">Linux User Counts are Easy to Overestimate</h1>
<p>This is primarily a summary of <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1333960">Bug 1333960</a> for the public repo.</p>
<h2 id="table-of-contents">Table of Contents</h2>
<div class="toc">
<ul>
<li><a href="#linux-user-counts-are-easy-to-overestimate">Linux User Counts are Easy to Overestimate</a><ul>
<li><a href="#table-of-contents">Table of Contents</a></li>
<li><a href="#problem">Problem</a></li>
<li><a href="#solution">Solution</a></li>
<li><a href="#tldr">TLDR</a></li>
</ul>
</li>
</ul>
</div>
<h2 id="problem">Problem</h2>
<p>I ran into some strangeness when trying to count users for major OS&rsquo;s.
Specifically, my queries consistently showed more Linux users than Mac users
(<a href="https://sql.telemetry.mozilla.org/queries/2374/source#table">example query</a>).
However, if we take the exact same data and look at users per day we show the opposite trend:
more Mac than Linux users every day (<a href="https://sql.telemetry.mozilla.org/queries/2400/source">query</a>).</p>
<h2 id="solution">Solution</h2>
<p>It turns out the root of this problem is <code>client_id</code> churn.
The queries showing more users on Linux than Darwin
state that we&rsquo;ve seen more Linux <code>client_id</code>&lsquo;s than we have Darwin <code>client_id</code>&lsquo;s over time.
But, what if a large portion of those Linux <code>client_id</code>&lsquo;s haven&rsquo;t been active for months? </p>
<p>Consider <a href="https://sql.telemetry.mozilla.org/queries/2399/source#4430">this graph</a> showing the most recent ping for each Linux and Mac <code>client_id</code>.
There are many more stale Linux <code>client_id</code>&lsquo;s.
If it&rsquo;s hard to see look at <a href="https://bug1333960.bmoattachments.org/attachment.cgi?id=8830740&amp;t=62USxvVHZrR5w3yO8bLvEH">this graph</a> for a clearer image based off of the same data.</p>
<h2 id="tldr">TLDR</h2>
<p>In short, consider your time window when trying to count users with <code>client_id</code>s.
<code>client_id</code> churn is a growing problem as you expand your window.</p>
</div>
</div>
</div>
</div>
<!-- Comments: count heading, comment box and the "Post Comment" button.
     "</br>" is not valid markup (parsers treat it as <br>), so it is written as <br>. -->
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<h2> 0 Comments </h2>
<br>
<div>
<div class="row">
<div class="col-md-12">
<!-- textarea has no "type" attribute; aria-label names the field because the placeholder is not a label -->
<textarea class="form-control" id="comment-text" style="height:87px;" placeholder="Leave a comment..." aria-label="Leave a comment"></textarea>
</div>
</div>
</div>
<br>
<div>
<!-- type="button": the click is handled by script, it is not a form submit -->
<button type="button" class="btn btn-primary" id="post_comment_btn">Post Comment</button>
</div>
</div>
</div>
<br>
<div class="row">
<div class="col-md-12">
</div>
<br>
</div>
</div>
</div>
</div>
<div class="footer">
Served with <span class="glyphicon glyphicon-heart"></span> by <a href="https://github.com/airbnb/knowledge-repo">Knowledge Repo</a> <a href="https://github.com/airbnb/knowledge-repo/releases/tag/v0.7.4">0.7.4</a><br />
<i title="Last checked for updates: 36 seconds ago">Last indexed: 14 hours ago</i>
</div>
<script type='text/javascript'>
$("#searchbar")[0].setSelectionRange(1000, 1000);
$('#searchbar').typeahead({
hint: false,
highlight: true,
minLength: 1
},
{
name: 'knowledge_posts',
limit: 10,
display: function (item) {
return item.title + " - " + item.author;
},
templates: {
empty: Handlebars.compile(
'<div class="tt-not-found">' +
'Unable to find any posts that match the current query' +
'</div>'
),
suggestion: function(data) {
return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + data.title + '</strong> – ' + data.author + '</p>';
}
},
source: function(q, sync, async) {
$.ajax('/ajax/index/typeahead?search=' + q,
{
success: function(data,status){ async(JSON.parse(data)); }
})
}
});
$('#searchbar').bind('typeahead:select', function(obj, datum, name) {
window.location = '/post/'+encodeURIComponent(datum.path);
});
$('#searchbar').keypress(function(event){
var keycode = (event.keyCode ? event.keyCode : event.which);
if(keycode == '13'){
var path = document.location.pathname;
window.location = '/feed?filters=' + $('#searchbar').val()
}
});
var padding = $('.tt-menu').outerWidth()
$('.tt-menu').width($('#searchbar').width() + padding + "px")
</script>
<script src="/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
$("document").ready(function(){
var is_webeditor = false;
var post_id = "41";
var id = "None";
var post_path = "projects/os_churn_md.kp"
var data_repo_github_root = ""
tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);
$(".btn-rendered").on("click", function(){
document.location.href = "/post/" + encodeURI(post_path);
})
$(".btn-raw").on("click", function(){
document.location.href = "/post/" + encodeURI(post_path) + "?render=raw";
})
$(".btn-webeditor").on("click", function(){
document.location.href = "/edit/" + encodeURI(post_path);
})
});
</script>
<script src="/static/js/helpers.js"></script>
<script src="/static/js/tags.js" type="text/javascript"></script>
<script src="/static/js/icons.js" type="text/javascript"></script>
<script src="/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in April 2017; load a pinned MathJax 2.x build from cdnjs.
     The text/x-mathjax-config block above must stay before this tag. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js"></script>
<script>
// Page initialisation, run once the DOM is ready: marks the "View Post" button
// active, linkifies headers, and wires the comment and tag-subscription buttons.
// Registered with $(fn) instead of $(document).on('ready', fn): the event form
// was deprecated in jQuery 1.8 and removed in 3.0.
$(function () {
  var post_path = 'projects/os_churn_md.kp';

  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");

  // Initialize headers
  helpersJx.linkifyHeaders();

  // Initialize comments
  $("#post_comment_btn").on('click', function () {
    var comment_author = 'knowledge_default';
    var post_author = 'Ryan Harter (:harter)';
    var post_title = 'Linux User Counts are Easy to Overestimate';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    location.reload();
  });

  // Delete buttons: the comment id is the part of the button id after the
  // first "__"; buttons without one are ignored.
  $("[id^=delete_comment]").on("click", function () {
    var comment_id = this.id.split("__")[1];
    if (comment_id) {
      commentsJx.deleteComment(post_path, comment_id);
      location.reload();
    }
  });

  // Delegated so it also covers subscription buttons that popovers insert later.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener(this);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
// Each h2-h6 gets its content wrapped in <a class="link-reset"> pointing at the
// header itself. The target is the header's own id when it has one; otherwise
// it falls back to a slug derived from the header text.
// NOTE(review): helpersJx.linkifyHeaders() is also called from the ready
// handler above; confirm the two do not both wrap the same headers.
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function (index, headers) {
  $.each(headers, function (i, header) {
    var inner_html = header.innerHTML
    var slug = inner_html.replace(/[^a-zA-Z\- ]/g, "").toLowerCase().split(" ").join("-")
    // A text-derived slug can differ from the real id (e.g. "_" is stripped).
    var inner_link = "#" + (header.id || slug)
    header.innerHTML = "<a class=link-reset>" + inner_html + "</a>"
    // Set href through the DOM so the target never needs HTML-escaping.
    header.firstChild.setAttribute("href", inner_link)
  })
})
//turn all the tags into links, similar to what's done on the feed page
// `tags`, `tags_list` and `subscriptions_list` stay top-level because the
// tag-editing code further down reads them.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['main ping']
var subscriptions_list = []
// Only render when the post metadata has a .tags container; without this guard
// a missing container throws and aborts the rest of this script.
if (tags) {
  $.each(tags_list, function (i, tag) {
    var ahref = document.createElement("a")
    var e_tag = encodeURIComponent(tag)
    // NOTE(review): a string pattern replaces only the first "/"; confirm whether
    // tags with several slashes should have every one mapped to "__".
    var f_tag = tag.replace("/", "__")
    var tag_name = "#" + tag
    var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
    var subscribed = subscriptions_list.indexOf(tag) >= 0

    ahref.setAttribute("data-container", "body")
    ahref.setAttribute("data-toggle", "popover")
    ahref.setAttribute("data-placement", "bottom")
    ahref.setAttribute("data-html", "true")
    ahref.setAttribute("data-tag-name", f_tag)

    // The popover body is a single (un)subscribe button; only the classes,
    // icon and caption differ between the two states.
    ahref.setAttribute("class", subscribed ? "label label-subscribed pop" : "label label-unsubscribed pop")
    ahref.setAttribute("data-content", "<div class='content'>" +
      " <button class='btn btn-small " + (subscribed ? "btn-primary btn-unsubscribe" : "btn-default btn-subscribe") + "'" +
      " title='' " +
      " id='" + tag_subscription_button_id_name + "'> " +
      " <i class='glyphicon " + (subscribed ? "glyphicon-remove-sign glyphicon-white" : "glyphicon-ok-sign glyphicon-filled") + "'></i>" +
      (subscribed ? "Unsubscribe" : "Subscribe") + " " +
      " </button> " +
      " </div>")

    ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
    ahref.setAttribute("style", "font-weight:normal")

    if (i == 0) {
      // The first tag is preceded by the "Tags: " caption.
      ahref.innerHTML = " "
      var colon = document.createElement("text")
      colon.innerHTML = "<b>Tags</b>: "
      tags.appendChild(colon)
    }
    // Append the tag as text, not markup, so a tag can never inject HTML.
    ahref.appendChild(document.createTextNode(tag_name))
    tags.appendChild(ahref)

    if (i != tags_list.length - 1) {
      var comma = document.createElement("text")
      comma.innerText = ", "
      tags.appendChild(comma)
    }
  })
  // Drop the node that follows the tags span, if there is one.
  if (tags.nextSibling) {
    tags.nextSibling.remove()
  }
  tags.innerHTML += "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"
}
// Hover-driven popovers for every .pop element. The trigger is "manual", so
// showing and hiding is done here: show on entering the trigger, hide on
// leaving the popover, and hide 300 ms after leaving the trigger unless the
// pointer is over a popover by then.
$(".pop")
  .popover({ trigger: "manual", html: true, animation: false, delay: 100 })
  .on("mouseenter", function () {
    var trigger = this;
    $(trigger).popover("show");
    $(".popover").on("mouseleave", function () {
      $(trigger).popover('hide');
    });
  })
  .on("mouseleave", function () {
    var trigger = this;
    setTimeout(function () {
      if ($(".popover:hover").length) {
        return;
      }
      $(trigger).popover("hide");
    }, 300);
  });
$('#tooltip-edit_tags').click(function(){
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
tags_list = tags_string.split(",")
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
var postContent = {}
postContent['tags'] = tags_string
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=projects/os_churn_md.kp',
async: false
});
location.reload()
}
})
tags.nextSibling.remove()
// Allow user to edit tags
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
tags.textContent = " ";
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/os_churn_md.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/os_churn_md.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/os_churn_md.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>
</body>
</html>

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,13 @@
{
"title": "Main Ping Submission and Recording Delays by Channel",
"authors": [
"chutten"
],
"tags": [
"main ping",
"delay"
],
"publish_date": "2017-01-20",
"updated_at": "2017-01-20",
"tldr": "How long does it take before we get pings from users in each channel?"
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,13 @@
{
"title": "One Problematic Aurora 51 Client",
"authors": [
"chutten"
],
"tags": [
"aurora",
"firefox"
],
"publish_date": "2017-02-22",
"updated_at": "2017-02-22",
"tldr": "Taking a look at one problematic client on Aurora leads to a broad examination of the types of hosts that are sending us this data and some seriously-speculative conclusions."
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,14 @@
{
"title": "That Aurora 51 Client",
"authors": [
"chutten"
],
"tags": [
"misbehaviour",
"aurora 51",
"one client"
],
"publish_date": "2017-04-28",
"updated_at": "2017-04-28",
"tldr": "More explorations into that 'one' Aurora 51 client"
}

Просмотреть файл

@ -0,0 +1,484 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title> Knowledge </title>
<!-- js includes at the top as post embedded js colliding -->
<script src="https://reports.telemetry.mozilla.org/static/modules/jquery/jquery.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/tether/js/tether.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap/js/bootstrap.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/js/bootstrap-slider.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/typeahead.js/typeahead.bundle.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/handlebars/js/handlebars.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/select2/js/select2.min.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/hightlight.pack.js/highlight.pack.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/modules/marked.js/marked.js"></script>
<!-- require js is used for plotly, but has a bunch of collisions with other js packages
make sure to have it be last js package imported -->
<script src="https://reports.telemetry.mozilla.org/static/modules/require.js/require.min.js"></script>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.6.2/html5shiv.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/2.1.0/es5-shim.min.js"></script>
<![endif]-->
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/bootstrap-slider/css/bootstrap-slider.min.css" rel="stylesheet"/>
<link href="https://reports.telemetry.mozilla.org/static/css/custom.css" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/modules/select2/css/select2.min.css" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,900|Playfair+Display|Source+Serif+Pro:400,700" rel="stylesheet" type="text/css"/>
<link href="https://reports.telemetry.mozilla.org/static/images/favicon.png" rel="shortcut icon"/>
<link href="https://reports.telemetry.mozilla.org/static/css/codehilite-friendly.css" rel="stylesheet"/>
<style>
/* Page-local style overrides. */
/* Spinner box: fixed in the viewport and centred with 50% offsets plus negative half-size margins. */
.spinner {
position: fixed;
top: 50%;
left: 50%;
margin-left: -50px; /* half width of the spinner gif */
margin-top: -50px; /* half height of the spinner gif */
text-align:center;
z-index:1234;
overflow: auto;
width: 100px; /* width of the spinner gif */
height: 102px; /* height of the spinner gif, +2px to fix an IE8 issue */
}
/* Font size for elements with the .table class. */
.table {
font-size: 14px;
}
/* Cap the width of modal content. */
.modal-content {
max-width: 1024px;
}
</style>
</head>
<body>
<div class="container page-container">
<br/>
<div class="container-fluid">
<div class="row">
<div class="col-md-6">
</div>
<div class="col-md-2">
</div>
</div>
<div class="row col-md-12">
</div>
</div>
<div class="container-fluid">
<div class="row">
<div class="col-md-12">
<div id="renderedMarkdown">
<div class="metadata">
<h1>TELEMETRY_SEND Failure Logs</h1>
<span class="authors"><a href="/feed?authors=chutten">chutten</a></span>
<span class="date_created">May 05, 2017</span>
<span class="date_updated">(Last Updated: May 05, 2017)</span>
<span class="tldr"><p>What kind of failures are we seeing when people fail to send Telemetry pings? (bug 1319026)</p></span>
</div>
<h3 id="telemetry_send-failure-logs">TELEMETRY_SEND Failure Logs</h3>
<p><a href="https://bugzilla.mozilla.org/show_bug.cgi?id=1319026">Bug 1319026</a> introduced logs to try and nail down what kinds of failures users experience when trying to send Telemetry pings. Let’s see what we’ve managed to collect.</p>
<div class="codehilite"><pre><span></span><span class="kn">import</span> <span class="nn">ujson</span> <span class="kn">as</span> <span class="nn">json</span>
<span class="kn">import</span> <span class="nn">matplotlib.pyplot</span> <span class="kn">as</span> <span class="nn">plt</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">plotly.plotly</span> <span class="kn">as</span> <span class="nn">py</span>
<span class="kn">from</span> <span class="nn">plotly.graph_objs</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">moztelemetry</span> <span class="kn">import</span> <span class="n">get_pings_properties</span><span class="p">,</span> <span class="n">get_one_ping_per_client</span>
<span class="kn">from</span> <span class="nn">moztelemetry.dataset</span> <span class="kn">import</span> <span class="n">Dataset</span>
<span class="o">%</span><span class="n">matplotlib</span> <span class="n">inline</span>
</pre></div>
<div class="codehilite"><pre><span></span>Unable to parse whitelist (/mnt/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable.
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">pings</span> <span class="o">=</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">from_source</span><span class="p">(</span><span class="s2">"telemetry"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">docType</span><span class="o">=</span><span class="s1">'main'</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">appUpdateChannel</span><span class="o">=</span><span class="s1">'nightly'</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">submissionDate</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span> <span class="o">&gt;=</span> <span class="s2">"20170429"</span><span class="p">)</span> \
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">appBuildId</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span> <span class="o">&gt;=</span> <span class="s1">'20170429'</span><span class="p">)</span> \
<span class="o">.</span><span class="n">records</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sample</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">subset</span> <span class="o">=</span> <span class="n">get_pings_properties</span><span class="p">(</span><span class="n">pings</span><span class="p">,</span> <span class="p">[</span><span class="s2">"clientId"</span><span class="p">,</span>
<span class="s2">"environment/system/os/name"</span><span class="p">,</span>
<span class="s2">"payload/log"</span><span class="p">])</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">log_entries</span> <span class="o">=</span> <span class="n">subset</span>\
<span class="o">.</span><span class="n">flatMap</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">[]</span> <span class="k">if</span> <span class="n">p</span><span class="p">[</span><span class="s1">'payload/log'</span><span class="p">]</span> <span class="ow">is</span> <span class="bp">None</span> <span class="k">else</span> <span class="p">[</span><span class="n">l</span> <span class="k">for</span> <span class="n">l</span> <span class="ow">in</span> <span class="n">p</span><span class="p">[</span><span class="s1">'payload/log'</span><span class="p">]</span> <span class="k">if</span> <span class="n">l</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="s1">'TELEMETRY_SEND_FAILURE'</span><span class="p">])</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">log_entries</span> <span class="o">=</span> <span class="n">log_entries</span><span class="o">.</span><span class="n">cache</span><span class="p">()</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">error_counts</span> <span class="o">=</span> <span class="n">log_entries</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">l</span><span class="p">:</span> <span class="p">(</span><span class="nb">tuple</span><span class="p">(</span><span class="n">l</span><span class="p">[</span><span class="mi">2</span><span class="p">:]),</span> <span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">countByKey</span><span class="p">()</span>
</pre></div>
<div class="codehilite"><pre><span></span><span class="n">entries_count</span> <span class="o">=</span> <span class="n">log_entries</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="nb">sorted</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">i</span><span class="p">:</span> <span class="p">(</span><span class="s1">'{:.2%}'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="mf">1.0</span> <span class="o">*</span> <span class="n">i</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="o">/</span> <span class="n">entries_count</span><span class="p">),</span> <span class="n">i</span><span class="p">),</span> <span class="n">error_counts</span><span class="o">.</span><span class="n">iteritems</span><span class="p">()),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">][</span><span class="mi">1</span><span class="p">],</span> <span class="n">reverse</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
</pre></div>
<div class="codehilite"><pre><span></span>[('72.16%', ((u'errorhandler', u'error'), 530178)),
('27.04%', ((u'errorhandler', u'timeout'), 198698)),
('0.73%', ((u'5xx failure', u'504'), 5327)),
('0.07%', ((u'errorhandler', u'abort'), 530)),
('0.00%', ((u"4xx 'failure'", u'403'), 7)),
('0.00%', ((u'5xx failure', u'502'), 3))]
</pre></div>
<h4 id="conclusion">Conclusion</h4>
<p>Alrighty, looks like we’re mostly “error”. Not too helpful, but does narrow things down a bit.</p>
<p>“timeout” is the reason for more than one in every four failures. That’s a smaller cohort than I’d originally thought.</p>
<p>A few Gateway Timeouts (504) which could be server load, very few aborts, and essentially no Forbidden (403) or Bad Gateway (502).</p>
</div>
</div>
</div>
</div>
</div></body></html>
<div>
</div>
<br/>
<div class="row">
<div class="col-md-12">
</div>
</div>
<script type="text/javascript">
// Move the caret to offset 1000 in the search box, i.e. effectively to the
// end of whatever text is already there.
$("#searchbar")[0].setSelectionRange(1000, 1000);

// Attach typeahead.js to the search box. Suggestions are fetched from the
// server for every query of at least one character.
$('#searchbar').typeahead({
  hint: false,
  highlight: true,
  minLength: 1
},
{
  name: 'knowledge_posts',
  limit: 10,
  // Text written into the input when a suggestion is chosen.
  display: function (item) {
    return item.title + " - " + item.author;
  },
  templates: {
    // Shown when the endpoint returns no matches.
    empty: Handlebars.compile(
      '<div class="tt-not-found">' +
      'Unable to find any posts that match the current query' +
      '</div>'
    ),
    // One row of the dropdown. The title and author come from the server
    // response and are concatenated into markup, so escape them first.
    // NOTE(review): assumes the endpoint returns raw (not pre-escaped) text — confirm.
    suggestion: function(data) {
      var escapeHtml = Handlebars.Utils.escapeExpression;
      return '<p style="overflow-wrap:break-word"><strong class="text-rausch">' + escapeHtml(data.title) + '</strong> – ' + escapeHtml(data.author) + '</p>';
    }
  },
  // Asynchronous source: only the `async` callback is used.
  source: function(q, sync, async) {
    // Encode the query so characters such as & # + ? stay inside the
    // `search` parameter instead of altering the URL.
    $.ajax('/ajax/index/typeahead?search=' + encodeURIComponent(q),
      {
        success: function(data,status){ async(JSON.parse(data)); }
      })
  }
});
// When a suggestion is picked from the dropdown, open that post.
$('#searchbar').on('typeahead:select', function(evt, suggestion) {
  var destination = '/post/' + encodeURIComponent(suggestion.path);
  window.location = destination;
});
// Pressing Enter in the search box opens the feed filtered by the typed text.
$('#searchbar').keypress(function(event){
  var keycode = (event.keyCode ? event.keyCode : event.which);
  if (keycode === 13) {
    // Encode the text so characters such as & # + remain part of the
    // `filters` value rather than being parsed as URL syntax.
    window.location = '/feed?filters=' + encodeURIComponent($('#searchbar').val());
  }
});
// Size the suggestion dropdown to the search box width plus the dropdown's
// current outer width (held in `padding`, despite the name).
var padding = $('.tt-menu').outerWidth();
$('.tt-menu').width([$('#searchbar').width() + padding, "px"].join(""));
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/tooltips.js" type="text/javascript"></script>
<script type="text/javascript">
// On DOM ready: initialise tooltips for this post and wire the three view
// buttons (rendered / raw / web editor) to their URLs.
$(document).ready(function(){
  var is_webeditor = false;
  var post_id = "42";
  var id = "None";
  var post_path = "projects/telemetry_send_failures.kp";
  var data_repo_github_root = "";
  tooltipsJx.initializeTooltips(is_webeditor, post_id, id, data_repo_github_root);

  // Button selector -> page it navigates to.
  var encoded_path = encodeURI(post_path);
  var destinations = {
    ".btn-rendered": "/post/" + encoded_path,
    ".btn-raw": "/post/" + encoded_path + "?render=raw",
    ".btn-webeditor": "/edit/" + encoded_path
  };
  $.each(destinations, function(selector, url){
    $(selector).on("click", function(){
      document.location.href = url;
    });
  });
});
</script>
<script src="https://reports.telemetry.mozilla.org/static/js/helpers.js"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/tags.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/icons.js" type="text/javascript"></script>
<script src="https://reports.telemetry.mozilla.org/static/js/comments.js" type="text/javascript"></script>
<script type="text/x-mathjax-config">
// MathJax configuration, evaluated by MathJax when it loads (this script
// element has type "text/x-mathjax-config").
MathJax.Hub.Config({
// Load the tex2jax preprocessor, which locates TeX delimiters in page text.
extensions: ["tex2jax.js"],
// TeX input, HTML-CSS output.
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Inline math delimiters: $...$ and \(...\)
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
// Display math delimiters: $$...$$ and \[...\]
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// Per the MathJax tex2jax docs, lets \$ stand for a literal dollar sign.
processEscapes: true
},
// Restrict the HTML-CSS output to the TeX font set.
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<!-- cdn.mathjax.org was retired in 2017; load the 2.x loader (required by the
     MathJax.Hub.Config block above) from cdnjs instead. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js" type="text/javascript">
</script>
<script>
// On DOM ready: mark the rendered view as active, linkify headers, and wire
// the comment and tag-subscription controls.
// Uses .ready() rather than .on('ready', ...): the latter was deprecated in
// jQuery 1.8 and removed in 3.0, and never fires if bound after the DOM is ready.
$(document).ready(function(){
  // Make the Rendered Markdown Button active
  $(".btn-rendered").addClass("btn-active");
  // Initialize headers
  helpersJx.linkifyHeaders();
  // Initialize comments
  var post_path = 'projects/telemetry_send_failures.kp';
  $("#post_comment_btn").on('click', function(){
    // Declared locally so they no longer leak as implicit globals.
    var comment_author = 'knowledge_default';
    var post_author = 'chutten';
    var post_title = 'TELEMETRY_SEND Failure Logs';
    commentsJx.postComment(comment_author, post_author, post_title, post_path);
    location.reload();
  });
  // Delete buttons have ids starting with "delete_comment"; the comment id
  // is the part after the first "__" separator.
  $("[id^=delete_comment]").on("click", function(){
    var comment_id = this.id.split("__")[1];
    if (comment_id) {
      commentsJx.deleteComment(post_path, comment_id);
      location.reload();
    }
  });
  // Delegated handler: subscription buttons are created later inside
  // popovers, so listen on <body> instead of on the buttons themselves.
  $(document.body).on('click', "button[id^=tag-subscription]", function () {
    tagsJx.addTagSubscriptionListener(this);
  });
});
//Turn all the headers to be links
//h1 = Title, don't want that
var all_headers = [$("h2"), $("h3"), $("h4"), $("h5"), $("h6")]
$.each(all_headers, function(index, value){
  $.each(value, function(i, v){
    var inner_html = v.innerHTML
    // Build the anchor: keep only letters, hyphens and spaces, lowercase
    // the result, then turn spaces into hyphens.
    // (Declared with var; it was previously an implicit global.)
    var inner_html_no_special = inner_html.replace(/[^a-zA-Z\- ]/g, "")
    var inner_link = "#" + inner_html_no_special.toLowerCase().split(" ").join("-")
    // Wrap the existing header content in a link to that anchor.
    v.innerHTML = "<a href='" + inner_link + "' class='link-reset'>" + inner_html + "</a>"
  })
})
//turn all the tags into links, similar to what's done on the feed page
// `tags`, `tags_list` and `subscriptions_list` stay as script-level variables
// because later code in this script reads them. Everything created per tag is
// now declared with var instead of leaking as an implicit global.
var tags = $("#renderedMarkdown .metadata .tags")[0]
var tags_list = ['log', 'failure', 'telemetry', 'send']
var subscriptions_list = []
$.each(tags_list, function(i, tag){
  var ahref = document.createElement("a")
  var e_tag = encodeURIComponent(tag)
  // NOTE(review): a string pattern replaces only the first "/" in the tag —
  // confirm whether tags with several slashes are expected.
  var f_tag = tag.replace("/", "__")
  var tag_name = "#" + tag
  var tag_subscription_button_id_name = "tag-subscription-" + i + "__" + f_tag
  var subscribed = subscriptions_list.indexOf(tag) >= 0

  // Popover body: a single button. Only the button classes, icon classes
  // and caption differ between the subscribed and unsubscribed states.
  var popover_content = "<div class='content'>" +
    " <button class='btn btn-small " + (subscribed ? "btn-primary btn-unsubscribe" : "btn-default btn-subscribe") + "'" +
    " title='' " +
    " id='" + tag_subscription_button_id_name + "'> " +
    " <i class='glyphicon " + (subscribed ? "glyphicon-remove-sign glyphicon-white" : "glyphicon-ok-sign glyphicon-filled") + "'></i>" +
    (subscribed ? "Unsubscribe" : "Subscribe") + " " +
    " </button> " +
    " </div>"

  ahref.setAttribute("data-container", "body")
  ahref.setAttribute("data-toggle", "popover")
  ahref.setAttribute("data-placement", "bottom")
  ahref.setAttribute("data-html", "true")
  ahref.setAttribute("data-tag-name", f_tag)
  ahref.setAttribute("class", subscribed ? "label label-subscribed pop" : "label label-unsubscribed pop")
  ahref.setAttribute("data-content", popover_content)
  ahref.setAttribute("href", "/tag_pages?tag=" + e_tag)
  ahref.setAttribute("style", "font-weight:normal")

  if (i == 0){
    // The first tag is preceded by the "Tags: " label and gets a leading space.
    var colon = document.createElement("text")
    colon.innerHTML = "<b>Tags</b>: "
    tags.appendChild(colon)
    ahref.innerHTML = " " + tag_name
  } else {
    ahref.innerHTML = tag_name
  }
  tags.appendChild(ahref)

  // Comma between tags, but not after the last one.
  if (i != tags_list.length - 1){
    var comma = document.createElement("text")
    comma.innerText = ", "
    tags.appendChild(comma)
  }
})
// Remove the node that directly follows the tags container, then append the
// edit-tags icon to the container's markup.
tags.nextSibling.remove()
tags.innerHTML = tags.innerHTML + "<i class='glyphicon glyphicon-edit icon-gray' style='font-size:12pt; padding-left:4px' id='tooltip-edit_tags'></i>"

// Popovers on the tag links are opened and closed manually so they stay
// visible while the pointer moves from the link onto the popover itself.
$(".pop")
  .popover({ trigger: "manual" , html: true, animation:false, delay: 100})
  .on("mouseenter", function () {
    var link = this;
    $(link).popover("show");
    // Close once the pointer leaves the popover.
    $(".popover").on("mouseleave", function () {
      $(link).popover('hide');
    });
  })
  .on("mouseleave", function () {
    var link = this;
    // Wait 300 ms; hide only if the pointer did not end up over a popover.
    setTimeout(function () {
      if ($(".popover:hover").length === 0) {
        $(link).popover("hide");
      }
    }, 300);
  });
// Click handler for the edit-tags icon. It hides the icon, replaces the tag
// links with an inline text form pre-filled with the current tags, and wires
// a save icon.
// NOTE(review): this handler contains two versions of the same flow. The
// first builds the form with raw DOM calls (down to the first save handler);
// the second, starting at "Allow user to edit tags", is registered from
// INSIDE this handler and uses iconsJx/tagsJx. Looks like a template merge
// artifact — confirm which version is intended.
$('#tooltip-edit_tags').click(function(){
// Hide the edit icon while the editor is open.
$('#tooltip-edit_tags')[0].setAttribute("style", "display:none")
// The assignments below have no var, so they create implicit globals.
// previousSibling is not read anywhere in this handler.
previousSibling = $("#tooltip-edit_tags")[0].previousSibling
tags_string = tags_list.join(", ")
form = document.createElement("form")
input = document.createElement("input")
tags_text = document.createElement("text")
icon_class = document.createElement("i")
// Save icon; its id is what the save click handler below selects on.
icon_class.setAttribute("class", "glyphicon glyphicon-upload icon-gray")
icon_class.setAttribute("style", "font-size:23px; padding-left:4px")
icon_class.setAttribute("id", "tooltip-save_tags")
tags_text.innerText = "Tags: "
input.setAttribute('type', 'text')
input.setAttribute('name', 'tags_list')
input.setAttribute('value', tags_string)
input.setAttribute('style', 'width:75%; display: inline-block')
input.setAttribute('id' , 'change_tags')
form.appendChild(tags_text)
// Clears everything currently inside the tags container (tag links and icon).
tags.textContent = " "
form.appendChild(input)
form.appendChild(icon_class)
tags.appendChild(form)
// Save: validate the comma-separated tags, POST them, then reload the page.
$("#tooltip-save_tags").click(function(){
tags_string = $("#change_tags")[0].value
// Overwrites the script-level tags_list with the raw (untrimmed) pieces.
tags_list = tags_string.split(",")
// Allowed characters: letters, digits, - _ : /
var re = /^[a-z0-9\-\_\:\/]+$/i
var good = true
for (var i = 0; i < tags_list.length; i++){
tag = tags_list[i]
// The length check runs before trimming, so a whitespace-only piece passes
// here and is rejected by the regex test below instead.
if (tag.length == 0){
alert("There is a tag with length 0 - possible a trailing comma?")
good = false
break
} else {
tag_name = tag.trim()
if (!(re.test(tag_name))){
alert("The tag contains special characters. Make sure there are only alphanumeric characters in your tag")
good = false
break
}
}
}
if (good) {
// Send the tags string as a JSON body: {"tags": "<comma-separated tags>"}.
var postContent = {}
postContent['tags'] = tags_string
// Synchronous request (async: false), so the reload below runs only after
// the call has returned.
$.ajax({
type: "POST",
dataType: "json",
data: JSON.stringify(postContent),
contentType: "application/json",
url: '/tag_list?post_path=projects/telemetry_send_failures.kp',
async: false
});
location.reload()
}
})
// Runs on every click of the edit icon: removes the node following the tags
// container.
tags.nextSibling.remove()
// Allow user to edit tags
// Inserts a fresh edit icon after the tags container and registers another
// click handler on "#tooltip-edit_tags".
// NOTE(review): presumably createEditTagsIcon() returns an element with that
// id — confirm against icons.js.
var edit_icon = iconsJx.createEditTagsIcon();
$(tags).after(edit_icon);
$("#tooltip-edit_tags").on("click", function(){
var edit_tooltip = $("#tooltip-edit_tags");
edit_tooltip.attr("style", "display:none");
var tags_string = tags_list.join(", ");
var form = $("<form>");
var input = $("<input>");
var tags_text = $("<text>");
tags_text.html("Tags: ");
var icon = iconsJx.createSaveTagsIcon();
input.attr("type", "text");
input.attr("name", "tags_list");
input.attr("style", "width:75%; display: inline-block");
input.attr("id", "change_tags");
form.append(tags_text);
// Clears the tags container again, including any form added earlier.
tags.textContent = " ";
// tags_text is a jQuery object here, so this sets a property on the wrapper
// rather than on the element; the label text was already set via .html().
tags_text.innerHTML = "Tags: ";
form.append(input);
form.append(icon);
tags.appendChild(form[0]);
$("#change_tags")[0].value = tags_string;
// Enter inside the input saves; returning false suppresses the default action.
$("#change_tags").keypress(function(e){
if (e.which == 13){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/telemetry_send_failures.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false;
};
});
// Clicking the save icon saves.
$("#tooltip-save_tags").click(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/telemetry_send_failures.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
});
// Bound to every <form> on the page (selector "form"), not only the one
// created above; returning false cancels the native submit.
$("form").submit(function(){
var tags_string = $("#change_tags")[0].value;
var post_path = "projects/telemetry_send_failures.kp";
tagsJx.changeAndSaveTags(post_path, tags_string);
return false
})
});
});
</script>

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше