Remove merge conflicts
This commit is contained in:
benmiroglio 2017-04-05 10:48:57 -07:00 коммит произвёл Mark Reid
Родитель 8465203547
Коммит 265f431958
3 изменённых файлов: 23 добавлений и 22 удалений

Просмотреть файл

@ -1 +1 @@
5 6

Просмотреть файл

@ -7,7 +7,7 @@ tags:
- okr - okr
- derived dataset - derived dataset
created_at: 2017-02-08 00:00:00 created_at: 2017-02-08 00:00:00
updated_at: 2017-02-28 10:06:12.500299 updated_at: 2017-03-24 11:10:05.989452
tldr: script to be run daily that constructs the addon_aggregates table in re:dash tldr: script to be run daily that constructs the addon_aggregates table in re:dash
--- ---
# Add-ons 2017 OKR Data Collection # Add-ons 2017 OKR Data Collection
@ -112,7 +112,7 @@ ms = ms.filter(ms.submission_date_s3 == target_date)
These are the aggregations / joins that we **don't** want to do in re:dash. These are the aggregations / joins that we **don't** want to do in re:dash.
* The resulting table is one row per distinct client, day, and install type * The resulting table is one row per distinct client, day, channel, and install type
+ foreign_install = true -> side-loaded add-on, foreign_install = false -> self-installed add-on + foreign_install = true -> side-loaded add-on, foreign_install = false -> self-installed add-on
* Each client has a static field for profile_creation_date and min_install_day (earliest add-on installation date) * Each client has a static field for profile_creation_date and min_install_day (earliest add-on installation date)
* Each client has a daily field `user_type` * Each client has a daily field `user_type`
@ -136,20 +136,22 @@ default_theme_id = "{972ce4c6-7e08-4474-a285-3208198ce6fd}"
# count of distinct client submission_date, install type # count of distinct client submission_date, install type
count_by_client_day = addons\ count_by_client_day = addons\
.select(['client_id', 'submission_date_s3', .select(['client_id', 'submission_date_s3', 'normalized_channel',
'foreign_install', 'addon_id'])\ 'foreign_install', 'addon_id'])\
.distinct()\ .distinct()\
.groupBy(['client_id', 'submission_date_s3','foreign_install'])\ .groupBy(['client_id', 'submission_date_s3','foreign_install', 'normalized_channel'])\
.count() .count()
# count of clients that have only foreign_installed, only self_installed and both # count of clients that have only foreign_installed, only self_installed and both
user_types = count_by_client_day\ user_types = count_by_client_day\
.select(['client_id', 'submission_date_s3', bool_to_int('foreign_install').alias('user_type')])\ .select(['client_id', 'submission_date_s3', 'normalized_channel',
.groupBy(['client_id', 'submission_date_s3'])\ bool_to_int('foreign_install').alias('user_type')])\
.groupBy(['client_id', 'submission_date_s3', 'normalized_channel'])\
.sum('user_type')\ .sum('user_type')\
.withColumnRenamed('sum(user_type)', 'user_type') .withColumnRenamed('sum(user_type)', 'user_type')
count_by_client_day = count_by_client_day.join(user_types, on=['client_id', 'submission_date_s3']) count_by_client_day = count_by_client_day.join(user_types,
on=['client_id', 'submission_date_s3', 'normalized_channel'])
# does a client have a custom theme? # does a client have a custom theme?
@ -195,10 +197,6 @@ current = optimize_repartition(current, record_size=38)
current.write.format("parquet")\ current.write.format("parquet")\
.save('s3://' + dest + '/submission_date_s3={}'.format(target_date), mode='overwrite') .save('s3://' + dest + '/submission_date_s3={}'.format(target_date), mode='overwrite')
``` ```
-- Found 59350013 records -- Repartitioning with 9 partitions
CPU times: user 240 ms, sys: 60 ms, total: 300 ms
Wall time: 10min 20s
```python ```python
@ -214,3 +212,4 @@ current.printSchema()
|-- has_custom_theme: long (nullable = true) |-- has_custom_theme: long (nullable = true)
|-- n_custom_theme_clients: integer (nullable = false) |-- n_custom_theme_clients: integer (nullable = false)
|-- n_clients: integer (nullable = false) |-- n_clients: integer (nullable = false)

Просмотреть файл

@ -172,7 +172,7 @@
"source": [ "source": [
"These are the aggregations / joins that we **don't** want to do in re:dash.\n", "These are the aggregations / joins that we **don't** want to do in re:dash.\n",
"\n", "\n",
"* The resulting table is one row per distinct client, day, and install type\n", "* The resulting table is one row per distinct client, day, channel, and install type\n",
" + foreign_install = true -> side-loaded add-on, foreign_install = false -> self-installed add-on\n", " + foreign_install = true -> side-loaded add-on, foreign_install = false -> self-installed add-on\n",
"* Each client has a static field for profile_creation_date and min_install_day (earliest add-on installation date)\n", "* Each client has a static field for profile_creation_date and min_install_day (earliest add-on installation date)\n",
"* Each client has a daily field `user_type`\n", "* Each client has a daily field `user_type`\n",
@ -204,20 +204,22 @@
"\n", "\n",
"# count of distinct client submission_date, install type\n", "# count of distinct client submission_date, install type\n",
"count_by_client_day = addons\\\n", "count_by_client_day = addons\\\n",
" .select(['client_id', 'submission_date_s3',\n", " .select(['client_id', 'submission_date_s3', 'normalized_channel',\n",
" 'foreign_install', 'addon_id'])\\\n", " 'foreign_install', 'addon_id'])\\\n",
" .distinct()\\\n", " .distinct()\\\n",
" .groupBy(['client_id', 'submission_date_s3','foreign_install'])\\\n", " .groupBy(['client_id', 'submission_date_s3','foreign_install', 'normalized_channel'])\\\n",
" .count()\n", " .count()\n",
"\n", "\n",
"# count of clients that have only foreign_installed, only self_installed and both\n", "# count of clients that have only foreign_installed, only self_installed and both\n",
"user_types = count_by_client_day\\\n", "user_types = count_by_client_day\\\n",
" .select(['client_id', 'submission_date_s3', bool_to_int('foreign_install').alias('user_type')])\\\n", " .select(['client_id', 'submission_date_s3', 'normalized_channel',\n",
" .groupBy(['client_id', 'submission_date_s3'])\\\n", " bool_to_int('foreign_install').alias('user_type')])\\\n",
" .groupBy(['client_id', 'submission_date_s3', 'normalized_channel'])\\\n",
" .sum('user_type')\\\n", " .sum('user_type')\\\n",
" .withColumnRenamed('sum(user_type)', 'user_type')\n", " .withColumnRenamed('sum(user_type)', 'user_type')\n",
"\n", "\n",
"count_by_client_day = count_by_client_day.join(user_types, on=['client_id', 'submission_date_s3'])\n", "count_by_client_day = count_by_client_day.join(user_types, \n",
" on=['client_id', 'submission_date_s3', 'normalized_channel'])\n",
"\n", "\n",
"\n", "\n",
"# does a client have a custom theme?\n", "# does a client have a custom theme?\n",
@ -279,9 +281,9 @@
"metadata": { "metadata": {
"anaconda-cloud": {}, "anaconda-cloud": {},
"kernelspec": { "kernelspec": {
"display_name": "Python [Root]", "display_name": "Python [conda root]",
"language": "python", "language": "python",
"name": "Python [Root]" "name": "conda-root-py"
}, },
"language_info": { "language_info": {
"codemirror_mode": { "codemirror_mode": {