Remove merge conflicts
This commit is contained in:
benmiroglio 2017-04-05 10:48:57 -07:00 коммит произвёл Mark Reid
Родитель 8465203547
Коммит 265f431958
3 изменённых файлов: 23 добавлений и 22 удалений

Просмотреть файл

@ -1 +1 @@
5
6

Просмотреть файл

@ -7,7 +7,7 @@ tags:
- okr
- derived dataset
created_at: 2017-02-08 00:00:00
updated_at: 2017-02-28 10:06:12.500299
updated_at: 2017-03-24 11:10:05.989452
tldr: script to be run daily that contructs the addon_aggregates table in re:dash
---
# Add-ons 2017 OKR Data Collection
@ -112,7 +112,7 @@ ms = ms.filter(ms.submission_date_s3 == target_date)
These are the aggregations / joins that we **don't** want to do in re:dash.
* The resulting table is one row per distinct client, day, and install type
* The resulting table is one row per distinct client, day, channel, and install type
+ foreign_install = true -> side-loaded add-on, foreign_install = false -> self-installed add-on
* Each client has a static field for profile_creation_date and min_install_day (earliest add-on installation date)
* Each client has a daily field `user_type`
@ -136,20 +136,22 @@ default_theme_id = "{972ce4c6-7e08-4474-a285-3208198ce6fd}"
# count of distinct client submission_date, install type
count_by_client_day = addons\
.select(['client_id', 'submission_date_s3',
'foreign_install', 'addon_id'])\
.select(['client_id', 'submission_date_s3', 'normalized_channel',
'foreign_install', 'addon_id'])\
.distinct()\
.groupBy(['client_id', 'submission_date_s3','foreign_install'])\
.groupBy(['client_id', 'submission_date_s3','foreign_install', 'normalized_channel'])\
.count()
# count of clients that have only foreign_installed, only self_installed and both
user_types = count_by_client_day\
.select(['client_id', 'submission_date_s3', bool_to_int('foreign_install').alias('user_type')])\
.groupBy(['client_id', 'submission_date_s3'])\
.select(['client_id', 'submission_date_s3', 'normalized_channel',
bool_to_int('foreign_install').alias('user_type')])\
.groupBy(['client_id', 'submission_date_s3', 'normalized_channel'])\
.sum('user_type')\
.withColumnRenamed('sum(user_type)', 'user_type')
count_by_client_day = count_by_client_day.join(user_types, on=['client_id', 'submission_date_s3'])
count_by_client_day = count_by_client_day.join(user_types,
on=['client_id', 'submission_date_s3', 'normalized_channel'])
# does a client have a custom theme?
@ -195,10 +197,6 @@ current = optimize_repartition(current, record_size=38)
current.write.format("parquet")\
.save('s3://' + dest + '/submission_date_s3={}'.format(target_date), mode='overwrite')
```
-- Found 59350013 records -- Repartitioning with 9 partitions
CPU times: user 240 ms, sys: 60 ms, total: 300 ms
Wall time: 10min 20s
```python
@ -214,3 +212,4 @@ current.printSchema()
|-- has_custom_theme: long (nullable = true)
|-- n_custom_theme_clients: integer (nullable = false)
|-- n_clients: integer (nullable = false)

Просмотреть файл

@ -172,7 +172,7 @@
"source": [
"These are the aggregations / joins that we **don't** want to do in re:dash.\n",
"\n",
"* The resulting table is one row per distinct client, day, and install type\n",
"* The resulting table is one row per distinct client, day, channel, and install type\n",
" + foreign_install = true -> side-loaded add-on, foreign_install = false -> self-installed add-on\n",
"* Each client has a static field for profile_creation_date and min_install_day (earliest add-on installation date)\n",
"* Each client has a daily field `user_type`\n",
@ -204,20 +204,22 @@
"\n",
"# count of distinct client submission_date, install type\n",
"count_by_client_day = addons\\\n",
" .select(['client_id', 'submission_date_s3',\n",
" 'foreign_install', 'addon_id'])\\\n",
" .select(['client_id', 'submission_date_s3', 'normalized_channel',\n",
" 'foreign_install', 'addon_id'])\\\n",
" .distinct()\\\n",
" .groupBy(['client_id', 'submission_date_s3','foreign_install'])\\\n",
" .groupBy(['client_id', 'submission_date_s3','foreign_install', 'normalized_channel'])\\\n",
" .count()\n",
"\n",
"# count of clients that have only foreign_installed, only self_installed and both\n",
"user_types = count_by_client_day\\\n",
" .select(['client_id', 'submission_date_s3', bool_to_int('foreign_install').alias('user_type')])\\\n",
" .groupBy(['client_id', 'submission_date_s3'])\\\n",
" .select(['client_id', 'submission_date_s3', 'normalized_channel',\n",
" bool_to_int('foreign_install').alias('user_type')])\\\n",
" .groupBy(['client_id', 'submission_date_s3', 'normalized_channel'])\\\n",
" .sum('user_type')\\\n",
" .withColumnRenamed('sum(user_type)', 'user_type')\n",
"\n",
"count_by_client_day = count_by_client_day.join(user_types, on=['client_id', 'submission_date_s3'])\n",
"count_by_client_day = count_by_client_day.join(user_types, \n",
" on=['client_id', 'submission_date_s3', 'normalized_channel'])\n",
"\n",
"\n",
"# does a client have a custom theme?\n",
@ -279,9 +281,9 @@
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [Root]",
"display_name": "Python [conda root]",
"language": "python",
"name": "Python [Root]"
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {