taar/analysis/TAARExperimentV2Retention.i...

968 строки
207 KiB
Plaintext
Исходник Постоянная ссылка Обычный вид История

2018-07-11 01:26:10 +03:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 99,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [],
"source": [
"import pyspark.sql.functions as F\n",
"import datetime as dt\n",
"import pandas as pd\n",
"import pyspark.sql.types as st\n",
"import matplotlib.pyplot as plt\n",
"import seaborn\n",
"import numpy as np\n",
"import statsmodels.api as sm\n",
"from IPython.display import Markdown\n",
2018-07-11 01:26:10 +03:00
"\n",
"seaborn.set_style(\"whitegrid\")\n",
"sc.setLogLevel(\"INFO\")\n",
"udf = F.udf\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [],
"source": [
"PERIODS = {}\n",
"N_WEEKS = 12\n",
2018-07-11 01:26:10 +03:00
"for i in range(1, N_WEEKS + 1):\n",
" PERIODS[i] = {\n",
" 'start': i * 7,\n",
" 'end': i * 7 + 6\n",
" }\n",
"\n",
"\n",
"def date_plus_x_days(date, x):\n",
" \"\"\"\n",
" Returns a string date x days away from <date>\n",
" \n",
" Params:\n",
" date (str): date in %Y%m%s format\n",
" x (int) number of days to add to <date> (can be negative)\n",
" \n",
" >>> date_plus_x_days(\"20180101\", 1)\n",
" \"20180102\"\n",
" \n",
" >>> date_plus_x_days(\"20180510\", -9)\n",
" \"20180501\"\n",
" \"\"\"\n",
" new_date = dt.datetime.strptime(date, '%Y%m%d') + dt.timedelta(days=x)\n",
" return new_date.strftime('%Y%m%d')\n",
"\n",
"\n",
"def date_diff(d1, d2, fmt='%Y%m%d'):\n",
" \"\"\"\n",
" Returns days elapsed from d2 to d1 as an integer\n",
"\n",
" Params:\n",
" d1 (str)\n",
" d2 (str)\n",
" fmt (str): format of d1 and d2 (must be the same)\n",
"\n",
" >>> date_diff('20170205', '20170201')\n",
" 4\n",
"\n",
" >>> date_diff('20170201', '20170205)\n",
" -4\n",
" \"\"\"\n",
" try:\n",
" return (pd.to_datetime(d1, format=fmt) - \n",
" pd.to_datetime(d2, format=fmt)).days\n",
" except:\n",
" return None\n",
"\n",
"\n",
"@udf(returnType=st.IntegerType())\n",
"def get_period(anchor, submission_date_s3):\n",
" \"\"\"\n",
" Given an anchor and a submission_date_s3,\n",
" returns what period a ping belongs to. This \n",
" is a spark UDF (see decoration).\n",
"\n",
" Params:\n",
" anchor (col): anchor date\n",
" submission_date_s3 (col): a ping's submission_date to s3\n",
"\n",
" Global:\n",
" PERIODS (dict): defined globally based on n-week method\n",
"\n",
" Returns an integer indicating the retention period\n",
" \"\"\"\n",
" if anchor is not None:\n",
" diff = date_diff(submission_date_s3, anchor)\n",
" if diff >= 7: # exclude first 7 days\n",
" for period in sorted(PERIODS):\n",
" if diff <= PERIODS[period]['end']:\n",
" return period\n",
" \n",
"def get_retention(data):\n",
" branch_counts = (\n",
" data\n",
" .groupby(\"branch\")\n",
" .agg(F.countDistinct(\"client_id\").alias(\"total_clients\"))\n",
" )\n",
"\n",
" weekly_counts = (\n",
" data\n",
" .groupby(\"period\", \"branch\")\n",
" .agg(F.countDistinct(\"client_id\").alias(\"n_week_clients\"))\n",
" )\n",
"\n",
" retention_by_branch = (\n",
" weekly_counts\n",
" .join(branch_counts, on='branch')\n",
" .withColumn(\"retention\", F.col(\"n_week_clients\") / F.col(\"total_clients\"))\n",
" )\n",
" \n",
" ret_df = retention_by_branch.toPandas()\n",
" ret_df.fillna(0, inplace=True)\n",
" \n",
" return ret_df"
2018-07-11 01:26:10 +03:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data Prep"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load in cleaned experiment data, generated from [this notebook](https://github.com/mozilla/taar/blob/master/analysis/TAARExperimentV2Analysis.ipynb). Filter to clients that loaded the discopane (this is for the control group, since we have already cross referenced the TAAR logs. Clients who did not load the discopane never saw the control, serving as noise in the experiment."
2018-07-11 01:26:10 +03:00
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"S3_PATH = \"s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/cleaned_data/\"\n",
"clean_data = sqlContext.read.parquet(S3_PATH).filter(\"discopane_loaded = true\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------+-------------------------+\n",
"| branch|count(DISTINCT client_id)|\n",
"+-------------+-------------------------+\n",
"|ensemble-taar| 108381|\n",
"| control| 139913|\n",
"| linear-taar| 108071|\n",
"+-------------+-------------------------+\n",
"\n"
]
2018-07-11 01:26:10 +03:00
}
],
"source": [
"clean_data.groupby(\"branch\").agg(F.countDistinct(\"client_id\")).show()"
2018-07-11 01:26:10 +03:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Grab the min and max submission dates for filtering `main_summary`."
]
},
{
"cell_type": "code",
"execution_count": 9,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20180312 20180423\n"
]
}
],
2018-07-11 01:26:10 +03:00
"source": [
"min_date = clean_data.select(F.min('submission_date_s3').alias('min_d')).collect()[0].min_d\n",
"max_date = clean_data.select(F.max('submission_date_s3').alias('max_d')).collect()[0].max_d\n",
"print min_date, max_date"
2018-07-11 01:26:10 +03:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load in `main_summary`, filtered to the min date of the experiment, and 42 days beyond its compleition to allow for 6-week Retention Analysis. We then join `main_summary` with the experiment data."
]
},
{
"cell_type": "code",
"execution_count": 10,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [],
"source": [
"ms = (\n",
" sqlContext.read.option(\"mergeSchema\", True)\n",
" .parquet(\"s3://telemetry-parquet/main_summary/v4\")\n",
" .filter(\"submission_date_s3 >= '{}'\".format(min_date))\n",
" .filter(\"submission_date_s3 <= '{}'\".format(date_plus_x_days(max_date, 7*N_WEEKS)))\n",
2018-07-11 01:26:10 +03:00
" .filter(\"normalized_channel = 'release'\")\n",
" .filter(\"app_name = 'Firefox'\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [],
"source": [
"# a client's enrollment date is determined by their first appearance in the experiment\n",
"enrollment_dates = (\n",
" clean_data.groupby(\"client_id\", \"branch\")\n",
" .agg(F.min('submission_date_s3')\n",
" .alias(\"enrollment_date\"))\n",
")"
2018-07-11 01:26:10 +03:00
]
},
{
"cell_type": "code",
"execution_count": 12,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [],
"source": [
"# join main_summary to exp data\n",
"joined = enrollment_dates.join(ms.select(\"submission_date_s3\", \"client_id\"), on=\"client_id\", how='left')"
2018-07-11 01:26:10 +03:00
]
},
{
"cell_type": "code",
"execution_count": 13,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [],
"source": [
"# verify join contains same number of distinct clients as the experiment data.\n",
2018-07-11 01:26:10 +03:00
"# this also initializes our cache\n",
"jc = joined.select(\"client_id\").distinct().count()\n",
"cc = clean_data.select(\"client_id\").distinct().count()"
]
},
{
"cell_type": "code",
"execution_count": 14,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
2018-07-11 01:26:10 +03:00
]
},
"execution_count": 14,
2018-07-11 01:26:10 +03:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"jc - cc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate Retention Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Perform 12-week retention analysis based on [this example](https://docs.telemetry.mozilla.org/cookbooks/retention.html). [1-12]-Week Retention are additionally included since we can get them at a low cost. We expand out to 12-week retention to better validate the data since the TAAR branches exhibit suspiciously similar retention values."
2018-07-11 01:26:10 +03:00
]
},
{
"cell_type": "code",
"execution_count": 15,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [],
"source": [
"joined = (\n",
" joined.withColumn(\"period\", get_period(\"enrollment_date\", \"submission_date_s3\"))\n",
" .filter(\"enrollment_date <= '{}'\".format(max_date))\n",
").distinct().cache()"
2018-07-11 01:26:10 +03:00
]
},
{
"cell_type": "code",
"execution_count": 16,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"18640155"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
2018-07-11 01:26:10 +03:00
"source": [
"joined.count()"
2018-07-11 01:26:10 +03:00
]
},
{
"cell_type": "code",
"execution_count": 29,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [],
"source": [
"ret_df = get_retention(joined)"
2018-07-11 01:26:10 +03:00
]
},
{
"cell_type": "code",
"execution_count": 31,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [],
"source": [
"ret_df.to_csv(\"taar_v2_retention.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Write to s3 since this job is quite expensive and should only be run once."
]
},
{
"cell_type": "code",
"execution_count": 32,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Completed 1.7 KiB/1.7 KiB (22.2 KiB/s) with 1 file(s) remaining\r",
2018-07-11 01:26:10 +03:00
"upload: ./taar_v2_retention.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/taar_v2_retention.csv\n"
]
}
],
"source": [
"%%bash\n",
"aws s3 cp taar_v2_retention.csv s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load processed Retention Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This section loads the data generated above without having to the re-run the entire notebook."
]
},
{
"cell_type": "code",
"execution_count": 33,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Completed 1.7 KiB/1.7 KiB (29.9 KiB/s) with 1 file(s) remaining\r",
2018-07-11 01:26:10 +03:00
"download: s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/taar_v2_retention.csv to ./taar_v2_retention.csv\n"
]
}
],
"source": [
"%%bash \n",
"aws s3 cp s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/taar_v2_retention.csv ."
]
},
{
"cell_type": "code",
"execution_count": 36,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [],
"source": [
"ret_df = pd.read_csv(\"taar_v2_retention.csv\")\n",
"ret_df.fillna(0, inplace=True)"
2018-07-11 01:26:10 +03:00
]
},
{
"cell_type": "code",
"execution_count": 37,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA/QAAAInCAYAAAA2+BJXAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAIABJREFUeJzs3Xd8FVX+//HXTS+QAIHQO+EQOgREURQBQUFERbCwggqu\nBXe/dlAUG4irrv5UbOi6FEUFRETEAtIFRIoILhwQCIQiSC8Jqff3x9zEJCaQQHJvyvv5ePAImTlz\n5jP3Hnb9zOfMGZfb7UZEREREREREShc/XwcgIiIiIiIiIoWnhF5ERERERESkFFJCLyIiIiIiIlIK\nKaEXERERERERKYWU0IuIiIiIiIiUQkroRUREREREREohJfQiIiIiIiIipZASehEREREREZFSSAm9\niIiIiIiISCmkhF5ERMRLjDFjjDEZxpgIX8dS0hhjlhljvvPBeYd6vpPW3j53aeL5ftb6Og4REckp\nwNcBiIhI6WGMCQceBS7w/KkM3GatnVyIPq4AbvIcHwvsstY2yqOdAYYCVwCNgZPAWuApa+2as5xj\nAPApcJ219otc+9YDrYDLrbWLc+3b5YnnkoJeTyG5PX8KzRgzFHgv26Z04HfgO+AJa+2+c+y3D9De\nWvvcuRxfyHO1AG4A/mOt3Z1r9zl/NkWg2M9rjPkQuCXbpnRgL7AMeNZaa4s7hvPkq+9GRETOQBV6\nEREpjKrAk0Az4GfO7T/yb8FJ6I8Ce87QbhhOQv8T8CDwb6ApsNIY0+0s51jm+ZkjMTfGVARaAKnA\nxbn21QHqAEsLdBW+4QYeA/4G3AV8AwwBFhpjAs+xz6txvlNvaAk8BdTLY9/lQG8vxeELbuAUMAjn\n+xsKTAJ6AsuNMdV8GJuIiJRSqtCLiEhh7AVqWGsPGGPicJLtwnoMGGatTTfGfImTYOdlKk41PjFz\ngzHmv8Am4GlgQX4nsNbuM8bsIFdCD1wEuIDpeey7BCfp+qHgl+ITX1trf/H8/QNjzFGcGx59gFnn\n0J+ryCIr2LnyvAlkrU3zYhy+kmqt/Tj7BmPMauBznJsZk/I70BgTlv3fgoiICCihFxGRQrDWpgIH\nzrOP3wvYbl0e2w4bY5YClxWgi2XAjcaYYGttsmfbxcBG4GvgjVzt80zojTFDgH8AzYEknKr4I9ba\nvbnaXYRzo+FCnP9/XQU8Zq1deaYgjTENcW5OHAWusNYeLMC1ZbcUeAjnsYTcffcBRgLtcKZ4LwYe\ntdZu9uyfglMxdhtjMjyHpVlrgzz7XcADONXkxsARnORzpLX2eLbz7PZc7/8DXsZ5pGEPMNpaO9XT\nJvORATewzHmiAjfQxVq73BizDEi01vbM1m808ALOzYoIYDPwb2vth9naNAa2AvcDyTiPhNQC1gP3\n5DWO8lHBGPMecD3g77nO+621xzzn+RDoAdS01ua4KWGMWQBUs9a2KuC5stuPc6Mj64aGMWYYMAHo\nAtzqickNVDfGNABGAN2AukAiMB/ne92VRx8XATfjfM+hOOP3TmvtkVzX0Afns2vnOdcm4BVr7bRc\n7VoA44FOwGFPm1fO4bpFRKQIaMq9iIiUNjWAgiS9y4BAnMQj08XAcmAFUMkY0zLbvs7A5uyJjjHm\nKeADnOTmAZyEtRew2BhTIVu7K4BFOAnTaOBxoArOVPh2+QVojIkBlgB/AF3PIZkHaOj5mTtBuw2Y\n7dn+KDAGJ9Fe6nm8AOBNnJsJGfw5FXxwtm4+AJ7HuRHwD2CiZ//Xxpjs/w3hBgzwCU7C+CBwDJjk\nuUaAhZ7zATzjOdetgM3WR/b4w3A+m5txKtcPA8eBycaYe/L4HIbgJPVv4TxC0Bj4LFec+XEBb3uO\nGQ1M8VznjGxtpgDVcNZ0yB5nLeBSz/6znscYE+X5U90Y0xnnBsgBYG62dpmfxbtAE5wbRS96tnUC\nOgIf4Xwn7+CMye+NMcF59PEWzloVoz3X2A94Pdc1DAO+xLlp8jzODYP1nn6zq4pzM2wNzr8HC7xk\njOlegGsXEZFioAq9iIiUGsaYLjgVx2cL0HwZTqJ2CbDEGOOPkwz911q73Riz37Nvoyc5bwX8J9u5\nGuEkho9aa/+dbfssnMX57gZe9lSx3wa+tdZek63dBOB/wHM4z6nnvpbmwDwgHrgqe8X7LCoZY6KA\nEJybEKNwns3+KlvfFXFuPrxlrf1Htu2TcZKwkcB91tqVxpitwKV5TAXvipMkD7DWfpZt+xLPua4n\nZ8LbDOhsrf3R024msAu4HXjc85kvA+4F5llrl5/lOu8BYoAbrbUzPH2+g3ND5nljzERrbVK29rWB\nJtbak5622zzx9cBZOPBsTgE9rLUZnuP3AGONMVdaa7/B+a5+x7kRkb2/QZ6fUwtwjkicmzfZJQA9\nc1fMPfZba3vk2jbLWvtp9g3GmLk44/1anMUgs/vdWtsnW9sg4B5jzF3W2kRjTCXgVc/x3T2zcPJT\nG7g5s2rveQQmAWcGx/dnOE5ERIqJKvQiIlIqeBYNmwpsA146W3tr7SbgEH8+K98WCMNJCPH8zFwY\nrzPONOtl2broj1Pl/CxbVTUK2Adsx1nEDaAD0AiYmqtdOE5Vumse4bX17NuKk8wVNJl34cwE+AMn\nkfoUpxJ+jbV2f7Z2VwIVgU9yxZSOs+7B5ZzdDTif36JcfazGefQgdx+/ZCbzAJ54tuJ8NufiKmBP\nZjLv6TMdp7ocgTMdPbupmcm8x1Kcz6sg53cD72Ym8x5verb39pw7A2f8XWuMCc3W7hZgSR6r9ufl\nJNAd5yZDT5ybQkk4Mx5yx+nGmTKfQ7bHRzDGBBpjqgBbgBNA+7yuK9e2pThjPXNhwitx/l2MO0sy\nD3As+xR8a20Kzng61+9YRETOkyr0IiJS5DzvWc+e9KTkU4EsaH9hOFXhcJwEuKCLgy3nz8TvYuCA\ntXZHtn3Ds+1zkzOhb4KT+GzPo183zvTvzHaQd4XWjfN8eri19pRnmwuYg/OM+VW5qsxn48ZZ3X47\nUAmnMtoZSMnVronnPHmt2O/Gefb5bGKAKP5aUc7sIzrXtl15tDuC82rDc1EfJ1HNbRPOtdXPtT0h\nj3NTiPP/lv0Xa+0JzyyOBtk2T8ZZr6Afzs2SFkAb4I4CniPNWrsw+wZjzNc4Nz7G4jxekF187g48\nNxNG4cyeqMWfixq6cWYA5Ha2zyUzGf/17OH/pa/M/mLy2C4iIl6ghF5ERIrDazgJR6ZFOIt4FZrn\ndWyf47zyrKen8l5Qy4CrjTGtcBLf7NO8lwMvGmNq4iT0e6218dn2++EsVHZlPn2fyNYOnOe3N+bT\nNnvS7saZCj4Ep7r7nzyPyN+qzFXujTFf4FzHVGOMsdaezhaTGydBzOu5/LNVYjP72IvznHteK+Hn\nXhwxPZ9+vLWKfrGf31q7wRizHmfa/Seen0nAZ2c88Mx9JmQ+9pDH7rxu9ryNM25eBVbi3FjKHFN5\nzbwsys/F19+xiIjkooReRESKw7/IuUjYOVXnPc+nT8GZ3j3AWrvsLIfkltm+C07S/mq2fWtwVkW/\nHOfZ+q9yHso2PBX6XIl+bts8P49ba/N9lV4uD3h+vmuMOZZ9WnlheF799zjO8933ApmrjWfGdMBa\nu+gs3eT5GjlPH12AZQWYil1Q+Z0rLzvJu/Ib6+lnZ5FE9KcYsr3hwLMOQXX+WiWfDLzgWYH/ZmC2\ntfYE5ycAqHDWVo7+wH+stSOyxRpK3tX5gtiGk5C3JO9ZFiIiUoLpGXoRESly1trN1toF2f4U9NVh\nuY0HBuC8fuyLczh+NU7SPghnenJWhd7z/O86nGn3YeScbg9O1dUNPJVXx55nl8F5XVs88Ijn0YDc\n7armcbgbGIYz8+AjY0x+swDOylr7Pc4ifQ8YYzJv1H+N87z2KM9igGeK6RTgn0fs04Ag4Ik8jg/w\nPFZRWKdwksdKBWg7F6hjjOmf/bw4K7sfJ+/HCc6VC7gr12d1n2f73Fxtp+L899MbOK+N+5DzYIxp\nhvOIxM8FPCSdv/732/2
2018-07-11 01:26:10 +03:00
"text/plain": [
"<matplotlib.figure.Figure at 0x7f8a591d7a10>"
2018-07-11 01:26:10 +03:00
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.rcParams['figure.figsize'] = (12, 6)\n",
"fig, ax = plt.subplots()\n",
"for group, data in ret_df.groupby(\"branch\"):\n",
2018-07-11 01:26:10 +03:00
" (data.sort_values(\"period\")\n",
" .plot(x='period', \n",
" y='retention', \n",
" ax=ax, \n",
" label=group))\n",
"plt.ylabel(\"Retention\")\n",
"plt.xlabel(\"Week (period)\")\n",
"plt.title(\"1-12 Week Retention by Branch\")\n",
2018-07-11 01:26:10 +03:00
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 38,
2018-07-11 01:26:10 +03:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>branch</th>\n",
" <th>period</th>\n",
" <th>n_week_clients</th>\n",
" <th>total_clients</th>\n",
" <th>retention</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>control</td>\n",
" <td>6.0</td>\n",
" <td>83578</td>\n",
" <td>139913</td>\n",
" <td>0.597357</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
2018-07-11 01:26:10 +03:00
" <td>ensemble-taar</td>\n",
" <td>6.0</td>\n",
" <td>65909</td>\n",
" <td>108381</td>\n",
" <td>0.608123</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>linear-taar</td>\n",
" <td>6.0</td>\n",
" <td>65698</td>\n",
" <td>108071</td>\n",
" <td>0.607915</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>control</td>\n",
" <td>7.0</td>\n",
" <td>80556</td>\n",
" <td>139913</td>\n",
" <td>0.575758</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>ensemble-taar</td>\n",
" <td>7.0</td>\n",
" <td>63699</td>\n",
" <td>108381</td>\n",
" <td>0.587732</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>linear-taar</td>\n",
" <td>7.0</td>\n",
" <td>63363</td>\n",
" <td>108071</td>\n",
" <td>0.586309</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>control</td>\n",
" <td>8.0</td>\n",
" <td>77789</td>\n",
" <td>139913</td>\n",
" <td>0.555981</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>ensemble-taar</td>\n",
" <td>8.0</td>\n",
" <td>61393</td>\n",
" <td>108381</td>\n",
" <td>0.566455</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>linear-taar</td>\n",
" <td>8.0</td>\n",
" <td>61164</td>\n",
" <td>108071</td>\n",
" <td>0.565961</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>control</td>\n",
" <td>9.0</td>\n",
" <td>75070</td>\n",
" <td>139913</td>\n",
" <td>0.536548</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>ensemble-taar</td>\n",
" <td>9.0</td>\n",
" <td>59350</td>\n",
" <td>108381</td>\n",
" <td>0.547605</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>linear-taar</td>\n",
" <td>9.0</td>\n",
" <td>59055</td>\n",
" <td>108071</td>\n",
" <td>0.546446</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>control</td>\n",
" <td>10.0</td>\n",
" <td>72598</td>\n",
" <td>139913</td>\n",
" <td>0.518880</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>ensemble-taar</td>\n",
" <td>10.0</td>\n",
" <td>57404</td>\n",
" <td>108381</td>\n",
" <td>0.529650</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>linear-taar</td>\n",
" <td>10.0</td>\n",
" <td>57254</td>\n",
" <td>108071</td>\n",
" <td>0.529781</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>control</td>\n",
" <td>11.0</td>\n",
" <td>70461</td>\n",
" <td>139913</td>\n",
" <td>0.503606</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ensemble-taar</td>\n",
" <td>11.0</td>\n",
" <td>55793</td>\n",
" <td>108381</td>\n",
" <td>0.514786</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>linear-taar</td>\n",
" <td>11.0</td>\n",
" <td>55486</td>\n",
" <td>108071</td>\n",
" <td>0.513422</td>\n",
2018-07-11 01:26:10 +03:00
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>control</td>\n",
" <td>12.0</td>\n",
" <td>67862</td>\n",
" <td>139913</td>\n",
" <td>0.485030</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>ensemble-taar</td>\n",
" <td>12.0</td>\n",
" <td>54080</td>\n",
" <td>108381</td>\n",
" <td>0.498980</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
2018-07-11 01:26:10 +03:00
" <td>linear-taar</td>\n",
" <td>12.0</td>\n",
" <td>53843</td>\n",
" <td>108071</td>\n",
" <td>0.498219</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" branch period n_week_clients total_clients retention\n",
"20 control 6.0 83578 139913 0.597357\n",
"9 ensemble-taar 6.0 65909 108381 0.608123\n",
"38 linear-taar 6.0 65698 108071 0.607915\n",
"17 control 7.0 80556 139913 0.575758\n",
"5 ensemble-taar 7.0 63699 108381 0.587732\n",
"32 linear-taar 7.0 63363 108071 0.586309\n",
"21 control 8.0 77789 139913 0.555981\n",
"12 ensemble-taar 8.0 61393 108381 0.566455\n",
"34 linear-taar 8.0 61164 108071 0.565961\n",
"14 control 9.0 75070 139913 0.536548\n",
"11 ensemble-taar 9.0 59350 108381 0.547605\n",
"31 linear-taar 9.0 59055 108071 0.546446\n",
"25 control 10.0 72598 139913 0.518880\n",
"10 ensemble-taar 10.0 57404 108381 0.529650\n",
"36 linear-taar 10.0 57254 108071 0.529781\n",
"22 control 11.0 70461 139913 0.503606\n",
"1 ensemble-taar 11.0 55793 108381 0.514786\n",
"37 linear-taar 11.0 55486 108071 0.513422\n",
"19 control 12.0 67862 139913 0.485030\n",
"8 ensemble-taar 12.0 54080 108381 0.498980\n",
"28 linear-taar 12.0 53843 108071 0.498219"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret_df[ret_df.period >= 6.0].sort_values([\"period\", \"branch\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Investigate nearly identical retention lines for TAAR Branches\n",
"\n",
"Let's look at 6-week retention over time by each enrollment date"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"day_over_day_retention = []\n",
"for i in range(40):\n",
" d = date_plus_x_days(\"20180312\", i)\n",
" joinedx = joined.filter(\"enrollment_date = '{}'\".format(d))\n",
" ret_dfx = get_retention(joinedx)\n",
" week6 = ret_dfx[ret_dfx.period == 6.0]\n",
" for b, data in week6.groupby(\"branch\"):\n",
" x = {\n",
" 'branch': b,\n",
" 'ret': data['retention'].values[0],\n",
" 'date': d\n",
" }\n",
" day_over_day_retention.append(x)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/mnt/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA/QAAAJYCAYAAADMjK1DAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAIABJREFUeJzs3Xd8VFX+//HXpJICAUJvoR96Z1GwoLKyylpWRV1ZxS7W\ntYOi2EBd9au7it11QfbnKhasWNcGq6KACCocBYWELimQXuf3x7kTJpMJKSQE8P18PPJIcsu555a5\ncz+nXZ/f70dEREREREREDiwRjZ0BEREREREREak9BfQiIiIiIiIiByAF9CIiIiIiIiIHIAX0IiIi\nIiIiIgcgBfQiIiIiIiIiByAF9CIiIiIiIiIHIAX0IiIiIiIiIgcgBfQiIiIiIiIiByAF9CIiIiIi\nIiIHIAX0IrLfMsb82xiT2dj52B8ZYzYaY55qhO3ONMaUGWOa7ett7w9Cj7sx5hjveIxuzHzJwak+\n7oHGmAhjzA/GmBvqK19SPWPMhd69oUNj50XAGHOSMSbTGJPU2HkRqW9RjZ0BEalfxphhwO3AGKAJ\n8DPwpLV29h7WuQH4GzDUWvttyLxMIAnoZq3dEDQ9FtgFvGSt/Ut974fH7/3UmjHmLmB60KRiYBPw\nOnCbtXZXHdOdBLS01j5Sl/Vrua0xwDjg/6y1OSGzy6jjsdlLdT4ntWGMWQxUFSR/Z60d1NB5qEK4\nfW+M81Artb1ujTEbgUAg4sd91tOAz4FnrLVL9yIv04GV1to365pGLbYVeh/wA1uBpcAsa+1XDZ2H\nvVQfn7ezgbbAY6EzjDEjcN8XhwKxuO+Lx621j1eXaENeI3sj6N6xxlrbL8z8PwALvX9Ptta+0UBZ\n2Sf3yuoYYx4Ergb+n7X27MbOT2Ox1r5ujNkC3ADc0tj5EalPqqEXOYgYY47FPUy1Au4ErgLeBDpV\ns+pi7/dhIen1xwXzxbgCgmAjgWhg0d7lukH5gYuAvwBX4h7i/wq8thdpBtLaFw4DZgDhasN7AJfu\no3w0Bj+wAZiEO+bBP9MaMV8Hqtpet37c52USLiCcBnwCnAR8ZYz5217k5RbghL1Yv7aC7wOTgUeB\nIcCn3j3uYHc9LpjLDZ5ojDkO+B/QHLgDF/S9DXSuYboNeY3sDT+QDxhjzJAw8yd58xs62H4WiLPW\nbm7g7VTJGOMDzgB+AU4yxsQ1Vl72E08Bl3kVEiIHDdXQixwkjDFNgbnAm9baibVcfSlQgAsgHw2a\nPgbY4c0/DHg+aN5huAei/9U1z/vIS0G18U8ZY14CTjHGDLHWrmjMjNWAr6oZ1trifZmRRpJprf1P\nQyXuPezGWGsLG2obB7iNocffGHMj8AJwvTHmR2vtPxsna7UWfB/AGPMWsAI4Dfi+qpWMMU2stQX7\nIH8NwhgzEuhPSOGf1+x4DrDAWnvmXmxif71GLJAI/Bl3ngN5i8MVOLwNnNqgGbDWDxQ15DZqYBzQ\nDjgK+C9wMtBg99SqGGOiAKy1Jft62yFeAv4P+BPuGhU5KCigFzl4TALa4DUvNcbEA/neQ8UeWWuL\njTFfU7kWfgzwBfA1EFpIMAbIstZ+F5jgBUjXABfgapAzgQXAtNAm7saYCbganaFAKfApcKO1ds2e\n8mqMGQ68B3wDnFCHh+1FwCle/ioE9MaYybhazH64Gpx3gRsCNSzGmEXefmOMKfNWW2ut7e1Ni8Ud\n/7NwrSK24QpBZgQCcGNMJK7Fw99xLSPuBHoCPwHXWGs/9JYLNBX2AxuNMXh/d7bWbvaauy601l4c\nlP8euK4TR+Oaz34L3GGtfS9omWOAD3APswOAS4CWuIKZi621v9TwOLY1xjwLHAsUAvNw57nI285i\noIm1dkToisaYdcAP1tq9rqU1xswEbga6447libjj9DJwRSBYDznu3+CuvZ64B7uFxphE4C5cgNca\nV6P1pLX273XI02IgHjgPeAQYjuvuMdVau8AYcxTuPA0A1gOXW2s/DkmjIzAL+AOuBvUn4AFr7dyg\nZWp0Lqu7bmvDWltgjDkb13piOlAerBljpuKOfx8gDhcoz7LWvubND5wDP3ChMeZCb9VnAtdxTfa7\nnmzzfpcHGEHH83RgGHAO0M4bLyIOd50dC3TF3bP+hzun34VJo0afL2PMobhWOIfgWjytBZ621j4a\nslwnXJP5o3D3pmettTVpqXKyt/znIdPPxrXkutlLPwHIq8n3RXX25hrxlqmve8d/gHOBqUHTTgZi\ncIHdaWHSHw7cjeuC4AO+BG621n7tzR+F+06cFKYgYwKuRdwfrLXve9f3U0CnoO+QjcBXuPvQA8BA\n3L1hhrX2+ZD0hgAPAyNwBeuPAenAk8FpVmMSrnvLZ8aYj7z/y/PtXVcbcPfu+0O2Pxh3r7zQWvus\nN60l7j57Eu55YwPwhLX2waD1DLAa910aB1yGa/XR3xiTimuhcxzu/huBqzC4xVpboXLAGNPa2/8J\nuPvGq96+fwWcaa2dH7TsANz9+whvm6twXeveDU7TWrvJGLPGy78CejloqMm9yMHjGFwfxs7eF1YO\nsMsY81gNm5ctBjoaY7oETRuDexD8HPdlHNz0ezTuwSbYs7iHoU9xX+ZzcA/F7xhjyu83xphzgTdw\nAf+NwEzcg80i7wEjLGPMIcCHuIesCXWsOevm/a4w0JQx5jYv/6txhRJ/B8bjmuUmeovdAazE9cEN\nNAW/zlvfh6v1uRr34HGFt4/XAf8vTD7GAv/w5t2ACwBfCRqwZz7wovf3Fd62zgYyvGkVHryNMe1w\n5+lo3EPQdC/Nt40xfwyz/VtwD0p/835GA8+FWS4cHy5gjsA9LL/r7XdwH915wFBjTIWg0Qtgunnz\nqxNpjEkO8xMftEygn+oruEKMqV7ezid8P8nxuP193stzatC5uxJ4C3f+fwIerGOzYT8uWHoDF8jd\ngHsgfcEYczrunL+OK1RoBrwU3BTWO5dfAUfizuVfcX2b/2WMuSzM9qo7l1Vet3Vhrc328p9ijOkV\nNOsqYJmXn5tw4zy84nUFwlpb6m27BPiY3V0onqnjftdG4Nppbdw4I08CebjALtTtwO+B+3Cfo2Jc\n8DHB2+9rgPuBwcAnxpg2YdKo9vPl9eX+BOiNqzW81vt/QkhaMcD7wBbcefsMuMEYc0EN9vtQXEBX\nFjL9GNy9pIcx5kcgG9hpjHnEGBNTg3T3qK7XiKc+7h3gPuOdjTHBXcn+jDuWGaELG2MG4b67+uK+\nx+7CFfx+6l0zWGuX4ILY08Ns7wxc4P2h93+4PvR+wOCCyXdx53wnMDf4OBljOuM+I71x349/x3UX\nuSxMmmEZY5rgCiwDBQX/AX5vjEkOLGOt3Yj7Hq9qf4pw32d434OLcQUh/8R9Ly0BHjDG3B1m/Utx\nhfuP4e6BO4Fk3DPBf3FdQe7EjcPwgTGmT1DeI73jcwrwNHAr7tw/Hbr/XsHH5978u710i4C3vM9Y\nqGVUrrwQOaCphl7k4NELV8PzOu5LbxouaLwK1w9+UjXrL8YFaocBzxtj2uJqPRfjarL9uIfSd40x\n/YAW7O57jzFmLO6BY6K19pWg6Z/hgqVTgJeN6xrwd+Axa+2VQcs9h2smOQ33oFCBMeYIXO3Hf4HT\na9F0L9kYE41rfjkOV2O2JSTv3XEPDDdaa/8vaPprwHJgCq6W8EPjBtWJD9MUfDIuEBkTPNCWMWY1\n8IgxZkTIIFF9gD7W2lRvucW4B40zgKestauMMStwD1qv1aA2ZjouiDwkqDbpn7iaigdxgWqwKOB3\nXpCFMWYX7sGst7X2x2q2BWCttYGHwMeNMbnARcaY/7PWrsYVRvwdF7DNCFrvL7iCp9drsI2BwK8h\n0/y4biFXBU3zAUustYGg70kvyLoAd16D9QL6WWvXBiYYY04FDsed/weC9ukV4FpjzKOB81QLHXGf\nhcDD8Me42sj/hzvu33jTf8J9PoIfvO/F1QAPsdbuDNqn+cCdxpinQ7pc7PFcVnPd1lWgVroHrvAD\noHtw9wVjzKO4ViLX4II
"text/plain": [
"<matplotlib.figure.Figure at 0x7f8a59144450>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"day_over_day_retention_df = pd.DataFrame(day_over_day_retention)\n",
"day_over_day_retention_df.date = (\n",
" pd.to_datetime(day_over_day_retention_df.date, format='%Y%m%d'))\n",
"\n",
"plt.rcParams['figure.figsize'] = (12, 6)\n",
"fig, ax = plt.subplots()\n",
"for group, data in day_over_day_retention_df.groupby(\"branch\"):\n",
" data['ma'] = data.ret.rolling(window=6).mean()\n",
" (data.sort_values(\"date\")\n",
" .plot(x='date', \n",
" y='ma', \n",
" ax=ax, \n",
" label=group))\n",
"plt.ylabel(\"Retention\")\n",
"plt.xlabel(\"Enrollment Date\")\n",
"plt.title(\"6 Week Retention by Enrollment Date, Branch (6-Day Moving Average)\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We see increased variability with time, which is most certainly due to the study being front-loaded with participants. Looking at enrollment:"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7f95ac07eb10>"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA/QAAAIaCAYAAACOHAYkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAIABJREFUeJzs3Xt0XXWd///naZLm0ja9pKUXBQuD82lBECjXURFF4Svi\nCD9EUUYBYcZBQAYvP1AQu7iMICNfR6szDDKO6DBrOSDyE5UqOoBApYIoRdqPA23l0rSl17RJmvvv\nj312ehLaNCdNcs5On4+1unJ69ufs/Tn5dHWt1/589ued6+npQZIkSZIkZcu4UndAkiRJkiQVz0Av\nSZIkSVIGGeglSZIkScogA70kSZIkSRlkoJckSZIkKYMM9JIkSZIkZZCBXpIkSZKkDDLQS5IkSZKU\nQQZ6SZIkSZIyyEAvSZIkSVIGVe7Nh0MIVwH/CHwtxvjpgvevAy4CpgCPARfHGJ8vOF4N3Ap8CKgG\nFgOfjDGuL2gzFVgEnA50A/cAl8cYmwva7A/8K3ASsA24E7gqxthd0Obw/HmOAdYDi2KMt+zN95Yk\nSZIkqdSGPEMfQjgG+DvgD/3evxK4NH/sWKAZWBxCGF/Q7GvAe4GzgBOBOSSBvdBdwHzg5HzbE4Hb\nCq4zDvgpyU2J44HzgPOB6wraTCK5WbAKOAr4HLAwhHDRUL+3JEmSJEnlYEiBPoQwEfg+ySz8ln6H\nLweujzHeH2N8FvgYSWA/I//ZeuDjwBUxxodjjE8DFwBvCSEcm28zHzgVuDDG+GSM8XHgMuCcEMKs\n/HVOBeYB58YYl8UYFwNfBC4JIaQrD/4GqMqfZ3mM8QfA14He1QSSJEmSJGXRUGfovwn8OMb4q8I3\nQwgHArOAX6bvxRibgCeAE/JvHU0yq17YJgIvFrQ5HticD/upB4Ee4LiCNstijBsK2iwGJgOHFrR5\nJMbY2a9NCCFMLuYLS5IkSZJUTop+hj6EcA5wBEkw728WSehe1+/9dfljADOB9nzQ312bWSTPu/eK\nMXaFEDb1a7Or66TH/pD/uXKANlt38R36eOqppxpIVgOsBnbsqb0kSZIkSXupBpgLLF6wYMHG3TUq\nKtCHEF5P8vz7u2KMHXvVvew4FfjPUndCkiRJkrTPOZdkf7ldKnaGfgEwA/hdCCGXf68CODGEcCnJ\nM+05kln4wtnzmUC6fH4tMD6EUN9vln5m/ljaZr/CC4cQKoBp/doc069/MwuOpT9n7qHNnqwGmD59\nOhMnThzkR1QO2traaGxsZPbs2VRXV5e6Oxokxy27HLvscuyyy7HLJsctuxy77Mra2KX9JZ9Hd6fY\nQP8gcFi/9/4DWA7cFGNcGUJYS7Iz/TPQuwnecSTP3QM8BXTm29ybbxOAA4Al+TZLgCkhhCMLnqM/\nmeRmwRMFbb4QQphe8Bz9KSTL6J8raHNDCKEixthV0CbGGPe43D5vB8DEiRNpaGgY5EdUDlpaWmhs\nbGTKlCnU1dWVujsaJMctuxy77HLsssuxyybHLbscu+zK2til/WUPj30XFejzNeCfK3wvhNAMbIwx\nLs+/9TXgmhDC8yR3E64HXgbuy5+jKYRwB3BrCGEzSf34rwOPxRiX5tusCCEsBm4PIVwMjAe+AfxX\njDGdWf95vi/fy5fKm52/1qKCxwHuAq4F/j2EcDPJzYhPkezEL0mSJElSZg25Dn2BnsK/xBi/QhK+\nbyOZTa8F3hNjbC9odgVwP3A38BCwhqQmfaGPACtIVgXcDzwCfKLgOt3A6UAX8DhwJ8lqgS8VtGki\nmZGfCzwJ3AIsjDHeMeRvK0mSJElSGSh6l/v+Yozv3MV7C4GFA3ymjaSu/GUDtNlCUkd+oGu/RBLq\nB2rzLPD2gdpIkiRJkpQ1wzFDL0mSJEmSRpmBXpIkSZKkDDLQS5IkSZKUQQZ6SZIkSZIyyEAvSZIk\nSVIGGeglSZIkScogA70kSZIkSRlkoJckSZIkKYMM9GNMd3cPTc3tpe6GJEmSpH4++tGP8uUvf7nU\n3ejji1/8Iscddxzz589nxYoVpe4OAJ///Oe59NJLB9X2lVdeYd68eWXT99FmoB9jvnT7Ej668AGe\nef7VUndFkiRJUhl75JFH+NGPfsS//du/8eijj/KXf/mXo3r93YXxq6++mptuumlQ55gzZw6PPfbY\nqPcdYN68efzyl78c9esWqizp1TWsdrR18vs/JUH+6fgqhx88o8Q9kiRJkjSSuru7yeVy5HK5oj/7\n4osvMmPGDN785jePQM8G1tHRQU9Pzy77PXHixEGfJ5fL0dDQMJxdyxRn6MeQxo3Nva83bm0tYU8k\nSZK0L2pu7SD+edOo/mlu7Siqjx/96Ee54YYbuOWWWzjuuON461vfyqJFi3qP72rWeNu2bcybN4+n\nnnoKgCeffJJ58+bx6KOPcuaZZ/LmN7+Z888/n02bNvHwww9z2mmnsWDBAj7zmc/Q1tbW5/pdXV1c\nf/31HH300Rx//PH88z//c5/j7e3t3HzzzZx44okceeSRfOhDH2Lp0qW9x++9916OOeYYfvWrX/He\n976Xww8/nMbGxl1+16VLl3L22Wdz2GGH8da3vpWvfvWrdHd3A8my9htuuIHGxkbmzZvHySefvMtz\npNd78MEHOfXUUzn88MO58MILWbt2bW+bl156iU9+8pO85S1v4cgjj+QDH/gAS5Ys6XOed77znXzr\nW9/iyiuv5Oijj+baa6/lXe96FwBnnHEG8+bN42Mf+xgAV111VZ8l9z09Pdx+++2ccsopHHbYYbzz\nne/ktttu2+V4LV26lHnz5vHwww/z13/91xx++OF86EMf4oUXXug935YtW/jMZz7DiSeeyBFHHMH7\n3vc+fvKTn/Tp757+nbzzne8kl8txySWX9Pn9rVixgo997GMcddRRLFiwgLPOOos//vGPu/zdDgdn\n6MeQxg2FgX5HCXsiSZKkfU1zawcX3viLogP23ppQW8UdV7+bCbVVg/7Mfffdx/nnn89///d/87vf\n/Y7Pf/7zLFiwgBNOOAFg0LPdixYt4ktf+hI1NTVcfvnlXH755VRXV3PrrbfS3NzMJZdcwve+9z0u\nuuii3s/88Ic/5Oyzz+buu+/m2Wef5Ytf/CJz5szh7LPPBuC6665j5cqVfO1rX2PGjBk8+OCD/O3f\n/i0//vGPOeCAAwBobW3l29/+NjfeeCNTpkzZ5Qz1unXr+MQnPsFZZ53FV77yFVauXMk111xDdXU1\nl156KVdffTX7778///3f/80999wz4HdubW3ltttu45ZbbqGyspKFCxfy6U9/mrvuuguA5uZmTjrp\nJD7zmc9QVVXFj370Iy6++GIeeOABZs2a1Xue73znO1xyySW9Yf0jH/kIZ599Nt/97nc5+OCDqaqq\n2uXv/5/+6Z+4++67+cIXvsBRRx3Fpk2beP7553uP76rvt9xyC1dffTXTp0/nq1/9Kv/wD//Qu4y/\nra2NN73pTfzd3/0dEyZM4OGHH+bKK6/kgAMO4LDDDus9x0D/Tu6++27+6q/+iptuuom3ve1tjBuX\nzJV/7nOf45BDDuG6665j3LhxLF++nMrKkYvdBvoxxEAvSZIk7VkIgUsuuQSAAw44gP/8z/9kyZIl\nvYG+p6dnj+fI5XJcccUVHHHEEQCcddZZ/N//+3958MEHed3rXgfAqaeeyhNPPNEn0M+ZM4fPf/7z\nAMydO5cYI9/97nc5++yzWbNmDffeey8PPfQQM2Ykj89ecMEFPPLII9xzzz1cccUVQDLLv3DhwgGf\nG7/rrruYPXs211xzDQAHHngg69at46tf/SqXXnopEydOZMKECYwbN45p06YN+F27urq49tpre8Pu\nTTfdxGmnncayZcs47LDDmDdvHvPmzett/6lPfYpf/OIX/PKXv+Tcc8/tff+EE07g/PPP7/17GoIn\nT56822Xzzc3NfO973+NLX/oS73//+wHYf//9+zwmsKvxuvTSS3vHM13x8Nvf/pZDDz2UmTNncsEF\nF/S2Pffcc/n1r3/Nz37
"text/plain": [
"<matplotlib.figure.Figure at 0x7f95ad993e10>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"(joined.groupby(\"enrollment_date\")\n",
" .agg(F.countDistinct(\"client_id\").alias(\"number of participants\"))\n",
" .sort(\"enrollment_date\")\n",
" .toPandas()\n",
" .plot(x='enrollment_date'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"we see most of our clients are enrolled before 2018-03-22. This is why the lines are so smooth for the first ~9 datapoints in the previous retention chart\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Breaking retention down into shorter segments shows that there are indeed differences between the taar branches, however they track each other rather closely and are consistently higher than the control. This is evidence that the similarities we see in the lines for the first plot are in fact the true retention values and not a data-handling issue."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Results"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>branch</th>\n",
" <th>period</th>\n",
" <th>n_week_clients</th>\n",
" <th>total_clients</th>\n",
" <th>retention</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>ensemble-taar</td>\n",
2018-07-11 01:26:10 +03:00
" <td>6.0</td>\n",
" <td>65909</td>\n",
" <td>108381</td>\n",
" <td>0.608123</td>\n",
2018-07-11 01:26:10 +03:00
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
2018-07-11 01:26:10 +03:00
" <td>control</td>\n",
" <td>6.0</td>\n",
" <td>83578</td>\n",
" <td>139913</td>\n",
" <td>0.597357</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>linear-taar</td>\n",
" <td>6.0</td>\n",
" <td>65698</td>\n",
" <td>108071</td>\n",
" <td>0.607915</td>\n",
2018-07-11 01:26:10 +03:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" branch period n_week_clients total_clients retention\n",
"9 ensemble-taar 6.0 65909 108381 0.608123\n",
"20 control 6.0 83578 139913 0.597357\n",
"38 linear-taar 6.0 65698 108071 0.607915"
2018-07-11 01:26:10 +03:00
]
},
"execution_count": 79,
2018-07-11 01:26:10 +03:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"w6r = ret_df[ret_df.period==6]\n",
"w6r"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"(slinear, nlinear,\n",
" sensemble, nensemble,\n",
" scontrol, ncontrol) = [int(w6r[w6r.branch == b][i].values[0])\n",
" for b in ('linear-taar', 'ensemble-taar', 'control')\n",
" for i in ('n_week_clients', \"total_clients\")]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"def get_effect(g1s, g2s, g1n, g2n):\n",
" \"\"\"\n",
" Extracts the observed difference and p-value \n",
" for two proportions\n",
" \n",
" g1s: the number of successes for group 1\n",
" g2s: the number of successes for group 2\n",
" g1n: total trials for group 1\n",
" g2n: total trials for group 2\n",
" \n",
" returns the effect and p-value as a tuple\n",
" \"\"\"\n",
" # use counts to form a proportion and format\n",
" effect = str(round((g1s*1.0 / g1n) - (g2s*1.0 / g2n), 4) * 100) + '%'\n",
" \n",
" # perform test of proportions\n",
" pval = sm.stats.proportions_ztest(np.array([g1s, g2s]),\n",
" np.array([g1n, g2n]),\n",
" value=0.05)[1]\n",
" return effect, pval"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"linear effect: 1.06%\n",
"pvalue: 4.34802206697e-88\n",
"\n",
"ensemble effect: 1.08%\n",
"pvalue: 2.51189892932e-87\n"
]
}
],
"source": [
"le, lp = get_effect(slinear, scontrol, nlinear, ncontrol) \n",
"print \"linear effect: {}\\npvalue: {}\\n\".format(le, lp)\n",
"\n",
"ee, ep = get_effect(sensemble, scontrol, nensemble, ncontrol) \n",
"print \"ensemble effect: {}\\npvalue: {}\".format(ee, ep)\n"
2018-07-11 01:26:10 +03:00
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}