Make minor cosmetic and consistency improvements to the memberrewards notebook (#263)

This commit is contained in:
Lovkush 2021-04-29 06:20:21 +01:00 коммит произвёл GitHub
Родитель 29b0d75d64
Коммит f3fe1c1ffa
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
1 изменённых файлов: 26 добавлений и 18 удалений

Просмотреть файл

@ -175,25 +175,25 @@
}
],
"source": [
"# Creating some simulated data for our example example\n",
"# Creating some simulated data for our example\n",
"import pandas as pd\n",
"import numpy as np\n",
"num_users = 10000\n",
"num_months = 12\n",
"\n",
"signup_months = np.random.choice(np.arange(1, num_months), num_users) * np.random.randint(0,2, size=num_users)\n",
"signup_months = np.random.choice(np.arange(1, num_months), num_users) * np.random.randint(0,2, size=num_users) # signup_months == 0 means customer did not sign up\n",
"df = pd.DataFrame({\n",
" 'user_id': np.repeat(np.arange(num_users), num_months),\n",
" 'signup_month': np.repeat(signup_months, num_months), # signup month == 0 means customer did not sign up\n",
" 'month': np.tile(np.arange(1, num_months+1), num_users), # months are from 1 to 12\n",
" 'spend': np.random.poisson(500, num_users*num_months) #np.random.beta(a=2, b=5, size=num_users * num_months)*1000 # centered at 500\n",
"})\n",
"# Assigning a treatment value based on the signup month \n",
"df[\"treatment\"] = (1-(df[\"signup_month\"]==0)).astype(bool)\n",
"# Simulating effect of month (monotonically increasing--customers buy the most in December)\n",
"# A customer is in the treatment group if and only if they signed up\n",
"df[\"treatment\"] = df[\"signup_month\"]>0\n",
"# Simulating an effect of month (monotonically decreasing--customers buy less later in the year)\n",
"df[\"spend\"] = df[\"spend\"] - df[\"month\"]*10\n",
"# The treatment effect (simulating a simple treatment effect of 100)\n",
"after_signup = (df[\"signup_month\"] < df[\"month\"]) & (df[\"signup_month\"] !=0)\n",
"# Simulating a simple treatment effect of 100\n",
"after_signup = (df[\"signup_month\"] < df[\"month\"]) & (df[\"treatment\"])\n",
"df.loc[after_signup,\"spend\"] = df[after_signup][\"spend\"] + 100\n",
"df"
]
@ -205,10 +205,10 @@
"### The importance of time\n",
"Time plays a crucial role in modeling this problem. \n",
"\n",
"Rewards signup can affect the future transactions, but not those that happened before it. In fact, the transaction prior to the rewards signup can be assumed to cause the rewards signup decision. Therefore we can split up the variables for each user in terms of \n",
"Rewards signup can affect future transactions, but not those that happened before it. In fact, the transactions prior to the rewards signup can be assumed to cause the rewards signup decision. Therefore, we split up the variables for each user into two groups:\n",
"\n",
"1) Activity prior to the treatment (causes the treatment)\n",
"2) Activity after the treatment (is the outcome of applying treatment)\n",
"1. Activity prior to the treatment (assumed to be a cause of the treatment)\n",
"2. Activity after the treatment (assumed to be the outcome of applying the treatment)\n",
"\n",
"Of course, many important variables that affect signup and total spend are missing (e.g., the type of products bought, the age of a user's account, geography, etc.). So we'll need a node denoting `Unobserved Confounders`. \n",
"\n",
@ -226,7 +226,7 @@
"import dowhy\n",
"\n",
"# Setting the signup month (for ease of analysis)\n",
"i = 6"
"i = 3"
]
},
{
@ -268,8 +268,6 @@
}
],
"source": [
"\n",
"\n",
"causal_graph = \"\"\"digraph {\n",
"treatment[label=\"Program Signup in month i\"];\n",
"pre_spends;\n",
@ -284,10 +282,20 @@
"}\"\"\"\n",
"\n",
"# Post-process the data based on the graph and the month of the treatment (signup)\n",
"df_i_signupmonth = df[df.signup_month.isin([0,i])].groupby([\"user_id\", \"signup_month\", \"treatment\"]).apply(\n",
" lambda x: pd.Series({'pre_spends': np.sum(np.where(x.month < i, x.spend,0))/np.sum(np.where(x.month<i, 1,0)),\n",
" 'post_spends': np.sum(np.where(x.month > i, x.spend,0))/np.sum(np.where(x.month>i, 1,0)) })\n",
").reset_index()\n",
"# For each customer, determine their average monthly spend before and after month i\n",
"df_i_signupmonth = (\n",
" df[df.signup_month.isin([0, i])]\n",
" .groupby([\"user_id\", \"signup_month\", \"treatment\"])\n",
" .apply(\n",
" lambda x: pd.Series(\n",
" {\n",
" \"pre_spends\": x.loc[x.month < i, \"spend\"].mean(),\n",
" \"post_spends\": x.loc[x.month > i, \"spend\"].mean(),\n",
" }\n",
" )\n",
" )\n",
" .reset_index()\n",
")\n",
"print(df_i_signupmonth)\n",
"model = dowhy.CausalModel(data=df_i_signupmonth,\n",
" graph=causal_graph.replace(\"\\n\", \" \"),\n",
@ -498,4 +506,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}