refactoring football
This commit is contained in:
Родитель
b4a6ac1168
Коммит
2eb40b4bea
|
@ -45,12 +45,6 @@
|
|||
"import seaborn as sns\n",
|
||||
"import itertools\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from sklearn.ensemble import GradientBoostingClassifier\n",
|
||||
"from sklearn.ensemble import AdaBoostClassifier \n",
|
||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||
"from sklearn.naive_bayes import GaussianNB\n",
|
||||
"from sklearn.neighbors import KNeighborsClassifier\n",
|
||||
"from sklearn import linear_model\n",
|
||||
"from sklearn.metrics import (confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, precision_score,\n",
|
||||
" recall_score, mean_squared_error, mean_absolute_error, r2_score, classification_report)\n",
|
||||
"from sklearn.calibration import CalibratedClassifierCV\n",
|
||||
|
@ -66,8 +60,12 @@
|
|||
"from xgboost import XGBClassifier\n",
|
||||
"from lightgbm import LGBMClassifier\n",
|
||||
"from libs.loaders import load_football\n",
|
||||
"from libs.football import get_fifa_data, create_feables\n",
|
||||
"import pkg_resources\n",
|
||||
"\n",
|
||||
"print(\"System version: {}\".format(sys.version))"
|
||||
"print(\"System version: {}\".format(sys.version))\n",
|
||||
"print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n",
|
||||
"print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -425,7 +423,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 68,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -628,60 +628,7 @@
|
|||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"def get_fifa_stats(match, player_stats):\n",
|
||||
" ''' Aggregates fifa stats for a given match. '''\n",
|
||||
" #Define variables\n",
|
||||
" match_id = match.match_api_id\n",
|
||||
" date = match['date']\n",
|
||||
" players = ['home_player_1', 'home_player_2', 'home_player_3', \"home_player_4\", \"home_player_5\",\n",
|
||||
" \"home_player_6\", \"home_player_7\", \"home_player_8\", \"home_player_9\", \"home_player_10\",\n",
|
||||
" \"home_player_11\", \"away_player_1\", \"away_player_2\", \"away_player_3\", \"away_player_4\",\n",
|
||||
" \"away_player_5\", \"away_player_6\", \"away_player_7\", \"away_player_8\", \"away_player_9\",\n",
|
||||
" \"away_player_10\", \"away_player_11\"]\n",
|
||||
" player_stats_new = pd.DataFrame()\n",
|
||||
" names = []\n",
|
||||
" \n",
|
||||
" #Loop through all players\n",
|
||||
" for player in players: \n",
|
||||
" \n",
|
||||
" #Get player ID\n",
|
||||
" player_id = match[player]\n",
|
||||
" \n",
|
||||
" #Get player stats \n",
|
||||
" stats = player_stats[player_stats.player_api_id == player_id]\n",
|
||||
" \n",
|
||||
" #Identify current stats \n",
|
||||
" current_stats = stats[stats.date < date].sort_values(by = 'date', ascending = False)[:1]\n",
|
||||
" \n",
|
||||
" if np.isnan(player_id) == True:\n",
|
||||
" overall_rating = pd.Series(0)\n",
|
||||
" else:\n",
|
||||
" current_stats.reset_index(inplace = True, drop = True)\n",
|
||||
" overall_rating = pd.Series(current_stats.loc[0, \"overall_rating\"])\n",
|
||||
"\n",
|
||||
" #Rename stat\n",
|
||||
" name = \"{}_overall_rating\".format(player)\n",
|
||||
" names.append(name)\n",
|
||||
" \n",
|
||||
" #Aggregate stats\n",
|
||||
" player_stats_new = pd.concat([player_stats_new, overall_rating], axis = 1)\n",
|
||||
" \n",
|
||||
" player_stats_new.columns = names \n",
|
||||
" player_stats_new['match_api_id'] = match_id\n",
|
||||
"\n",
|
||||
" player_stats_new.reset_index(inplace = True, drop = True)\n",
|
||||
" \n",
|
||||
" #Return player stats \n",
|
||||
" return player_stats_new.ix[0] \n",
|
||||
" \n",
|
||||
"def get_fifa_data(matches, player_stats):\n",
|
||||
" ''' Gets fifa data for all matches. ''' \n",
|
||||
" #Apply get_fifa_stats for each match\n",
|
||||
" fifa_data = matches.apply(lambda x :get_fifa_stats(x, player_stats), axis = 1)\n",
|
||||
" return fifa_data"
|
||||
]
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
|
@ -739,315 +686,6 @@
|
|||
"fifa_data.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## Loading all functions\n",
|
||||
"def get_match_label(match):\n",
|
||||
" ''' Derives a label for a given match. '''\n",
|
||||
" \n",
|
||||
" #Define variables\n",
|
||||
" home_goals = match['home_team_goal']\n",
|
||||
" away_goals = match['away_team_goal']\n",
|
||||
" \n",
|
||||
" label = pd.DataFrame()\n",
|
||||
" label.loc[0,'match_api_id'] = match['match_api_id'] \n",
|
||||
"\n",
|
||||
" #Identify match label \n",
|
||||
" if home_goals > away_goals:\n",
|
||||
" label.loc[0,'label'] = \"Win\"\n",
|
||||
" if home_goals == away_goals:\n",
|
||||
" label.loc[0,'label'] = \"Draw\"\n",
|
||||
" if home_goals < away_goals:\n",
|
||||
" label.loc[0,'label'] = \"Defeat\"\n",
|
||||
"\n",
|
||||
" #Return label \n",
|
||||
" return label.loc[0]\n",
|
||||
" \n",
|
||||
" \n",
|
||||
"def get_overall_fifa_rankings(fifa, get_overall = False):\n",
|
||||
" ''' Get overall fifa rankings from fifa data. '''\n",
|
||||
" \n",
|
||||
" temp_data = fifa\n",
|
||||
" \n",
|
||||
" #Check if only overall player stats are desired\n",
|
||||
" if get_overall == True:\n",
|
||||
" \n",
|
||||
" #Get overall stats\n",
|
||||
" data = temp_data.loc[:,(fifa.columns.str.contains('overall_rating'))]\n",
|
||||
" data.loc[:,'match_api_id'] = temp_data.loc[:,'match_api_id']\n",
|
||||
" else:\n",
|
||||
" \n",
|
||||
" #Get all stats except for stat date\n",
|
||||
" cols = fifa.loc[:,(fifa.columns.str.contains('date_stat'))]\n",
|
||||
" temp_data = fifa.drop(cols.columns, axis = 1) \n",
|
||||
" data = temp_data\n",
|
||||
" \n",
|
||||
" #Return data\n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"def get_last_matches(matches, date, team, x = 10):\n",
|
||||
" ''' Get the last x matches of a given team. '''\n",
|
||||
" \n",
|
||||
" #Filter team matches from matches\n",
|
||||
" team_matches = matches[(matches['home_team_api_id'] == team) | (matches['away_team_api_id'] == team)]\n",
|
||||
" \n",
|
||||
" #Filter x last matches from team matches\n",
|
||||
" last_matches = team_matches[team_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:x,:]\n",
|
||||
" \n",
|
||||
" #Return last matches\n",
|
||||
" return last_matches\n",
|
||||
" \n",
|
||||
"def get_last_matches_against_eachother(matches, date, home_team, away_team, x = 10):\n",
|
||||
" ''' Get the last x matches of two given teams. '''\n",
|
||||
" \n",
|
||||
" #Find matches of both teams\n",
|
||||
" home_matches = matches[(matches['home_team_api_id'] == home_team) & (matches['away_team_api_id'] == away_team)] \n",
|
||||
" away_matches = matches[(matches['home_team_api_id'] == away_team) & (matches['away_team_api_id'] == home_team)] \n",
|
||||
" total_matches = pd.concat([home_matches, away_matches])\n",
|
||||
" \n",
|
||||
" #Get last x matches\n",
|
||||
" try: \n",
|
||||
" last_matches = total_matches[total_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:x,:]\n",
|
||||
" except:\n",
|
||||
" last_matches = total_matches[total_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:total_matches.shape[0],:]\n",
|
||||
" \n",
|
||||
" #Check for error in data\n",
|
||||
" if(last_matches.shape[0] > x):\n",
|
||||
" print(\"Error in obtaining matches\")\n",
|
||||
" \n",
|
||||
" #Return data\n",
|
||||
" return last_matches\n",
|
||||
" \n",
|
||||
"def get_goals(matches, team):\n",
|
||||
" ''' Get the goals of a specfic team from a set of matches. '''\n",
|
||||
" \n",
|
||||
" #Find home and away goals\n",
|
||||
" home_goals = int(matches.home_team_goal[matches.home_team_api_id == team].sum())\n",
|
||||
" away_goals = int(matches.away_team_goal[matches.away_team_api_id == team].sum())\n",
|
||||
"\n",
|
||||
" total_goals = home_goals + away_goals\n",
|
||||
" \n",
|
||||
" #Return total goals\n",
|
||||
" return total_goals\n",
|
||||
"\n",
|
||||
"def get_goals_conceided(matches, team):\n",
|
||||
" ''' Get the goals conceided of a specfic team from a set of matches. '''\n",
|
||||
"\n",
|
||||
" #Find home and away goals\n",
|
||||
" home_goals = int(matches.home_team_goal[matches.away_team_api_id == team].sum())\n",
|
||||
" away_goals = int(matches.away_team_goal[matches.home_team_api_id == team].sum())\n",
|
||||
"\n",
|
||||
" total_goals = home_goals + away_goals\n",
|
||||
"\n",
|
||||
" #Return total goals\n",
|
||||
" return total_goals\n",
|
||||
"\n",
|
||||
"def get_wins(matches, team):\n",
|
||||
" ''' Get the number of wins of a specfic team from a set of matches. '''\n",
|
||||
" \n",
|
||||
" #Find home and away wins\n",
|
||||
" home_wins = int(matches.home_team_goal[(matches.home_team_api_id == team) & (matches.home_team_goal > matches.away_team_goal)].count())\n",
|
||||
" away_wins = int(matches.away_team_goal[(matches.away_team_api_id == team) & (matches.away_team_goal > matches.home_team_goal)].count())\n",
|
||||
"\n",
|
||||
" total_wins = home_wins + away_wins\n",
|
||||
"\n",
|
||||
" #Return total wins\n",
|
||||
" return total_wins \n",
|
||||
" \n",
|
||||
"def get_match_features(match, matches, x = 10):\n",
|
||||
" ''' Create match specific features for a given match. '''\n",
|
||||
" \n",
|
||||
" #Define variables\n",
|
||||
" date = match.date\n",
|
||||
" home_team = match.home_team_api_id\n",
|
||||
" away_team = match.away_team_api_id\n",
|
||||
" \n",
|
||||
" #Get last x matches of home and away team\n",
|
||||
" matches_home_team = get_last_matches(matches, date, home_team, x = 10)\n",
|
||||
" matches_away_team = get_last_matches(matches, date, away_team, x = 10)\n",
|
||||
" \n",
|
||||
" #Get last x matches of both teams against each other\n",
|
||||
" last_matches_against = get_last_matches_against_eachother(matches, date, home_team, away_team, x = 3)\n",
|
||||
" \n",
|
||||
" #Create goal variables\n",
|
||||
" home_goals = get_goals(matches_home_team, home_team)\n",
|
||||
" away_goals = get_goals(matches_away_team, away_team)\n",
|
||||
" home_goals_conceided = get_goals_conceided(matches_home_team, home_team)\n",
|
||||
" away_goals_conceided = get_goals_conceided(matches_away_team, away_team)\n",
|
||||
" \n",
|
||||
" #Define result data frame\n",
|
||||
" result = pd.DataFrame()\n",
|
||||
" \n",
|
||||
" #Define ID features\n",
|
||||
" result.loc[0, 'match_api_id'] = match.match_api_id\n",
|
||||
" result.loc[0, 'league_id'] = match.league_id\n",
|
||||
"\n",
|
||||
" #Create match features\n",
|
||||
" result.loc[0, 'home_team_goals_difference'] = home_goals - home_goals_conceided\n",
|
||||
" result.loc[0, 'away_team_goals_difference'] = away_goals - away_goals_conceided\n",
|
||||
" result.loc[0, 'games_won_home_team'] = get_wins(matches_home_team, home_team) \n",
|
||||
" result.loc[0, 'games_won_away_team'] = get_wins(matches_away_team, away_team)\n",
|
||||
" result.loc[0, 'games_against_won'] = get_wins(last_matches_against, home_team)\n",
|
||||
" result.loc[0, 'games_against_lost'] = get_wins(last_matches_against, away_team)\n",
|
||||
" \n",
|
||||
" #Return match features\n",
|
||||
" return result.loc[0]\n",
|
||||
" \n",
|
||||
"def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal = True, x = 10, verbose = True):\n",
|
||||
" ''' Create and aggregate features and labels for all matches. '''\n",
|
||||
"\n",
|
||||
" #Get fifa stats features\n",
|
||||
" fifa_stats = get_overall_fifa_rankings(fifa, get_overall)\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" if verbose == True:\n",
|
||||
" print(\"Generating match features...\")\n",
|
||||
" start = time()\n",
|
||||
" \n",
|
||||
" #Get match features for all matches\n",
|
||||
" match_stats = matches.apply(lambda x: get_match_features(x, matches, x = 10), axis = 1)\n",
|
||||
" \n",
|
||||
" #Create dummies for league ID feature\n",
|
||||
" dummies = pd.get_dummies(match_stats['league_id']).rename(columns = lambda x: 'League_' + str(x))\n",
|
||||
" match_stats = pd.concat([match_stats, dummies], axis = 1)\n",
|
||||
" match_stats.drop(['league_id'], inplace = True, axis = 1)\n",
|
||||
" \n",
|
||||
" end = time()\n",
|
||||
" if verbose == True:\n",
|
||||
" print(\"Match features generated in {:.1f} minutes\".format((end - start)/60))\n",
|
||||
" \n",
|
||||
" if verbose == True: \n",
|
||||
" print(\"Generating match labels...\")\n",
|
||||
" start = time()\n",
|
||||
" \n",
|
||||
" #Create match labels\n",
|
||||
" labels = matches.apply(get_match_label, axis = 1)\n",
|
||||
" end = time()\n",
|
||||
" if verbose == True:\n",
|
||||
" print(\"Match labels generated in {:.1f} minutes\".format((end - start)/60))\n",
|
||||
" \n",
|
||||
" if verbose == True: \n",
|
||||
" print(\"Generating bookkeeper data...\")\n",
|
||||
" start = time()\n",
|
||||
" \n",
|
||||
" #Get bookkeeper quotas for all matches\n",
|
||||
" bk_data = get_bookkeeper_data(matches, bookkeepers, horizontal = True)\n",
|
||||
" bk_data.loc[:,'match_api_id'] = matches.loc[:,'match_api_id']\n",
|
||||
" end = time()\n",
|
||||
" if verbose == True:\n",
|
||||
" print(\"Bookkeeper data generated in {:.1f} minutes\".format((end - start)/60))\n",
|
||||
"\n",
|
||||
" #Merges features and labels into one frame\n",
|
||||
" features = pd.merge(match_stats, fifa_stats, on = 'match_api_id', how = 'left')\n",
|
||||
" features = pd.merge(features, bk_data, on = 'match_api_id', how = 'left')\n",
|
||||
" feables = pd.merge(features, labels, on = 'match_api_id', how = 'left')\n",
|
||||
" \n",
|
||||
" #Drop NA values\n",
|
||||
" feables.dropna(inplace = True)\n",
|
||||
" \n",
|
||||
" #Return preprocessed data\n",
|
||||
" return feables\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"def convert_odds_to_prob(match_odds):\n",
|
||||
" ''' Converts bookkeeper odds to probabilities. '''\n",
|
||||
" \n",
|
||||
" #Define variables\n",
|
||||
" match_id = match_odds.loc[:,'match_api_id']\n",
|
||||
" bookkeeper = match_odds.loc[:,'bookkeeper'] \n",
|
||||
" win_odd = match_odds.loc[:,'Win']\n",
|
||||
" draw_odd = match_odds.loc[:,'Draw']\n",
|
||||
" loss_odd = match_odds.loc[:,'Defeat']\n",
|
||||
" \n",
|
||||
" #Converts odds to prob\n",
|
||||
" win_prob = 1 / win_odd\n",
|
||||
" draw_prob = 1 / draw_odd\n",
|
||||
" loss_prob = 1 / loss_odd\n",
|
||||
" \n",
|
||||
" total_prob = win_prob + draw_prob + loss_prob\n",
|
||||
" \n",
|
||||
" probs = pd.DataFrame()\n",
|
||||
" \n",
|
||||
" #Define output format and scale probs by sum over all probs\n",
|
||||
" probs.loc[:,'match_api_id'] = match_id\n",
|
||||
" probs.loc[:,'bookkeeper'] = bookkeeper\n",
|
||||
" probs.loc[:,'Win'] = win_prob / total_prob\n",
|
||||
" probs.loc[:,'Draw'] = draw_prob / total_prob\n",
|
||||
" probs.loc[:,'Defeat'] = loss_prob / total_prob\n",
|
||||
" \n",
|
||||
" #Return probs and meta data\n",
|
||||
" return probs\n",
|
||||
" \n",
|
||||
"def get_bookkeeper_data(matches, bookkeepers, horizontal = True):\n",
|
||||
" ''' Aggregates bookkeeper data for all matches and bookkeepers. '''\n",
|
||||
" \n",
|
||||
" bk_data = pd.DataFrame()\n",
|
||||
" \n",
|
||||
" #Loop through bookkeepers\n",
|
||||
" for bookkeeper in bookkeepers:\n",
|
||||
"\n",
|
||||
" #Find columns containing data of bookkeeper\n",
|
||||
" temp_data = matches.loc[:,(matches.columns.str.contains(bookkeeper))]\n",
|
||||
" temp_data.loc[:, 'bookkeeper'] = str(bookkeeper)\n",
|
||||
" temp_data.loc[:, 'match_api_id'] = matches.loc[:, 'match_api_id']\n",
|
||||
" \n",
|
||||
" #Rename odds columns and convert to numeric\n",
|
||||
" cols = temp_data.columns.values\n",
|
||||
" cols[:3] = ['Win','Draw','Defeat']\n",
|
||||
" temp_data.columns = cols\n",
|
||||
" temp_data.loc[:,'Win'] = pd.to_numeric(temp_data['Win'])\n",
|
||||
" temp_data.loc[:,'Draw'] = pd.to_numeric(temp_data['Draw'])\n",
|
||||
" temp_data.loc[:,'Defeat'] = pd.to_numeric(temp_data['Defeat'])\n",
|
||||
" \n",
|
||||
" #Check if data should be aggregated horizontally\n",
|
||||
" if(horizontal == True):\n",
|
||||
" \n",
|
||||
" #Convert data to probs\n",
|
||||
" temp_data = convert_odds_to_prob(temp_data)\n",
|
||||
" temp_data.drop('match_api_id', axis = 1, inplace = True)\n",
|
||||
" temp_data.drop('bookkeeper', axis = 1, inplace = True)\n",
|
||||
" \n",
|
||||
" #Rename columns with bookkeeper names\n",
|
||||
" win_name = bookkeeper + \"_\" + \"Win\"\n",
|
||||
" draw_name = bookkeeper + \"_\" + \"Draw\"\n",
|
||||
" defeat_name = bookkeeper + \"_\" + \"Defeat\"\n",
|
||||
" temp_data.columns.values[:3] = [win_name, draw_name, defeat_name]\n",
|
||||
"\n",
|
||||
" #Aggregate data\n",
|
||||
" bk_data = pd.concat([bk_data, temp_data], axis = 1)\n",
|
||||
" else:\n",
|
||||
" #Aggregate vertically\n",
|
||||
" bk_data = bk_data.append(temp_data, ignore_index = True)\n",
|
||||
" \n",
|
||||
" #If horizontal add match api id to data\n",
|
||||
" if(horizontal == True):\n",
|
||||
" temp_data.loc[:, 'match_api_id'] = matches.loc[:, 'match_api_id']\n",
|
||||
" \n",
|
||||
" #Return bookkeeper data\n",
|
||||
" return bk_data\n",
|
||||
" \n",
|
||||
"def get_bookkeeper_probs(matches, bookkeepers, horizontal = False):\n",
|
||||
" ''' Get bookkeeper data and convert to probabilities for vertical aggregation. '''\n",
|
||||
" \n",
|
||||
" #Get bookkeeper data\n",
|
||||
" data = get_bookkeeper_data(matches, bookkeepers, horizontal = False)\n",
|
||||
" \n",
|
||||
" #Convert odds to probabilities\n",
|
||||
" probs = convert_odds_to_prob(data)\n",
|
||||
" \n",
|
||||
" #Return data\n",
|
||||
" return probs\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
|
@ -1827,15 +1465,15 @@
|
|||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"# 2) Concept drift"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
|
@ -1846,14 +1484,306 @@
|
|||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"la_liga_id = 21518\n",
|
||||
"matches_target = matches[matches['league_id'] == la_liga_id]\n",
|
||||
"print(matches_target.shape)\n",
|
||||
"print(matches_target.head(5))\n",
|
||||
"print(matches_target.tail(5))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cols = [\"country_id\", \"league_id\", \"season\", \"stage\", \"date\", \"match_api_id\", \"home_team_api_id\", \n",
|
||||
" \"away_team_api_id\", \"home_team_goal\", \"away_team_goal\", \"home_player_1\", \"home_player_2\",\n",
|
||||
" \"home_player_3\", \"home_player_4\", \"home_player_5\", \"home_player_6\", \"home_player_7\", \n",
|
||||
" \"home_player_8\", \"home_player_9\", \"home_player_10\", \"home_player_11\", \"away_player_1\",\n",
|
||||
" \"away_player_2\", \"away_player_3\", \"away_player_4\", \"away_player_5\", \"away_player_6\",\n",
|
||||
" \"away_player_7\", \"away_player_8\", \"away_player_9\", \"away_player_10\", \"away_player_11\"]\n",
|
||||
"match_data = matches_target.dropna(subset = cols)\n",
|
||||
"print(match_data.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"fifa_data = get_fifa_data(match_data, players)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pd.set_option('display.max_columns', None) #show all columns in pandas"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(fifa_data.shape)\n",
|
||||
"fifa_data.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"bk_cols = ['B365', 'BW', 'IW', 'LB', 'PS', 'WH', 'SJ', 'VC', 'GB', 'BS']\n",
|
||||
"bk_cols_selected = ['B365', 'BW'] \n",
|
||||
"feables = create_feables(match_data, fifa_data, bk_cols_selected, get_overall = True)\n",
|
||||
"print(feables.shape)\n",
|
||||
"feables.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"feables_2015_2016 = feables[feables['season'] == '2015/2016']\n",
|
||||
"print(feables_2015_2016.shape)\n",
|
||||
"feables_2014_2015 = feables[feables['season'] == '2014/2015']\n",
|
||||
"print(feables_2014_2015.shape)\n",
|
||||
"feables_rest = feables[(feables['season'] != '2014/2015') & (feables['season'] != '2015/2016')]\n",
|
||||
"print(feables_rest.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_train = feables_rest[feables_rest.columns.difference(['match_api_id', 'label', 'season'])]\n",
|
||||
"y_train = feables_rest['label']\n",
|
||||
"X_test1 = feables_2014_2015[feables_rest.columns.difference(['match_api_id', 'label', 'season'])]\n",
|
||||
"y_test1 = feables_2014_2015['label']\n",
|
||||
"X_test2 = feables_2015_2016[feables_rest.columns.difference(['match_api_id', 'label', 'season'])]\n",
|
||||
"y_test2 = feables_2015_2016['label']\n",
|
||||
"\n",
|
||||
"print(X_train.shape)\n",
|
||||
"print(X_test1.shape)\n",
|
||||
"print(X_test2.shape)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"feables_up_to_2014_2015 = feables[feables['season'] != '2015/2016']\n",
|
||||
"print(feables_up_to_2014_2015.shape)\n",
|
||||
"X_train2 = feables_up_to_2014_2015[feables_up_to_2014_2015.columns.difference(['match_api_id', 'label', 'season'])]\n",
|
||||
"y_train2 = feables_up_to_2014_2015['label']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"clf = XGBClassifier(max_depth=8, \n",
|
||||
" learning_rate=0.1, \n",
|
||||
" scale_pos_weight=2,\n",
|
||||
" min_child_weight=5,\n",
|
||||
" n_estimators=100,\n",
|
||||
" subsample=1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"clf.fit(X_train, y_train, verbose=True, eval_metric='merror')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_pred1 = clf.predict(X_test1)\n",
|
||||
"print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))\n",
|
||||
"print(\"Score of {} for test set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_test1, y_pred1)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_pred2 = clf.predict(X_test2)\n",
|
||||
"print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))\n",
|
||||
"print(\"Score of {} for test set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_test2, y_pred2)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"clf.fit(X_train2, y_train2, verbose=True, eval_metric='merror')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_pred2 = clf.predict(X_test2)\n",
|
||||
"print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train2, clf.predict(X_train2))))\n",
|
||||
"print(\"Score of {} for test set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_test2, y_pred2)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"clf = LGBMClassifier(num_leaves=255,\n",
|
||||
" learning_rate=0.1, \n",
|
||||
" scale_pos_weight=2,\n",
|
||||
" min_child_weight=5,\n",
|
||||
" n_estimators=100,\n",
|
||||
" subsample=1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='multi_error')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_pred1 = clf.predict(X_test1)\n",
|
||||
"print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))\n",
|
||||
"print(\"Score of {} for test set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_test1, y_pred1)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_pred2 = clf.predict(X_test2)\n",
|
||||
"print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))\n",
|
||||
"print(\"Score of {} for test set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_test2, y_pred2)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"clf.fit(X_train2, y_train2, verbose=True, eval_metric='merror')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_pred2 = clf.predict(X_test2)\n",
|
||||
"print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train2, clf.predict(X_train2))))\n",
|
||||
"print(\"Score of {} for test set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_test2, y_pred2)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python Strata",
|
||||
"display_name": "Python 3.5",
|
||||
"language": "python",
|
||||
"name": "strata"
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -1865,7 +1795,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.0"
|
||||
"version": "3.5.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
|
@ -2182,9 +2182,9 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python Strata",
|
||||
"display_name": "Python 3.5",
|
||||
"language": "python",
|
||||
"name": "strata"
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -2196,7 +2196,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.0"
|
||||
"version": "3.5.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
@ -0,0 +1,352 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def get_fifa_stats(match, player_stats):
    ''' Aggregates fifa stats for a given match.

    For each of the 22 player slots of the match, looks up the player's most
    recent overall rating dated strictly before the match date.

    Args:
        match: One row of the matches table (a pandas Series) providing
            'match_api_id', 'date' and the 'home_player_1'..'home_player_11'
            and 'away_player_1'..'away_player_11' player ids.
        player_stats: DataFrame with at least the columns 'player_api_id',
            'date' and 'overall_rating'.

    Returns:
        A pandas Series holding one '<slot>_overall_rating' entry per player
        slot (home slots first, then away slots) followed by 'match_api_id'.
        A slot is rated 0 when its player id is missing or when the player
        has no rating dated before the match.
    '''
    match_date = match['date']
    slots = ['{}_player_{}'.format(side, number)
             for side in ('home', 'away')
             for number in range(1, 12)]

    names = []
    values = []
    for slot in slots:
        player_id = match[slot]

        # 0 is the fallback rating for "unknown": either the player id is
        # missing or there is no rating recorded before the match date.
        rating = 0
        if not pd.isna(player_id):
            history = player_stats[player_stats['player_api_id'] == player_id]
            earlier = history[history['date'] < match_date]
            if not earlier.empty:
                newest_first = earlier.sort_values(by='date', ascending=False)
                rating = newest_first['overall_rating'].iloc[0]

        names.append('{}_overall_rating'.format(slot))
        values.append(rating)

    names.append('match_api_id')
    values.append(match['match_api_id'])

    # Explicit index keeps the column order stable on every Python version;
    # name=0 mirrors the row label the previous DataFrame-based code returned.
    return pd.Series(values, index=names, name=0)
|
||||
|
||||
def get_fifa_data(matches, player_stats):
    ''' Gets fifa data for all matches.

    Runs get_fifa_stats on every row of `matches`, handing `player_stats`
    through unchanged, and returns whatever DataFrame.apply assembles from
    the per-match results.
    '''
    # Row-wise apply; player_stats is forwarded as the second positional argument
    return matches.apply(get_fifa_stats, axis = 1, args = (player_stats,))
|
||||
|
||||
def get_match_label(match):
    ''' Derives a label for a given match.

    The label is taken from the home side's point of view: "Win" when the
    home team scored more, "Draw" on equal goals, "Defeat" when it scored
    fewer. Returns the single row (a Series) holding 'match_api_id' and,
    when the goals are comparable, 'label'.
    '''
    scored_home = match['home_team_goal']
    scored_away = match['away_team_goal']

    #Work out the outcome first
    if scored_home > scored_away:
        outcome = "Win"
    elif scored_home == scored_away:
        outcome = "Draw"
    elif scored_home < scored_away:
        outcome = "Defeat"
    else:
        #None of the comparisons held (e.g. a NaN goal count): no label
        outcome = None

    #Assemble the one-row result
    frame = pd.DataFrame()
    frame.loc[0, 'match_api_id'] = match['match_api_id']
    if outcome is not None:
        frame.loc[0, 'label'] = outcome

    return frame.loc[0]
|
||||
|
||||
|
||||
def get_overall_fifa_rankings(fifa, get_overall = False):
    ''' Get overall fifa rankings from fifa data.

    Args:
        fifa: DataFrame of per-match fifa features; must contain a
            'match_api_id' column when get_overall is set.
        get_overall: When true, keep only the columns whose name contains
            'overall_rating' (plus 'match_api_id', appended last). Otherwise
            keep everything except the columns whose name contains
            'date_stat'.

    Returns:
        A new DataFrame; `fifa` itself is not modified.
    '''
    if get_overall:
        rating_mask = fifa.columns.str.contains('overall_rating')
        # Explicit copy: assigning into a .loc selection of `fifa` would
        # trigger pandas' chained-assignment (SettingWithCopy) warning.
        data = fifa.loc[:, rating_mask].copy()
        data['match_api_id'] = fifa['match_api_id']
        return data

    #Get all stats except for stat date
    stat_date_mask = fifa.columns.str.contains('date_stat')
    return fifa.drop(fifa.columns[stat_date_mask], axis = 1)
|
||||
|
||||
def get_last_matches(matches, date, team, x = 10):
    ''' Return at most x of the team's latest matches played before `date`.

    A match belongs to the team when the team id appears as either
    'home_team_api_id' or 'away_team_api_id'. The result is ordered by
    'date', newest first.
    '''

    #Matches in which the team took part, at home or away
    played_home = matches['home_team_api_id'] == team
    played_away = matches['away_team_api_id'] == team
    team_matches = matches[played_home | played_away]

    #Only matches strictly before the reference date, newest first
    earlier = team_matches[team_matches.date < date]
    newest_first = earlier.sort_values(by = 'date', ascending = False)

    #Cut down to the first x rows
    return newest_first.iloc[0:x, :]
|
||||
|
||||
def get_last_matches_against_eachother(matches, date, home_team, away_team, x = 10):
    ''' Get the last x matches the two given teams played against each other.

    matches: frame with 'home_team_api_id', 'away_team_api_id' and 'date'.
    date: only matches strictly before this date are considered.
    home_team, away_team: ids of the two teams; fixtures in both directions
        (either side playing at home) are included.
    x: maximum number of matches to return; expected to be non-negative.

    Returns the matching rows ordered by 'date', newest first. Fewer than x
    rows are returned when fewer such matches exist.
    '''

    #Find matches of both teams, in either home/away arrangement
    home_matches = matches[(matches['home_team_api_id'] == home_team) & (matches['away_team_api_id'] == away_team)]
    away_matches = matches[(matches['home_team_api_id'] == away_team) & (matches['away_team_api_id'] == home_team)]
    total_matches = pd.concat([home_matches, away_matches])

    #Get last x matches; positional slicing simply yields fewer rows when
    #fewer exist, so no fallback for short histories is needed
    earlier_matches = total_matches[total_matches.date < date]
    last_matches = earlier_matches.sort_values(by = 'date', ascending = False).iloc[0:x,:]

    #Return data
    return last_matches
|
||||
|
||||
def get_goals(matches, team):
    ''' Count the goals a specific team scored over a set of matches.

    Adds the 'home_team_goal' values of the matches the team played at home
    to the 'away_team_goal' values of the matches it played away.
    '''

    #Rows where the team is the home side / the away side
    at_home = matches.home_team_api_id == team
    on_the_road = matches.away_team_api_id == team

    #Goals scored in each role, each converted to int before adding
    scored_at_home = int(matches.loc[at_home, 'home_team_goal'].sum())
    scored_away = int(matches.loc[on_the_road, 'away_team_goal'].sum())

    return scored_at_home + scored_away
|
||||
|
||||
def get_goals_conceided(matches, team):
    ''' Count the goals a specific team conceded over a set of matches.

    Adds the 'home_team_goal' values of the matches the team played away
    to the 'away_team_goal' values of the matches it played at home.
    '''

    #Rows where the team is the away side / the home side
    as_visitor = matches.away_team_api_id == team
    as_host = matches.home_team_api_id == team

    #Goals scored by the opponent in each case, each converted to int
    conceded_as_visitor = int(matches.loc[as_visitor, 'home_team_goal'].sum())
    conceded_as_host = int(matches.loc[as_host, 'away_team_goal'].sum())

    return conceded_as_visitor + conceded_as_host
|
||||
|
||||
def get_wins(matches, team):
    ''' Count how many of the given matches a specific team won.

    A home win is a match where the team is the home side and
    'home_team_goal' exceeds 'away_team_goal'; an away win is the mirror
    case.
    '''

    #Which side scored more in each match
    home_side_won = matches.home_team_goal > matches.away_team_goal
    away_side_won = matches.away_team_goal > matches.home_team_goal

    #Wins of the team in each role
    won_at_home = (matches.home_team_api_id == team) & home_side_won
    won_away = (matches.away_team_api_id == team) & away_side_won

    wins_at_home = int(matches.loc[won_at_home, 'home_team_goal'].count())
    wins_away = int(matches.loc[won_away, 'away_team_goal'].count())

    return wins_at_home + wins_away
|
||||
|
||||
def get_match_features(match, matches, x = 10):
    ''' Create match specific features for a given match.

    match: one row of the matches frame; 'date', 'home_team_api_id',
        'away_team_api_id', 'match_api_id' and 'league_id' are read.
    matches: frame of all matches, used to look up earlier results.
    x: number of most recent matches per team used for the goal difference
        and games-won features. The head-to-head features always look at
        the last 3 meetings.

    Returns the single row of a one-row frame with the keys 'match_api_id',
    'league_id', 'home_team_goals_difference', 'away_team_goals_difference',
    'games_won_home_team', 'games_won_away_team', 'games_against_won' and
    'games_against_lost'.
    '''

    #Define variables
    date = match.date
    home_team = match.home_team_api_id
    away_team = match.away_team_api_id

    #Get last x matches of home and away team (x is passed on instead of a
    #fixed 10 so the parameter actually takes effect)
    matches_home_team = get_last_matches(matches, date, home_team, x = x)
    matches_away_team = get_last_matches(matches, date, away_team, x = x)

    #Get last 3 matches of both teams against each other
    last_matches_against = get_last_matches_against_eachother(matches, date, home_team, away_team, x = 3)

    #Create goal variables
    home_goals = get_goals(matches_home_team, home_team)
    away_goals = get_goals(matches_away_team, away_team)
    home_goals_conceided = get_goals_conceided(matches_home_team, home_team)
    away_goals_conceided = get_goals_conceided(matches_away_team, away_team)

    #Define result data frame; values are written cell by cell through .loc
    #exactly as before so the resulting dtypes stay unchanged
    result = pd.DataFrame()

    #Define ID features
    result.loc[0, 'match_api_id'] = match.match_api_id
    result.loc[0, 'league_id'] = match.league_id

    #Create match features
    result.loc[0, 'home_team_goals_difference'] = home_goals - home_goals_conceided
    result.loc[0, 'away_team_goals_difference'] = away_goals - away_goals_conceided
    result.loc[0, 'games_won_home_team'] = get_wins(matches_home_team, home_team)
    result.loc[0, 'games_won_away_team'] = get_wins(matches_away_team, away_team)
    result.loc[0, 'games_against_won'] = get_wins(last_matches_against, home_team)
    result.loc[0, 'games_against_lost'] = get_wins(last_matches_against, away_team)

    #Return match features
    return result.loc[0]
|
||||
|
||||
def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal = True, x = 10, verbose = True):
    ''' Create and aggregate features and labels for all matches.

    matches: frame of all matches, including the bookkeeper odds columns.
    fifa: per-match FIFA stats as accepted by get_overall_fifa_rankings.
    bookkeepers: bookkeeper names passed on to get_bookkeeper_data.
    get_overall: forwarded to get_overall_fifa_rankings.
    horizontal: accepted for backward compatibility but not used; the
        bookkeeper data is always requested with horizontal = True because
        the merge below joins one bookkeeper row per match.
    x: number of most recent matches per team used by get_match_features.
    verbose: when true, progress and timing messages are printed.

    Returns one frame with match features, FIFA stats, bookkeeper data and
    labels joined on 'match_api_id'; rows containing any missing value are
    dropped.
    '''

    #Get fifa stats features
    fifa_stats = get_overall_fifa_rankings(fifa, get_overall)

    if verbose:
        print("Generating match features...")
    start = time()

    #Get match features for all matches; the row argument is named `row`
    #so it does not shadow x, which is forwarded as the window size
    match_stats = matches.apply(lambda row: get_match_features(row, matches, x = x), axis = 1)

    #Create dummies for league ID feature
    dummies = pd.get_dummies(match_stats['league_id']).rename(columns = lambda league: 'League_' + str(league))
    match_stats = pd.concat([match_stats, dummies], axis = 1)
    match_stats.drop(['league_id'], inplace = True, axis = 1)

    end = time()
    if verbose:
        print("Match features generated in {:.1f} minutes".format((end - start)/60))

    if verbose:
        print("Generating match labels...")
    start = time()

    #Create match labels
    labels = matches.apply(get_match_label, axis = 1)
    end = time()
    if verbose:
        print("Match labels generated in {:.1f} minutes".format((end - start)/60))

    if verbose:
        print("Generating bookkeeper data...")
    start = time()

    #Get bookkeeper quotas for all matches and attach the match id, which
    #is needed as the join key below
    bk_data = get_bookkeeper_data(matches, bookkeepers, horizontal = True)
    bk_data.loc[:,'match_api_id'] = matches.loc[:,'match_api_id']
    end = time()
    if verbose:
        print("Bookkeeper data generated in {:.1f} minutes".format((end - start)/60))

    #Merges features and labels into one frame
    features = pd.merge(match_stats, fifa_stats, on = 'match_api_id', how = 'left')
    features = pd.merge(features, bk_data, on = 'match_api_id', how = 'left')
    feables = pd.merge(features, labels, on = 'match_api_id', how = 'left')

    #Drop NA values
    feables.dropna(inplace = True)

    #Return preprocessed data
    return feables
|
||||
|
||||
|
||||
def convert_odds_to_prob(match_odds):
    ''' Turn bookkeeper odds into probabilities.

    Reads the columns 'match_api_id', 'bookkeeper', 'Win', 'Draw' and
    'Defeat' of `match_odds`. Each odd is inverted and the three inverses
    are divided by their sum.

    Returns a new frame with the columns 'match_api_id', 'bookkeeper',
    'Win', 'Draw' and 'Defeat', in that order.
    '''

    outcomes = ('Win', 'Draw', 'Defeat')

    #Meta data carried over untouched
    carried_over = [(name, match_odds.loc[:, name]) for name in ('match_api_id', 'bookkeeper')]

    #Inverse of each odd and the sum over the three outcomes
    inverse = dict((outcome, 1 / match_odds.loc[:, outcome]) for outcome in outcomes)
    inverse_sum = inverse['Win'] + inverse['Draw'] + inverse['Defeat']

    #Assemble the output column by column: meta data first, then the
    #inverses scaled by their sum
    probs = pd.DataFrame()
    for name, values in carried_over:
        probs[name] = values
    for outcome in outcomes:
        probs[outcome] = inverse[outcome] / inverse_sum

    return probs
|
||||
|
||||
def get_bookkeeper_data(matches, bookkeepers, horizontal = True):
    ''' Aggregates bookkeeper data for all matches and bookkeepers.

    matches: frame containing 'match_api_id' and, per bookkeeper, the odds
        columns whose names contain the bookkeeper's name. The first three
        of those columns are taken as the Win, Draw and Defeat odds, in
        that order.
    bookkeepers: iterable of bookkeeper names (strings).
    horizontal: when true, the odds of every bookkeeper are converted to
        probabilities and placed side by side in columns named
        '<bookkeeper>_Win', '<bookkeeper>_Draw' and '<bookkeeper>_Defeat'.
        Otherwise the raw odds of all bookkeepers are stacked vertically
        with the columns 'Win', 'Draw', 'Defeat', 'bookkeeper' and
        'match_api_id' and a fresh index.

    Returns the aggregated frame, or an empty frame when `bookkeepers` is
    empty.
    '''

    outcomes = ['Win', 'Draw', 'Defeat']
    frames = []

    #Loop through bookkeepers
    for bookkeeper in bookkeepers:

        #Find columns containing data of bookkeeper; copy so the columns
        #added below do not touch a slice of `matches`
        temp_data = matches.loc[:,(matches.columns.str.contains(bookkeeper))].copy()
        temp_data['bookkeeper'] = str(bookkeeper)
        temp_data['match_api_id'] = matches['match_api_id']

        #Rename odds columns (through a new label list rather than by
        #writing into the index's own array) and convert to numeric
        cols = list(temp_data.columns)
        cols[:3] = outcomes
        temp_data.columns = cols
        for outcome in outcomes:
            temp_data[outcome] = pd.to_numeric(temp_data[outcome])

        #Check if data should be aggregated horizontally
        if horizontal:

            #Convert data to probs and keep only the three probabilities
            temp_data = convert_odds_to_prob(temp_data)
            temp_data = temp_data.drop(['match_api_id', 'bookkeeper'], axis = 1)

            #Rename columns with bookkeeper names
            temp_data.columns = [bookkeeper + "_" + outcome for outcome in outcomes]

        frames.append(temp_data)

    #Nothing to aggregate
    if not frames:
        return pd.DataFrame()

    #Aggregate data
    if horizontal:
        #NOTE: 'match_api_id' is not part of the horizontal result, as
        #before; create_feables attaches it to the returned frame itself
        return pd.concat(frames, axis = 1)

    #Aggregate vertically
    return pd.concat(frames, ignore_index = True)
|
||||
|
||||
def get_bookkeeper_probs(matches, bookkeepers, horizontal = False):
    ''' Return the bookkeeper odds as probabilities, stacked vertically.

    The bookkeeper data is always requested with horizontal = False,
    whatever value is passed for `horizontal`, and the resulting odds are
    then handed to convert_odds_to_prob.
    '''

    #Fetch the vertically aggregated odds and convert them in one go
    return convert_odds_to_prob(get_bookkeeper_data(matches, bookkeepers, horizontal = False))
|
||||
|
Загрузка…
Ссылка в новой задаче