This commit is contained in:
miguelgfierro 2017-05-20 20:09:04 +00:00
Родитель b4a6ac1168
Коммит 2eb40b4bea
3 изменённых файлов: 663 добавлений и 381 удалений

Просмотреть файл

@ -45,12 +45,6 @@
"import seaborn as sns\n",
"import itertools\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.ensemble import AdaBoostClassifier \n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn import linear_model\n",
"from sklearn.metrics import (confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, precision_score,\n",
" recall_score, mean_squared_error, mean_absolute_error, r2_score, classification_report)\n",
"from sklearn.calibration import CalibratedClassifierCV\n",
@ -66,8 +60,12 @@
"from xgboost import XGBClassifier\n",
"from lightgbm import LGBMClassifier\n",
"from libs.loaders import load_football\n",
"from libs.football import get_fifa_data, create_feables\n",
"import pkg_resources\n",
"\n",
"print(\"System version: {}\".format(sys.version))"
"print(\"System version: {}\".format(sys.version))\n",
"print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n",
"print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))"
]
},
{
@ -425,7 +423,9 @@
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
@ -628,60 +628,7 @@
"editable": true
},
"outputs": [],
"source": [
"\n",
"def get_fifa_stats(match, player_stats):\n",
" ''' Aggregates fifa stats for a given match. '''\n",
" #Define variables\n",
" match_id = match.match_api_id\n",
" date = match['date']\n",
" players = ['home_player_1', 'home_player_2', 'home_player_3', \"home_player_4\", \"home_player_5\",\n",
" \"home_player_6\", \"home_player_7\", \"home_player_8\", \"home_player_9\", \"home_player_10\",\n",
" \"home_player_11\", \"away_player_1\", \"away_player_2\", \"away_player_3\", \"away_player_4\",\n",
" \"away_player_5\", \"away_player_6\", \"away_player_7\", \"away_player_8\", \"away_player_9\",\n",
" \"away_player_10\", \"away_player_11\"]\n",
" player_stats_new = pd.DataFrame()\n",
" names = []\n",
" \n",
" #Loop through all players\n",
" for player in players: \n",
" \n",
" #Get player ID\n",
" player_id = match[player]\n",
" \n",
" #Get player stats \n",
" stats = player_stats[player_stats.player_api_id == player_id]\n",
" \n",
" #Identify current stats \n",
" current_stats = stats[stats.date < date].sort_values(by = 'date', ascending = False)[:1]\n",
" \n",
" if np.isnan(player_id) == True:\n",
" overall_rating = pd.Series(0)\n",
" else:\n",
" current_stats.reset_index(inplace = True, drop = True)\n",
" overall_rating = pd.Series(current_stats.loc[0, \"overall_rating\"])\n",
"\n",
" #Rename stat\n",
" name = \"{}_overall_rating\".format(player)\n",
" names.append(name)\n",
" \n",
" #Aggregate stats\n",
" player_stats_new = pd.concat([player_stats_new, overall_rating], axis = 1)\n",
" \n",
" player_stats_new.columns = names \n",
" player_stats_new['match_api_id'] = match_id\n",
"\n",
" player_stats_new.reset_index(inplace = True, drop = True)\n",
" \n",
" #Return player stats \n",
" return player_stats_new.ix[0] \n",
" \n",
"def get_fifa_data(matches, player_stats):\n",
" ''' Gets fifa data for all matches. ''' \n",
" #Apply get_fifa_stats for each match\n",
" fifa_data = matches.apply(lambda x :get_fifa_stats(x, player_stats), axis = 1)\n",
" return fifa_data"
]
"source": []
},
{
"cell_type": "code",
@ -739,315 +686,6 @@
"fifa_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"## Loading all functions\n",
"def get_match_label(match):\n",
" ''' Derives a label for a given match. '''\n",
" \n",
" #Define variables\n",
" home_goals = match['home_team_goal']\n",
" away_goals = match['away_team_goal']\n",
" \n",
" label = pd.DataFrame()\n",
" label.loc[0,'match_api_id'] = match['match_api_id'] \n",
"\n",
" #Identify match label \n",
" if home_goals > away_goals:\n",
" label.loc[0,'label'] = \"Win\"\n",
" if home_goals == away_goals:\n",
" label.loc[0,'label'] = \"Draw\"\n",
" if home_goals < away_goals:\n",
" label.loc[0,'label'] = \"Defeat\"\n",
"\n",
" #Return label \n",
" return label.loc[0]\n",
" \n",
" \n",
"def get_overall_fifa_rankings(fifa, get_overall = False):\n",
" ''' Get overall fifa rankings from fifa data. '''\n",
" \n",
" temp_data = fifa\n",
" \n",
" #Check if only overall player stats are desired\n",
" if get_overall == True:\n",
" \n",
" #Get overall stats\n",
" data = temp_data.loc[:,(fifa.columns.str.contains('overall_rating'))]\n",
" data.loc[:,'match_api_id'] = temp_data.loc[:,'match_api_id']\n",
" else:\n",
" \n",
" #Get all stats except for stat date\n",
" cols = fifa.loc[:,(fifa.columns.str.contains('date_stat'))]\n",
" temp_data = fifa.drop(cols.columns, axis = 1) \n",
" data = temp_data\n",
" \n",
" #Return data\n",
" return data\n",
"\n",
"def get_last_matches(matches, date, team, x = 10):\n",
" ''' Get the last x matches of a given team. '''\n",
" \n",
" #Filter team matches from matches\n",
" team_matches = matches[(matches['home_team_api_id'] == team) | (matches['away_team_api_id'] == team)]\n",
" \n",
" #Filter x last matches from team matches\n",
" last_matches = team_matches[team_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:x,:]\n",
" \n",
" #Return last matches\n",
" return last_matches\n",
" \n",
"def get_last_matches_against_eachother(matches, date, home_team, away_team, x = 10):\n",
" ''' Get the last x matches of two given teams. '''\n",
" \n",
" #Find matches of both teams\n",
" home_matches = matches[(matches['home_team_api_id'] == home_team) & (matches['away_team_api_id'] == away_team)] \n",
" away_matches = matches[(matches['home_team_api_id'] == away_team) & (matches['away_team_api_id'] == home_team)] \n",
" total_matches = pd.concat([home_matches, away_matches])\n",
" \n",
" #Get last x matches\n",
" try: \n",
" last_matches = total_matches[total_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:x,:]\n",
" except:\n",
" last_matches = total_matches[total_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:total_matches.shape[0],:]\n",
" \n",
" #Check for error in data\n",
" if(last_matches.shape[0] > x):\n",
" print(\"Error in obtaining matches\")\n",
" \n",
" #Return data\n",
" return last_matches\n",
" \n",
"def get_goals(matches, team):\n",
" ''' Get the goals of a specfic team from a set of matches. '''\n",
" \n",
" #Find home and away goals\n",
" home_goals = int(matches.home_team_goal[matches.home_team_api_id == team].sum())\n",
" away_goals = int(matches.away_team_goal[matches.away_team_api_id == team].sum())\n",
"\n",
" total_goals = home_goals + away_goals\n",
" \n",
" #Return total goals\n",
" return total_goals\n",
"\n",
"def get_goals_conceided(matches, team):\n",
" ''' Get the goals conceided of a specfic team from a set of matches. '''\n",
"\n",
" #Find home and away goals\n",
" home_goals = int(matches.home_team_goal[matches.away_team_api_id == team].sum())\n",
" away_goals = int(matches.away_team_goal[matches.home_team_api_id == team].sum())\n",
"\n",
" total_goals = home_goals + away_goals\n",
"\n",
" #Return total goals\n",
" return total_goals\n",
"\n",
"def get_wins(matches, team):\n",
" ''' Get the number of wins of a specfic team from a set of matches. '''\n",
" \n",
" #Find home and away wins\n",
" home_wins = int(matches.home_team_goal[(matches.home_team_api_id == team) & (matches.home_team_goal > matches.away_team_goal)].count())\n",
" away_wins = int(matches.away_team_goal[(matches.away_team_api_id == team) & (matches.away_team_goal > matches.home_team_goal)].count())\n",
"\n",
" total_wins = home_wins + away_wins\n",
"\n",
" #Return total wins\n",
" return total_wins \n",
" \n",
"def get_match_features(match, matches, x = 10):\n",
" ''' Create match specific features for a given match. '''\n",
" \n",
" #Define variables\n",
" date = match.date\n",
" home_team = match.home_team_api_id\n",
" away_team = match.away_team_api_id\n",
" \n",
" #Get last x matches of home and away team\n",
" matches_home_team = get_last_matches(matches, date, home_team, x = 10)\n",
" matches_away_team = get_last_matches(matches, date, away_team, x = 10)\n",
" \n",
" #Get last x matches of both teams against each other\n",
" last_matches_against = get_last_matches_against_eachother(matches, date, home_team, away_team, x = 3)\n",
" \n",
" #Create goal variables\n",
" home_goals = get_goals(matches_home_team, home_team)\n",
" away_goals = get_goals(matches_away_team, away_team)\n",
" home_goals_conceided = get_goals_conceided(matches_home_team, home_team)\n",
" away_goals_conceided = get_goals_conceided(matches_away_team, away_team)\n",
" \n",
" #Define result data frame\n",
" result = pd.DataFrame()\n",
" \n",
" #Define ID features\n",
" result.loc[0, 'match_api_id'] = match.match_api_id\n",
" result.loc[0, 'league_id'] = match.league_id\n",
"\n",
" #Create match features\n",
" result.loc[0, 'home_team_goals_difference'] = home_goals - home_goals_conceided\n",
" result.loc[0, 'away_team_goals_difference'] = away_goals - away_goals_conceided\n",
" result.loc[0, 'games_won_home_team'] = get_wins(matches_home_team, home_team) \n",
" result.loc[0, 'games_won_away_team'] = get_wins(matches_away_team, away_team)\n",
" result.loc[0, 'games_against_won'] = get_wins(last_matches_against, home_team)\n",
" result.loc[0, 'games_against_lost'] = get_wins(last_matches_against, away_team)\n",
" \n",
" #Return match features\n",
" return result.loc[0]\n",
" \n",
"def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal = True, x = 10, verbose = True):\n",
" ''' Create and aggregate features and labels for all matches. '''\n",
"\n",
" #Get fifa stats features\n",
" fifa_stats = get_overall_fifa_rankings(fifa, get_overall)\n",
" \n",
" \n",
" if verbose == True:\n",
" print(\"Generating match features...\")\n",
" start = time()\n",
" \n",
" #Get match features for all matches\n",
" match_stats = matches.apply(lambda x: get_match_features(x, matches, x = 10), axis = 1)\n",
" \n",
" #Create dummies for league ID feature\n",
" dummies = pd.get_dummies(match_stats['league_id']).rename(columns = lambda x: 'League_' + str(x))\n",
" match_stats = pd.concat([match_stats, dummies], axis = 1)\n",
" match_stats.drop(['league_id'], inplace = True, axis = 1)\n",
" \n",
" end = time()\n",
" if verbose == True:\n",
" print(\"Match features generated in {:.1f} minutes\".format((end - start)/60))\n",
" \n",
" if verbose == True: \n",
" print(\"Generating match labels...\")\n",
" start = time()\n",
" \n",
" #Create match labels\n",
" labels = matches.apply(get_match_label, axis = 1)\n",
" end = time()\n",
" if verbose == True:\n",
" print(\"Match labels generated in {:.1f} minutes\".format((end - start)/60))\n",
" \n",
" if verbose == True: \n",
" print(\"Generating bookkeeper data...\")\n",
" start = time()\n",
" \n",
" #Get bookkeeper quotas for all matches\n",
" bk_data = get_bookkeeper_data(matches, bookkeepers, horizontal = True)\n",
" bk_data.loc[:,'match_api_id'] = matches.loc[:,'match_api_id']\n",
" end = time()\n",
" if verbose == True:\n",
" print(\"Bookkeeper data generated in {:.1f} minutes\".format((end - start)/60))\n",
"\n",
" #Merges features and labels into one frame\n",
" features = pd.merge(match_stats, fifa_stats, on = 'match_api_id', how = 'left')\n",
" features = pd.merge(features, bk_data, on = 'match_api_id', how = 'left')\n",
" feables = pd.merge(features, labels, on = 'match_api_id', how = 'left')\n",
" \n",
" #Drop NA values\n",
" feables.dropna(inplace = True)\n",
" \n",
" #Return preprocessed data\n",
" return feables\n",
" \n",
"\n",
"def convert_odds_to_prob(match_odds):\n",
" ''' Converts bookkeeper odds to probabilities. '''\n",
" \n",
" #Define variables\n",
" match_id = match_odds.loc[:,'match_api_id']\n",
" bookkeeper = match_odds.loc[:,'bookkeeper'] \n",
" win_odd = match_odds.loc[:,'Win']\n",
" draw_odd = match_odds.loc[:,'Draw']\n",
" loss_odd = match_odds.loc[:,'Defeat']\n",
" \n",
" #Converts odds to prob\n",
" win_prob = 1 / win_odd\n",
" draw_prob = 1 / draw_odd\n",
" loss_prob = 1 / loss_odd\n",
" \n",
" total_prob = win_prob + draw_prob + loss_prob\n",
" \n",
" probs = pd.DataFrame()\n",
" \n",
" #Define output format and scale probs by sum over all probs\n",
" probs.loc[:,'match_api_id'] = match_id\n",
" probs.loc[:,'bookkeeper'] = bookkeeper\n",
" probs.loc[:,'Win'] = win_prob / total_prob\n",
" probs.loc[:,'Draw'] = draw_prob / total_prob\n",
" probs.loc[:,'Defeat'] = loss_prob / total_prob\n",
" \n",
" #Return probs and meta data\n",
" return probs\n",
" \n",
"def get_bookkeeper_data(matches, bookkeepers, horizontal = True):\n",
" ''' Aggregates bookkeeper data for all matches and bookkeepers. '''\n",
" \n",
" bk_data = pd.DataFrame()\n",
" \n",
" #Loop through bookkeepers\n",
" for bookkeeper in bookkeepers:\n",
"\n",
" #Find columns containing data of bookkeeper\n",
" temp_data = matches.loc[:,(matches.columns.str.contains(bookkeeper))]\n",
" temp_data.loc[:, 'bookkeeper'] = str(bookkeeper)\n",
" temp_data.loc[:, 'match_api_id'] = matches.loc[:, 'match_api_id']\n",
" \n",
" #Rename odds columns and convert to numeric\n",
" cols = temp_data.columns.values\n",
" cols[:3] = ['Win','Draw','Defeat']\n",
" temp_data.columns = cols\n",
" temp_data.loc[:,'Win'] = pd.to_numeric(temp_data['Win'])\n",
" temp_data.loc[:,'Draw'] = pd.to_numeric(temp_data['Draw'])\n",
" temp_data.loc[:,'Defeat'] = pd.to_numeric(temp_data['Defeat'])\n",
" \n",
" #Check if data should be aggregated horizontally\n",
" if(horizontal == True):\n",
" \n",
" #Convert data to probs\n",
" temp_data = convert_odds_to_prob(temp_data)\n",
" temp_data.drop('match_api_id', axis = 1, inplace = True)\n",
" temp_data.drop('bookkeeper', axis = 1, inplace = True)\n",
" \n",
" #Rename columns with bookkeeper names\n",
" win_name = bookkeeper + \"_\" + \"Win\"\n",
" draw_name = bookkeeper + \"_\" + \"Draw\"\n",
" defeat_name = bookkeeper + \"_\" + \"Defeat\"\n",
" temp_data.columns.values[:3] = [win_name, draw_name, defeat_name]\n",
"\n",
" #Aggregate data\n",
" bk_data = pd.concat([bk_data, temp_data], axis = 1)\n",
" else:\n",
" #Aggregate vertically\n",
" bk_data = bk_data.append(temp_data, ignore_index = True)\n",
" \n",
" #If horizontal add match api id to data\n",
" if(horizontal == True):\n",
" temp_data.loc[:, 'match_api_id'] = matches.loc[:, 'match_api_id']\n",
" \n",
" #Return bookkeeper data\n",
" return bk_data\n",
" \n",
"def get_bookkeeper_probs(matches, bookkeepers, horizontal = False):\n",
" ''' Get bookkeeper data and convert to probabilities for vertical aggregation. '''\n",
" \n",
" #Get bookkeeper data\n",
" data = get_bookkeeper_data(matches, bookkeepers, horizontal = False)\n",
" \n",
" #Convert odds to probabilities\n",
" probs = convert_odds_to_prob(data)\n",
" \n",
" #Return data\n",
" return probs\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
@ -1827,15 +1465,15 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
"source": [
"# 2) Concept drift"
]
},
{
"cell_type": "code",
@ -1846,14 +1484,306 @@
"editable": true
},
"outputs": [],
"source": [
"la_liga_id = 21518\n",
"matches_target = matches[matches['league_id'] == la_liga_id]\n",
"print(matches_target.shape)\n",
"print(matches_target.head(5))\n",
"print(matches_target.tail(5))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"cols = [\"country_id\", \"league_id\", \"season\", \"stage\", \"date\", \"match_api_id\", \"home_team_api_id\", \n",
" \"away_team_api_id\", \"home_team_goal\", \"away_team_goal\", \"home_player_1\", \"home_player_2\",\n",
" \"home_player_3\", \"home_player_4\", \"home_player_5\", \"home_player_6\", \"home_player_7\", \n",
" \"home_player_8\", \"home_player_9\", \"home_player_10\", \"home_player_11\", \"away_player_1\",\n",
" \"away_player_2\", \"away_player_3\", \"away_player_4\", \"away_player_5\", \"away_player_6\",\n",
" \"away_player_7\", \"away_player_8\", \"away_player_9\", \"away_player_10\", \"away_player_11\"]\n",
"match_data = matches_target.dropna(subset = cols)\n",
"print(match_data.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%%time\n",
"fifa_data = get_fifa_data(match_data, players)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pd.set_option('display.max_columns', None) #show all columns in pandas"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"print(fifa_data.shape)\n",
"fifa_data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%%time\n",
"bk_cols = ['B365', 'BW', 'IW', 'LB', 'PS', 'WH', 'SJ', 'VC', 'GB', 'BS']\n",
"bk_cols_selected = ['B365', 'BW'] \n",
"feables = create_feables(match_data, fifa_data, bk_cols_selected, get_overall = True)\n",
"print(feables.shape)\n",
"feables.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"feables_2015_2016 = feables[feables['season'] == '2015/2016']\n",
"print(feables_2015_2016.shape)\n",
"feables_2014_2015 = feables[feables['season'] == '2014/2015']\n",
"print(feables_2014_2015.shape)\n",
"feables_rest = feables[(feables['season'] != '2014/2015') & (feables['season'] != '2015/2016')]\n",
"print(feables_rest.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_train = feables_rest[feables_rest.columns.difference(['match_api_id', 'label', 'season'])]\n",
"y_train = feables_rest['label']\n",
"X_test1 = feables_2014_2015[feables_rest.columns.difference(['match_api_id', 'label', 'season'])]\n",
"y_test1 = feables_2014_2015['label']\n",
"X_test2 = feables_2015_2016[feables_rest.columns.difference(['match_api_id', 'label', 'season'])]\n",
"y_test2 = feables_2015_2016['label']\n",
"\n",
"print(X_train.shape)\n",
"print(X_test1.shape)\n",
"print(X_test2.shape)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"feables_up_to_2014_2015 = feables[feables['season'] != '2015/2016']\n",
"print(feables_up_to_2014_2015.shape)\n",
"X_train2 = feables_up_to_2014_2015[feables_up_to_2014_2015.columns.difference(['match_api_id', 'label', 'season'])]\n",
"y_train2 = feables_up_to_2014_2015['label']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"clf = XGBClassifier(max_depth=8, \n",
" learning_rate=0.1, \n",
" scale_pos_weight=2,\n",
" min_child_weight=5,\n",
" n_estimators=100,\n",
" subsample=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%%time\n",
"clf.fit(X_train, y_train, verbose=True, eval_metric='merror')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_pred1 = clf.predict(X_test1)\n",
"print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))\n",
"print(\"Score of {} for test set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_test1, y_pred1)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_pred2 = clf.predict(X_test2)\n",
"print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))\n",
"print(\"Score of {} for test set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_test2, y_pred2)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%%time\n",
"clf.fit(X_train2, y_train2, verbose=True, eval_metric='merror')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_pred2 = clf.predict(X_test2)\n",
"print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train2, clf.predict(X_train2))))\n",
"print(\"Score of {} for test set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_test2, y_pred2)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"clf = LGBMClassifier(num_leaves=255,\n",
" learning_rate=0.1, \n",
" scale_pos_weight=2,\n",
" min_child_weight=5,\n",
" n_estimators=100,\n",
" subsample=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%%time\n",
"clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='multi_error')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_pred1 = clf.predict(X_test1)\n",
"print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))\n",
"print(\"Score of {} for test set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_test1, y_pred1)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_pred2 = clf.predict(X_test2)\n",
"print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))\n",
"print(\"Score of {} for test set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_test2, y_pred2)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%%time\n",
"clf.fit(X_train2, y_train2, verbose=True, eval_metric='merror')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_pred2 = clf.predict(X_test2)\n",
"print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train2, clf.predict(X_train2))))\n",
"print(\"Score of {} for test set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_test2, y_pred2)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Strata",
"display_name": "Python 3.5",
"language": "python",
"name": "strata"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@ -1865,7 +1795,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
"version": "3.5.2"
}
},
"nbformat": 4,

Просмотреть файл

@ -2182,9 +2182,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python Strata",
"display_name": "Python 3.5",
"language": "python",
"name": "strata"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@ -2196,7 +2196,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
"version": "3.5.2"
}
},
"nbformat": 4,

Просмотреть файл

@ -0,0 +1,352 @@
import numpy as np
import pandas as pd
def get_fifa_stats(match, player_stats):
    ''' Aggregates fifa stats for a given match.

    For each of the 22 line-up slots (home_player_1..11, away_player_1..11)
    the most recent ``overall_rating`` recorded strictly before the match date
    is looked up in ``player_stats``.

    Parameters:
        match: one row of the matches table (a pandas Series) providing
            ``match_api_id``, ``date`` and the 22 player id fields.
        player_stats: DataFrame with at least the columns ``player_api_id``,
            ``date`` and ``overall_rating``.

    Returns:
        A pandas Series (named 0) with one ``<slot>_overall_rating`` entry per
        line-up slot followed by ``match_api_id``. A slot whose player id is
        missing, or whose player has no rating dated before the match, gets
        the placeholder rating 0.
    '''
    #Define variables
    match_id = match.match_api_id
    date = match['date']
    players = ['{}_player_{}'.format(side, number)
               for side in ('home', 'away')
               for number in range(1, 12)]
    names = []
    ratings = []

    #Loop through all players
    for player in players:

        #Get player ID
        player_id = match[player]

        #0 is the placeholder for "no rating available"
        rating = 0

        #pd.isnull also copes with None / non-float ids, unlike np.isnan
        if not pd.isnull(player_id):

            #Get player stats
            stats = player_stats[player_stats.player_api_id == player_id]

            #Identify current stats: latest entry strictly before the match
            current_stats = stats[stats.date < date].sort_values(by = 'date', ascending = False)[:1]

            #A player without any earlier entry keeps the placeholder
            #instead of raising KeyError on an empty frame
            if len(current_stats) > 0:
                rating = current_stats['overall_rating'].iloc[0]

        #Rename stat
        names.append("{}_overall_rating".format(player))
        ratings.append(rating)

    #Aggregate stats in one go instead of concatenating inside the loop
    player_stats_new = pd.DataFrame([ratings], columns = names)
    player_stats_new['match_api_id'] = match_id

    #Return player stats (.iloc replaces the removed .ix indexer)
    return player_stats_new.iloc[0]
def get_fifa_data(matches, player_stats):
    ''' Gets fifa data for all matches.

    Runs get_fifa_stats on every row of ``matches`` against the same
    ``player_stats`` table and returns the combined result of the row-wise
    apply.
    '''
    def stats_for(match):
        #Bind the shared player table so apply only has to pass the row
        return get_fifa_stats(match, player_stats)

    return matches.apply(stats_for, axis = 1)
def get_match_label(match):
    ''' Derives a label for a given match.

    Parameters:
        match: one match row providing ``match_api_id``, ``home_team_goal``
            and ``away_team_goal``.

    Returns:
        A pandas Series (named 0) with ``match_api_id`` and ``label``, where
        the label is "Win", "Draw" or "Defeat" from the home team's point of
        view. If the goals cannot be compared (e.g. one of them is NaN) no
        ``label`` entry is produced.
    '''
    #Define variables
    home_goals = match['home_team_goal']
    away_goals = match['away_team_goal']

    keys = ['match_api_id']
    values = [match['match_api_id']]

    #Identify match label
    if home_goals > away_goals:
        label = "Win"
    elif home_goals == away_goals:
        label = "Draw"
    elif home_goals < away_goals:
        label = "Defeat"
    else:
        #All three comparisons are False for NaN scores
        label = None

    if label is not None:
        keys.append('label')
        values.append(label)

    #Build the row directly; no temporary DataFrame needed
    return pd.Series(values, index = keys, name = 0)
def get_overall_fifa_rankings(fifa, get_overall = False):
    ''' Get overall fifa rankings from fifa data.

    Parameters:
        fifa: DataFrame of per-match fifa stats containing ``match_api_id``.
        get_overall: when true, keep only the columns whose name contains
            "overall_rating" (plus ``match_api_id``); otherwise keep every
            column except those whose name contains "date_stat".

    Returns:
        A new DataFrame; ``fifa`` itself is not modified.
    '''
    #Check if only overall player stats are desired
    if get_overall:

        #Get overall stats; copy so the id column is added to an independent
        #frame rather than to a slice of the caller's data
        data = fifa.loc[:, fifa.columns.str.contains('overall_rating')].copy()
        data['match_api_id'] = fifa['match_api_id']
        return data

    #Get all stats except for stat date
    date_columns = fifa.columns[fifa.columns.str.contains('date_stat')]
    return fifa.drop(date_columns, axis = 1)
def get_last_matches(matches, date, team, x = 10):
    ''' Get the last x matches of a given team.

    Selects the rows of ``matches`` in which ``team`` played (home or away)
    with a ``date`` strictly before ``date`` and returns at most ``x`` of
    them, most recent first.
    '''
    #Rows where the team took part, on either side
    played = (matches['home_team_api_id'] == team) | (matches['away_team_api_id'] == team)

    #Restrict to matches before the reference date
    earlier = matches[played & (matches.date < date)]

    #Newest first, then cut to the requested number
    newest_first = earlier.sort_values(by = 'date', ascending = False)
    return newest_first.iloc[0:x, :]
def get_last_matches_against_eachother(matches, date, home_team, away_team, x = 10):
    ''' Get the last x matches of two given teams.

    Parameters:
        matches: DataFrame with ``home_team_api_id``, ``away_team_api_id``
            and ``date`` columns.
        date: only matches dated strictly before this value are considered.
        home_team, away_team: ids of the two teams; fixtures in both
            directions (either side at home) are included.
        x: maximum number of matches to return.

    Returns:
        At most ``x`` rows of ``matches``, most recent first.
    '''
    #Find matches of both teams
    home_matches = matches[(matches['home_team_api_id'] == home_team) & (matches['away_team_api_id'] == away_team)]
    away_matches = matches[(matches['home_team_api_id'] == away_team) & (matches['away_team_api_id'] == home_team)]
    total_matches = pd.concat([home_matches, away_matches])

    #Get last x matches. Positional slicing never fails when fewer than x
    #rows exist, so no try/except fallback is required.
    earlier = total_matches[total_matches.date < date]
    last_matches = earlier.sort_values(by = 'date', ascending = False).iloc[0:x, :]

    #Check for error in data (can only trigger for a negative x)
    if last_matches.shape[0] > x:
        print("Error in obtaining matches")

    #Return data
    return last_matches
def get_goals(matches, team):
    ''' Get the goals of a specific team from a set of matches.

    Adds the home goals of the matches the team hosted to the away goals of
    the matches it played away and returns the total as an int.
    '''
    #Goals scored while playing at home
    scored_at_home = matches.loc[matches.home_team_api_id == team, 'home_team_goal'].sum()

    #Goals scored while playing away
    scored_away = matches.loc[matches.away_team_api_id == team, 'away_team_goal'].sum()

    return int(scored_at_home) + int(scored_away)
def get_goals_conceided(matches, team):
    ''' Get the goals conceded by a specific team from a set of matches.

    Adds the home-side goals of the matches the team played away to the
    away-side goals of the matches it hosted and returns the total as an int.
    '''
    #Goals the hosts scored when the team was the visitor
    conceded_away = matches.loc[matches.away_team_api_id == team, 'home_team_goal'].sum()

    #Goals the visitors scored when the team was the host
    conceded_at_home = matches.loc[matches.home_team_api_id == team, 'away_team_goal'].sum()

    return int(conceded_away) + int(conceded_at_home)
def get_wins(matches, team):
    ''' Get the number of wins of a specific team from a set of matches.

    Counts the matches the team hosted and won plus the matches it played
    away and won, and returns the total as an int.
    '''
    #Which side scored more in each match
    home_side_won = matches.home_team_goal > matches.away_team_goal
    away_side_won = matches.away_team_goal > matches.home_team_goal

    #Wins as host and wins as visitor
    won_at_home = (matches.home_team_api_id == team) & home_side_won
    won_away = (matches.away_team_api_id == team) & away_side_won

    return int(won_at_home.sum()) + int(won_away.sum())
def get_match_features(match, matches, x = 10):
    ''' Create match specific features for a given match.

    Parameters:
        match: one match row providing ``date``, ``home_team_api_id``,
            ``away_team_api_id``, ``match_api_id`` and ``league_id``.
        matches: DataFrame of all matches used to look up earlier results.
        x: number of most recent matches per team used for the form
            features (previously accepted but ignored in favour of a
            hard-coded 10).

    Returns:
        A pandas Series (row 0 of a one-row frame) with ``match_api_id``,
        ``league_id``, the goal difference and number of wins of each team
        over its last ``x`` matches, and the wins of each side in their last
        3 meetings.
    '''
    #Define variables
    date = match.date
    home_team = match.home_team_api_id
    away_team = match.away_team_api_id

    #Get last x matches of home and away team
    matches_home_team = get_last_matches(matches, date, home_team, x = x)
    matches_away_team = get_last_matches(matches, date, away_team, x = x)

    #Get last 3 matches of both teams against each other
    last_matches_against = get_last_matches_against_eachother(matches, date, home_team, away_team, x = 3)

    #Create goal variables
    home_goals = get_goals(matches_home_team, home_team)
    away_goals = get_goals(matches_away_team, away_team)
    home_goals_conceided = get_goals_conceided(matches_home_team, home_team)
    away_goals_conceided = get_goals_conceided(matches_away_team, away_team)

    #ID features first, then match features, in output column order
    features = [
        ('match_api_id', match.match_api_id),
        ('league_id', match.league_id),
        ('home_team_goals_difference', home_goals - home_goals_conceided),
        ('away_team_goals_difference', away_goals - away_goals_conceided),
        ('games_won_home_team', get_wins(matches_home_team, home_team)),
        ('games_won_away_team', get_wins(matches_away_team, away_team)),
        ('games_against_won', get_wins(last_matches_against, home_team)),
        ('games_against_lost', get_wins(last_matches_against, away_team)),
    ]

    #Fill a one-row frame cell by cell, exactly as before, so the resulting
    #dtypes (and the league dummy column names derived from league_id
    #downstream) do not change
    result = pd.DataFrame()
    for name, value in features:
        result.loc[0, name] = value

    #Return match features
    return result.loc[0]
def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal = True, x = 10, verbose = True):
    ''' Create and aggregate features and labels for all matches.

    Parameters:
        matches: DataFrame of matches (ids, teams, goals, date, odds columns).
        fifa: per-match fifa stats as accepted by get_overall_fifa_rankings.
        bookkeepers: iterable of bookkeeper column prefixes passed to
            get_bookkeeper_data.
        get_overall: forwarded to get_overall_fifa_rankings.
        horizontal: accepted for backward compatibility only; bookkeeper data
            is always aggregated horizontally (one row per match) because it
            is merged on ``match_api_id`` below.
        x: number of past matches per team used for the form features
            (forwarded to get_match_features).
        verbose: print progress and timing messages.

    Returns:
        DataFrame of match features, fifa stats, bookkeeper probabilities and
        the match label merged on ``match_api_id``, with rows containing any
        NA value dropped.
    '''
    #time is not imported at module level, so bring it in locally
    from time import time

    #Get fifa stats features
    fifa_stats = get_overall_fifa_rankings(fifa, get_overall)

    if verbose:
        print("Generating match features...")
    start = time()

    #Get match features for all matches
    match_stats = matches.apply(lambda match: get_match_features(match, matches, x = x), axis = 1)

    #Create dummies for league ID feature
    dummies = pd.get_dummies(match_stats['league_id']).rename(columns = lambda league: 'League_' + str(league))
    match_stats = pd.concat([match_stats, dummies], axis = 1)
    match_stats.drop(['league_id'], inplace = True, axis = 1)

    end = time()
    if verbose:
        print("Match features generated in {:.1f} minutes".format((end - start)/60))

    if verbose:
        print("Generating match labels...")
    start = time()

    #Create match labels
    labels = matches.apply(get_match_label, axis = 1)
    end = time()
    if verbose:
        print("Match labels generated in {:.1f} minutes".format((end - start)/60))

    if verbose:
        print("Generating bookkeeper data...")
    start = time()

    #Get bookkeeper quotas for all matches
    bk_data = get_bookkeeper_data(matches, bookkeepers, horizontal = True)
    bk_data.loc[:,'match_api_id'] = matches.loc[:,'match_api_id']
    end = time()
    if verbose:
        print("Bookkeeper data generated in {:.1f} minutes".format((end - start)/60))

    #Merges features and labels into one frame
    features = pd.merge(match_stats, fifa_stats, on = 'match_api_id', how = 'left')
    features = pd.merge(features, bk_data, on = 'match_api_id', how = 'left')
    feables = pd.merge(features, labels, on = 'match_api_id', how = 'left')

    #Drop NA values
    feables.dropna(inplace = True)

    #Return preprocessed data
    return feables
def convert_odds_to_prob(match_odds):
    ''' Converts bookkeeper odds to probabilities.

    Takes a frame with ``match_api_id``, ``bookkeeper``, ``Win``, ``Draw``
    and ``Defeat`` odds columns, inverts each odd and rescales the three
    inverses so they sum to one per row. Returns a new frame with the same
    five columns, the odds replaced by the scaled probabilities.
    '''
    #Inverse odds per outcome
    inv_win = 1 / match_odds.loc[:,'Win']
    inv_draw = 1 / match_odds.loc[:,'Draw']
    inv_loss = 1 / match_odds.loc[:,'Defeat']

    #Sum of the inverses, used as the scaling denominator
    inv_sum = inv_win + inv_draw + inv_loss

    #Assemble meta data and scaled probs in the output column order
    return pd.DataFrame(
        {
            'match_api_id': match_odds.loc[:,'match_api_id'],
            'bookkeeper': match_odds.loc[:,'bookkeeper'],
            'Win': inv_win / inv_sum,
            'Draw': inv_draw / inv_sum,
            'Defeat': inv_loss / inv_sum,
        },
        columns = ['match_api_id', 'bookkeeper', 'Win', 'Draw', 'Defeat'],
    )
def get_bookkeeper_data(matches, bookkeepers, horizontal = True):
    ''' Aggregates bookkeeper data for all matches and bookkeepers.

    Parameters:
        matches: DataFrame containing ``match_api_id`` and, per bookkeeper,
            at least three odds columns whose names contain the bookkeeper
            prefix. The first three such columns are taken, in order, as the
            Win, Draw and Defeat odds (assumed to be home/draw/away order;
            verify against the data source).
        bookkeepers: iterable of bookkeeper prefixes (used as patterns for
            ``str.contains`` on the column names).
        horizontal: if true, odds are converted to probabilities and placed
            side by side as ``<bookkeeper>_Win/_Draw/_Defeat`` columns, one
            row per match (``match_api_id`` is not included; callers add it).
            If false, the raw numeric odds of all bookkeepers are stacked
            vertically with ``bookkeeper`` and ``match_api_id`` columns.

    Returns:
        The aggregated DataFrame; an empty DataFrame when ``bookkeepers`` is
        empty.

    Raises:
        ValueError: if fewer than three columns match a bookkeeper, or the
            odds cannot be converted to numbers.
    '''
    outcomes = ['Win', 'Draw', 'Defeat']
    frames = []

    #Loop through bookkeepers
    for bookkeeper in bookkeepers:

        #Find columns containing data of bookkeeper; copy so the columns
        #added below never touch the caller's frame
        temp_data = matches.loc[:, matches.columns.str.contains(bookkeeper)].copy()
        if temp_data.shape[1] < 3:
            raise ValueError("Expected at least 3 odds columns for bookkeeper {}".format(bookkeeper))

        #Rename odds columns and convert to numeric
        temp_data.columns = outcomes + list(temp_data.columns[3:])
        for outcome in outcomes:
            temp_data[outcome] = pd.to_numeric(temp_data[outcome])

        temp_data['bookkeeper'] = str(bookkeeper)
        temp_data['match_api_id'] = matches['match_api_id']

        #Check if data should be aggregated horizontally
        if horizontal:

            #Convert data to probs
            temp_data = convert_odds_to_prob(temp_data)
            temp_data = temp_data.drop(['match_api_id', 'bookkeeper'], axis = 1)

            #Rename columns with bookkeeper names
            temp_data.columns = [bookkeeper + "_" + outcome for outcome in outcomes]

        frames.append(temp_data)

    #Nothing to aggregate
    if not frames:
        return pd.DataFrame()

    #Aggregate data once instead of growing the frame inside the loop
    if horizontal:
        return pd.concat(frames, axis = 1)

    #Aggregate vertically (pd.concat replaces the removed DataFrame.append)
    return pd.concat(frames, ignore_index = True)
def get_bookkeeper_probs(matches, bookkeepers, horizontal = False):
    ''' Get bookkeeper data and convert to probabilities for vertical aggregation.

    Collects the odds of all given bookkeepers stacked vertically and
    converts them to probabilities. The ``horizontal`` argument is not used:
    the data is always gathered with ``horizontal = False``.
    '''
    #Vertically stacked odds, one row per match and bookkeeper
    stacked_odds = get_bookkeeper_data(matches, bookkeepers, horizontal = False)

    #Odds -> probabilities
    return convert_odds_to_prob(stacked_odds)