Merge pull request #27 from Azure/miguel_dev
football and airline experiments
This commit is contained in:
Коммит
73a00bb04f
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,86 +0,0 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Playground\n",
|
||||
"\n",
|
||||
"Playground notebook to quickly test code."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"System version: 3.5.2 |Continuum Analytics, Inc.| (default, Jul 2 2016, 17:53:06) \n",
|
||||
"[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os,sys\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"from lightgbm.sklearn import LGBMRegressor\n",
|
||||
"import xgboost as xgb\n",
|
||||
"\n",
|
||||
"from libs.timer import Timer\n",
|
||||
"\n",
|
||||
"print(\"System version: {}\".format(sys.version))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.000576000000000132\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with Timer() as t:\n",
|
||||
" for i in range(10000):\n",
|
||||
" r = 1\n",
|
||||
"print(t.interval)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.5.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
|
@ -0,0 +1,95 @@
|
|||
import pandas as pd
|
||||
|
||||
|
||||
def _get_nominal_integer_dict(nominal_vals):
|
||||
"""Convert nominal values in integers, starting at 0.
|
||||
Parameters:
|
||||
nominal_vals (pd.Series): A series.
|
||||
Returns:
|
||||
d (dict): An dictionary with numeric values.
|
||||
|
||||
"""
|
||||
d = {}
|
||||
for val in nominal_vals:
|
||||
if val not in d:
|
||||
current_max = max(d.values()) if len(d) > 0 else -1
|
||||
d[val] = current_max+1
|
||||
return d
|
||||
|
||||
|
||||
def _convert_to_integer(srs, d):
|
||||
"""Convert series to integer, given a dictionary.
|
||||
Parameters:
|
||||
srs (pd.Series): A series.
|
||||
d (dict): A dictionary mapping values to integers
|
||||
Returns:
|
||||
srs (pd.Series): An series with numeric values.
|
||||
|
||||
"""
|
||||
return srs.map(lambda x: d[x])
|
||||
|
||||
|
||||
def convert_cols_categorical_to_numeric(df, col_list=None):
    """Encode categorical columns as integers, keeping the other columns.

    A column is encoded when its dtype is 'object' or when its name is
    listed in `col_list`; every other column is copied over unchanged.
    Each encoded column gets its own codes, starting at 0 in order of
    first appearance.

    Parameters:
        df (pd.DataFrame): Dataframe.
        col_list (list): List of columns to encode even if they are
            numeric.
    Returns:
        ret (pd.DataFrame): A dataframe with numeric values.
    Examples:
        >>> df = pd.DataFrame({'letters':['a','b','c'],'numbers':[1,2,3]})
        >>> df_numeric = convert_cols_categorical_to_numeric(df)
        >>> print(df_numeric)
           letters  numbers
        0        0        1
        1        1        2
        2        2        3

    """
    forced = [] if col_list is None else col_list
    ret = pd.DataFrame()
    for name in df.columns:
        series = df[name]
        if not (series.dtype == 'object' or name in forced):
            # Already numeric and not forced: pass through untouched.
            ret[name] = series
            continue
        mapping = _get_nominal_integer_dict(series)
        ret[name] = _convert_to_integer(series, mapping)
    return ret
|
||||
|
||||
|
||||
def convert_related_cols_categorical_to_numeric(df, col_list):
    """Convert categorical columns, that are related between each other,
    to numeric and leave the remaining columns as they are.

    All columns in `col_list` share one dictionary, so the same value gets
    the same integer code in every one of them. Codes start at 0 and follow
    the order of first appearance when the columns are read in `col_list`
    order.

    Parameters:
        df (pd.DataFrame): Dataframe.
        col_list (list): List of related columns to encode together. An
            empty list leaves every column unchanged.
    Returns:
        ret (pd.DataFrame): A dataframe with numeric values.
    Examples:
        >>> df = pd.DataFrame({'letters':['a','b','c'],'letters2':['c','d','e'],'numbers':[1,2,3]})
        >>> df_numeric = convert_related_cols_categorical_to_numeric(df, col_list=['letters','letters2'])
        >>> print(df_numeric)
           letters  letters2  numbers
        0        0         2        1
        1        1         3        2
        2        2         4        3

    """
    related = list(col_list)
    if len(related) > 0:
        # Pool the related columns in a single concat instead of growing a
        # series (seeded with None) one column at a time.
        pooled = pd.concat([df[c] for c in related], axis=0)
        values = pd.Series(pooled.unique())
        col_dict = _get_nominal_integer_dict(values)
    else:
        # Nothing to encode; previously this path crashed on a None series.
        col_dict = {}

    ret = pd.DataFrame()
    for column_name in df.columns:
        column = df[column_name]
        if column_name in related:
            ret[column_name] = _convert_to_integer(column, col_dict)
        else:
            ret[column_name] = column
    return ret
|
||||
|
|
@ -0,0 +1,344 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def get_fifa_stats(match, player_stats):
    ''' Aggregates fifa stats for a given match.

    For each of the 22 player slots (home_player_1..11, away_player_1..11)
    the most recent 'overall_rating' recorded strictly before the match
    date is looked up in player_stats. A slot whose player id is missing
    (NaN) gets a rating of 0.

    Parameters:
        match: One match row; read via match.match_api_id, match['date']
            and match['<slot>'] for every player slot.
        player_stats (pd.DataFrame): Must provide the columns
            'player_api_id', 'date' and 'overall_rating'.
    Returns:
        pd.Series: '<slot>_overall_rating' for every slot followed by
            'match_api_id'.
    Raises:
        KeyError: If a listed player has no rating dated before the match.
    '''

    #Define variables
    match_id = match.match_api_id
    date = match['date']
    players = ["{}_player_{}".format(side, number)
               for side in ("home", "away")
               for number in range(1, 12)]
    names = []
    ratings = []

    #Loop through all players
    for player in players:

        #Get player ID
        player_id = match[player]

        if pd.isna(player_id):
            #Unknown player: neutral rating, no lookup needed
            overall_rating = 0
        else:
            #Get player stats
            stats = player_stats[player_stats.player_api_id == player_id]

            #Identify current stats (latest entry before the match date)
            current_stats = stats[stats.date < date].sort_values(by = 'date', ascending = False)[:1]
            current_stats = current_stats.reset_index(drop = True)
            overall_rating = current_stats.loc[0, "overall_rating"]

        #Rename stat
        names.append("{}_overall_rating".format(player))
        ratings.append(overall_rating)

    #Aggregate stats in one step instead of concatenating column by column
    player_stats_new = pd.DataFrame([ratings], columns = names)
    player_stats_new['match_api_id'] = match_id

    #Return player stats (.ix was removed from pandas; the single row sits at position 0)
    return player_stats_new.iloc[0]
|
||||
|
||||
def get_fifa_data(matches, player_stats):
    ''' Gets fifa data for all matches.

    Runs get_fifa_stats on every row of matches (row-wise apply) and
    returns whatever the apply produces.
    '''

    def stats_for_row(match_row):
        #Bind the shared player stats table to the per-match helper
        return get_fifa_stats(match_row, player_stats)

    return matches.apply(stats_for_row, axis = 1)
|
||||
|
||||
def get_match_label(match):
    ''' Derives a label for a given match.

    The label is taken from the home side's point of view: "Win" when the
    home team scored more, "Draw" on equal goals, "Defeat" otherwise. When
    none of the comparisons holds no label is set at all.

    Returns the single result row holding 'match_api_id' and 'label'.
    '''

    #Read the score
    goals_home = match['home_team_goal']
    goals_away = match['away_team_goal']

    #One-row frame that carries the match id and its label
    row = pd.DataFrame()
    row.loc[0,'match_api_id'] = match['match_api_id']

    #Work out the outcome; the three cases are mutually exclusive
    if goals_home > goals_away:
        outcome = "Win"
    elif goals_home == goals_away:
        outcome = "Draw"
    elif goals_home < goals_away:
        outcome = "Defeat"
    else:
        outcome = None

    if outcome is not None:
        row.loc[0,'label'] = outcome

    #Hand back the row
    return row.loc[0]
|
||||
|
||||
|
||||
def get_overall_fifa_rankings(fifa, get_overall = False):
    ''' Get overall fifa rankings from fifa data.

    Parameters:
        fifa (pd.DataFrame): Per-match fifa data; needs a 'match_api_id'
            column when get_overall is set.
        get_overall (bool): If set, keep only the columns whose name
            contains 'overall_rating' plus 'match_api_id'. Otherwise keep
            everything except the columns whose name contains 'date_stat'.
    Returns:
        pd.DataFrame: A new frame; the input is not modified.
    '''

    #Check if only overall player stats are desired
    if get_overall:

        #Get overall stats; copy explicitly so that adding the id column
        #writes to our own frame and not to a slice of the input
        rating_columns = fifa.columns.str.contains('overall_rating')
        data = fifa.loc[:, rating_columns].copy()
        data['match_api_id'] = fifa['match_api_id']
        return data

    #Get all stats except for stat date
    date_columns = fifa.columns[fifa.columns.str.contains('date_stat')]
    return fifa.drop(date_columns, axis = 1)
|
||||
|
||||
def get_last_matches(matches, date, team, x = 10):
    ''' Get the last x matches of a given team.

    Considers every match in which the team appears as home or away side,
    keeps those dated strictly before `date` and returns at most x of
    them, newest first.
    '''

    #Rows where the team plays, on either side
    plays_home = matches['home_team_api_id'] == team
    plays_away = matches['away_team_api_id'] == team
    team_matches = matches[plays_home | plays_away]

    #Only earlier matches, most recent on top
    earlier = team_matches[team_matches.date < date]
    newest_first = earlier.sort_values(by = 'date', ascending = False)

    #Cut down to the x most recent ones
    return newest_first.iloc[0:x,:]
|
||||
|
||||
def get_last_matches_against_eachother(matches, date, home_team, away_team, x = 10):
    ''' Get the last x matches of two given teams.

    Collects the meetings of the two teams in both home/away
    constellations, keeps those dated strictly before `date` and returns
    at most x of them, newest first.

    Parameters:
        matches (pd.DataFrame): Needs the columns 'home_team_api_id',
            'away_team_api_id' and 'date'.
        date: Upper bound (exclusive) for the match date.
        home_team, away_team: Team ids of the two opponents.
        x: Maximum number of matches to return.
    Returns:
        pd.DataFrame: The selected matches.
    '''

    #Find matches of both teams
    home_matches = matches[(matches['home_team_api_id'] == home_team) & (matches['away_team_api_id'] == away_team)]
    away_matches = matches[(matches['home_team_api_id'] == away_team) & (matches['away_team_api_id'] == home_team)]
    total_matches = pd.concat([home_matches, away_matches])

    #Earlier meetings, newest first
    earlier_matches = total_matches[total_matches.date < date].sort_values(by = 'date', ascending = False)

    #Get last x matches
    try:
        last_matches = earlier_matches.iloc[0:x,:]
    except TypeError:
        #x cannot be used as a slice bound: fall back to all earlier
        #meetings. Only this case is caught; anything else (including
        #KeyboardInterrupt) now propagates instead of being swallowed.
        last_matches = earlier_matches

    #Check for error in data
    if(last_matches.shape[0] > x):
        print("Error in obtaining matches")

    #Return data
    return last_matches
|
||||
|
||||
def get_goals(matches, team):
    ''' Get the goals of a specific team from a set of matches.

    Adds the home goals of the matches the team played at home to the away
    goals of the matches it played away and returns the total as an int.
    '''

    #Goals scored when playing at home
    scored_at_home = matches.loc[matches.home_team_api_id == team, 'home_team_goal'].sum()

    #Goals scored when playing away
    scored_away = matches.loc[matches.away_team_api_id == team, 'away_team_goal'].sum()

    #Return total goals
    return int(scored_at_home) + int(scored_away)
|
||||
|
||||
def get_goals_conceided(matches, team):
    ''' Get the goals conceded by a specific team in a set of matches.

    Adds the home goals of the matches in which the team was the away side
    to the away goals of the matches in which it was the home side and
    returns the total as an int.
    '''

    #Goals the opponent scored at home while the team was visiting
    conceded_away = matches.loc[matches.away_team_api_id == team, 'home_team_goal'].sum()

    #Goals the visiting opponent scored while the team was at home
    conceded_at_home = matches.loc[matches.home_team_api_id == team, 'away_team_goal'].sum()

    #Return total goals
    return int(conceded_away) + int(conceded_at_home)
|
||||
|
||||
def get_wins(matches, team):
    ''' Get the number of wins of a specific team from a set of matches.

    A home win is a match where the team is the home side and the home
    goals exceed the away goals; an away win is the mirror case. The sum
    of both counts is returned as an int.
    '''

    #Matches won at home
    won_at_home = (matches.home_team_api_id == team) & (matches.home_team_goal > matches.away_team_goal)
    home_wins = matches.loc[won_at_home, 'home_team_goal'].count()

    #Matches won away
    won_away = (matches.away_team_api_id == team) & (matches.away_team_goal > matches.home_team_goal)
    away_wins = matches.loc[won_away, 'away_team_goal'].count()

    #Return total wins
    return int(home_wins) + int(away_wins)
|
||||
|
||||
def get_match_features(match, matches, x = 10):
    ''' Create match specific features for a given match.

    Parameters:
        match: One match row; read via match.date, match.home_team_api_id,
            match.away_team_api_id, match.match_api_id, match.league_id
            and match['season'].
        matches (pd.DataFrame): All matches, used to look up each team's
            previous games.
        x (int): Number of previous matches per team used for the form
            features. The head-to-head window is fixed at 3 matches.
    Returns:
        pd.Series: One row with the ids, goal differences, win counts and
            the season start year.
    '''

    #Define variables
    date = match.date
    home_team = match.home_team_api_id
    away_team = match.away_team_api_id

    #Get last x matches of home and away team
    #(x is now honoured; it used to be hard-coded to 10 here)
    matches_home_team = get_last_matches(matches, date, home_team, x = x)
    matches_away_team = get_last_matches(matches, date, away_team, x = x)

    #Get last 3 matches of both teams against each other
    last_matches_against = get_last_matches_against_eachother(matches, date, home_team, away_team, x = 3)

    #Create goal variables
    home_goals = get_goals(matches_home_team, home_team)
    away_goals = get_goals(matches_away_team, away_team)
    home_goals_conceided = get_goals_conceided(matches_home_team, home_team)
    away_goals_conceided = get_goals_conceided(matches_away_team, away_team)

    #Define result data frame
    result = pd.DataFrame()

    #Define ID features
    result.loc[0, 'match_api_id'] = match.match_api_id
    result.loc[0, 'league_id'] = match.league_id

    #Create match features
    result.loc[0, 'home_team_goals_difference'] = home_goals - home_goals_conceided
    result.loc[0, 'away_team_goals_difference'] = away_goals - away_goals_conceided
    result.loc[0, 'games_won_home_team'] = get_wins(matches_home_team, home_team)
    result.loc[0, 'games_won_away_team'] = get_wins(matches_away_team, away_team)
    result.loc[0, 'games_against_won'] = get_wins(last_matches_against, home_team)
    result.loc[0, 'games_against_lost'] = get_wins(last_matches_against, away_team)

    #Add season (first year of a 'YYYY/YYYY' style string)
    result.loc[0, 'season'] = int(match['season'].split('/')[0])

    #Return match features
    return result.loc[0]
|
||||
|
||||
def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal = True, x = 10, all_leagues = True, verbose = True):
    ''' Create and aggregate features and labels for all matches.

    Parameters:
        matches (pd.DataFrame): All matches.
        fifa (pd.DataFrame): Per-match fifa data, see
            get_overall_fifa_rankings.
        bookkeepers (list): Bookkeeper column prefixes, see
            get_bookkeeper_data.
        get_overall (bool): Forwarded to get_overall_fifa_rankings.
        horizontal (bool): Currently unused; bookkeeper data is always
            aggregated horizontally because the merge below needs one row
            per match.
        x (int): Number of previous matches used for the form features.
        all_leagues (bool): If set, 'league_id' is replaced by
            'League_<id>' dummy columns.
        verbose (bool): Print progress messages.
    Returns:
        pd.DataFrame: Features and label per match; rows containing NA
            values are dropped.
    '''

    #Get fifa stats features
    fifa_stats = get_overall_fifa_rankings(fifa, get_overall)

    if verbose:
        print("Generating match features...")

    #Get match features for all matches
    #(x is forwarded now; the lambda argument used to shadow it and 10 was hard-coded)
    match_stats = matches.apply(lambda row: get_match_features(row, matches, x = x), axis = 1)

    #Create dummies for league ID feature
    #NOTE(review): the drop of 'league_id' is assumed to belong to this
    #branch - confirm against the original indentation
    if all_leagues:
        dummies = pd.get_dummies(match_stats['league_id']).rename(columns = lambda league: 'League_' + str(league))
        match_stats = pd.concat([match_stats, dummies], axis = 1)
        match_stats.drop(['league_id'], inplace = True, axis = 1)

    if verbose:
        print("Generating match labels...")

    #Create match labels
    labels = matches.apply(get_match_label, axis = 1)

    if verbose:
        print("Generating bookkeeper data...")

    #Get bookkeeper quotas for all matches
    bk_data = get_bookkeeper_data(matches, bookkeepers, horizontal = True)
    bk_data.loc[:,'match_api_id'] = matches.loc[:,'match_api_id']

    #Merges features and labels into one frame
    features = pd.merge(match_stats, fifa_stats, on = 'match_api_id', how = 'left')
    features = pd.merge(features, bk_data, on = 'match_api_id', how = 'left')
    feables = pd.merge(features, labels, on = 'match_api_id', how = 'left')

    #Drop NA values
    feables.dropna(inplace = True)

    #Return preprocessed data
    return feables
|
||||
|
||||
|
||||
def convert_odds_to_prob(match_odds):
    ''' Converts bookkeeper odds to probabilities.

    Each of the 'Win', 'Draw' and 'Defeat' odds is inverted and the three
    inverses are rescaled by their sum, so that they add up to 1 per row.
    'match_api_id' and 'bookkeeper' are carried over unchanged.
    '''

    #Meta data that is passed through as is
    meta = {column: match_odds.loc[:, column] for column in ('match_api_id', 'bookkeeper')}

    #Inverse odds per outcome
    outcomes = ('Win', 'Draw', 'Defeat')
    inverse = {outcome: 1 / match_odds.loc[:, outcome] for outcome in outcomes}

    #Normaliser: sum of the three inverse odds
    total = inverse['Win'] + inverse['Draw'] + inverse['Defeat']

    #Assemble output: meta data first, then the scaled probabilities
    probs = pd.DataFrame()
    for column in ('match_api_id', 'bookkeeper'):
        probs[column] = meta[column]
    for outcome in outcomes:
        probs[outcome] = inverse[outcome] / total

    #Return probs and meta data
    return probs
|
||||
|
||||
def get_bookkeeper_data(matches, bookkeepers, horizontal = True):
    ''' Aggregates bookkeeper data for all matches and bookkeepers.

    For every bookkeeper the columns of matches whose name contains the
    bookkeeper string are selected; the first three of them are taken as
    the Win, Draw and Defeat odds (in that order) and converted to numeric.

    Parameters:
        matches (pd.DataFrame): Matches with the odds columns and a
            'match_api_id' column.
        bookkeepers (list): Bookkeeper name strings.
        horizontal (bool): If set, the odds are converted to probabilities
            and the bookkeepers are placed side by side in columns named
            '<bookkeeper>_Win', '<bookkeeper>_Draw', '<bookkeeper>_Defeat'
            (no 'match_api_id' column; callers attach it themselves).
            Otherwise the raw odds are stacked vertically with
            'bookkeeper' and 'match_api_id' columns and a fresh index.
    Returns:
        pd.DataFrame: The aggregated data; an empty frame when bookkeepers
            is empty.
    Raises:
        ValueError: If fewer than three odds columns match a bookkeeper.
    '''

    frames = []

    #Loop through bookkeepers
    for bookkeeper in bookkeepers:

        #Find columns containing data of bookkeeper (own copy, input stays untouched)
        temp_data = matches.loc[:,(matches.columns.str.contains(bookkeeper))].copy()
        if temp_data.shape[1] < 3:
            raise ValueError("Expected at least 3 odds columns for bookkeeper {}".format(bookkeeper))
        temp_data['bookkeeper'] = str(bookkeeper)
        temp_data['match_api_id'] = matches['match_api_id']

        #Rename odds columns (assign new labels instead of mutating
        #columns.values in place) and convert to numeric
        temp_data.columns = ['Win','Draw','Defeat'] + list(temp_data.columns[3:])
        for outcome in ('Win', 'Draw', 'Defeat'):
            temp_data[outcome] = pd.to_numeric(temp_data[outcome])

        #Check if data should be aggregated horizontally
        if horizontal:

            #Convert data to probs
            temp_data = convert_odds_to_prob(temp_data)
            temp_data = temp_data.drop(['match_api_id', 'bookkeeper'], axis = 1)

            #Rename columns with bookkeeper names
            temp_data = temp_data.rename(columns = {
                'Win': bookkeeper + "_" + "Win",
                'Draw': bookkeeper + "_" + "Draw",
                'Defeat': bookkeeper + "_" + "Defeat",
            })

        frames.append(temp_data)

    if not frames:
        return pd.DataFrame()

    #Aggregate data in one go. DataFrame.append no longer exists in current
    #pandas, so the vertical case uses concat as well.
    if horizontal:
        bk_data = pd.concat(frames, axis = 1)
    else:
        bk_data = pd.concat(frames, ignore_index = True)

    #NOTE: the former trailing write of 'match_api_id' went to the last
    #per-bookkeeper scratch frame and never reached the result, so it was
    #removed; the returned columns are unchanged.

    #Return bookkeeper data
    return bk_data
|
||||
|
||||
def get_bookkeeper_probs(matches, bookkeepers, horizontal = False):
    ''' Get bookkeeper data and convert to probabilities for vertical aggregation.

    The bookkeeper odds are always collected vertically (one block of rows
    per bookkeeper), whatever `horizontal` is set to, and then turned into
    probabilities.
    '''

    #Collect the raw odds and convert them in one go
    return convert_odds_to_prob(get_bookkeeper_data(matches, bookkeepers, horizontal = False))
|
||||
|
Загрузка…
Ссылка в новой задаче