Merge pull request #27 from Azure/miguel_dev

football and airline experiments
This commit is contained in:
Mat 2017-05-21 17:42:56 +01:00 коммит произвёл GitHub
Родитель cac9c3f7fa 90ae075717
Коммит 73a00bb04f
5 изменённых файлов: 5260 добавлений и 86 удалений

1934
experiments/01_airline.ipynb Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,86 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Playground\n",
"\n",
"Playground notebook to quickly test code."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"System version: 3.5.2 |Continuum Analytics, Inc.| (default, Jul 2 2016, 17:53:06) \n",
"[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n"
]
}
],
"source": [
"import os,sys\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from lightgbm.sklearn import LGBMRegressor\n",
"import xgboost as xgb\n",
"\n",
"from libs.timer import Timer\n",
"\n",
"print(\"System version: {}\".format(sys.version))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.000576000000000132\n"
]
}
],
"source": [
"with Timer() as t:\n",
" for i in range(10000):\n",
" r = 1\n",
"print(t.interval)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

Просмотреть файл

@ -0,0 +1,95 @@
import pandas as pd
def _get_nominal_integer_dict(nominal_vals):
    """Convert nominal values in integers, starting at 0.

    Codes are assigned in order of first appearance, so the first distinct
    value gets 0, the second distinct value gets 1, and so on.

    Parameters:
        nominal_vals (pd.Series): A series (any iterable of hashable values
            is iterated the same way).
    Returns:
        d (dict): A dictionary mapping each distinct value to its integer code.
    """
    d = {}
    for val in nominal_vals:
        if val not in d:
            # Codes are handed out consecutively from 0, so the next free code
            # always equals the number of values seen so far. This avoids
            # rescanning every stored code for each new value.
            d[val] = len(d)
    return d
def _convert_to_integer(srs, d):
    """Translate every element of a series through a lookup dictionary.

    Parameters:
        srs (pd.Series): A series whose values are keys of `d`.
        d (dict): A dictionary mapping values to integers.
    Returns:
        srs (pd.Series): A series holding the looked-up values.
    """
    # Map through the dictionary's item lookup (not the dict object itself)
    # so that a value missing from `d` still raises KeyError.
    lookup = d.__getitem__
    return srs.map(lookup)
def convert_cols_categorical_to_numeric(df, col_list=None):
    """Encode categorical columns as integers, leaving numeric columns as is.

    A column is encoded when its dtype is 'object' or when its name is
    listed in col_list, which lets the caller force the encoding of a
    numerical column.

    Parameters:
        df (pd.DataFrame): Dataframe.
        col_list (list): List of columns to encode regardless of dtype.
    Returns:
        ret (pd.DataFrame): A dataframe with numeric values.
    Examples:
        >>> df = pd.DataFrame({'letters':['a','b','c'],'numbers':[1,2,3]})
        >>> df_numeric = convert_cols_categorical_to_numeric(df)
        >>> print(df_numeric)
           letters  numbers
        0        0        1
        1        1        2
        2        2        3
    """
    forced = [] if col_list is None else col_list
    ret = pd.DataFrame()
    for column_name in df.columns:
        column = df[column_name]
        needs_encoding = column.dtype == 'object' or column_name in forced
        if not needs_encoding:
            # Numeric column that was not explicitly requested: copy through.
            ret[column_name] = column
            continue
        codes = _get_nominal_integer_dict(column)
        ret[column_name] = _convert_to_integer(column, codes)
    return ret
def convert_related_cols_categorical_to_numeric(df, col_list):
    """Convert categorical columns, that are related between each other,
    to numeric and leave the other columns as they are.

    All columns named in col_list share one value-to-integer dictionary, so
    the same value receives the same code in every one of them. Codes follow
    the order of first appearance when the listed columns are stacked in the
    order given by col_list.

    Parameters:
        df (pd.DataFrame): Dataframe.
        col_list (list): List of related columns. An empty list returns the
            columns of df unchanged.
    Returns:
        ret (pd.DataFrame): A dataframe with numeric values.
    Examples:
        >>> df = pd.DataFrame({'letters':['a','b','c'],'letters2':['c','d','e'],'numbers':[1,2,3]})
        >>> df_numeric = convert_related_cols_categorical_to_numeric(df, col_list=['letters','letters2'])
        >>> print(df_numeric)
           letters  letters2  numbers
        0        0         2        1
        1        1         3        2
        2        2         4        3
    """
    related = [df[c] for c in col_list]
    if related:
        # Stack the related columns once and keep the distinct values in
        # order of appearance; they define the shared encoding.
        values = pd.Series(pd.concat(related, axis=0).unique())
        col_dict = _get_nominal_integer_dict(values)
    else:
        # Nothing to encode: avoid calling .unique() on a missing stack.
        col_dict = {}
    ret = pd.DataFrame()
    for column_name in df.columns:
        column = df[column_name]
        if column_name in col_list:
            ret[column_name] = _convert_to_integer(column, col_dict)
        else:
            ret[column_name] = column
    return ret

Просмотреть файл

@ -0,0 +1,344 @@
import numpy as np
import pandas as pd
def get_fifa_stats(match, player_stats):
    ''' Aggregates fifa stats for a given match.

    For each of the 22 player slots (home_player_1..11, away_player_1..11)
    the most recent overall_rating dated strictly before the match date is
    looked up in player_stats. A slot whose player id is NaN gets rating 0.

    Parameters:
        match: one match row exposing match_api_id, date and the 22 player
            slot fields.
        player_stats (pd.DataFrame): frame with player_api_id, date and
            overall_rating columns.
    Returns:
        pd.Series with one "<slot>_overall_rating" entry per slot plus
        match_api_id.
    Raises:
        KeyError: if a non-NaN player has no stats row dated before the match.
    '''
    #Define variables
    match_id = match.match_api_id
    date = match['date']
    players = (['home_player_{}'.format(i) for i in range(1, 12)] +
               ['away_player_{}'.format(i) for i in range(1, 12)])
    ratings = []
    names = []

    #Loop through all players
    for player in players:

        #Get player ID
        player_id = match[player]

        if np.isnan(player_id):
            #Missing player: use a neutral rating of 0
            overall_rating = pd.Series(0)
        else:
            #Get player stats
            stats = player_stats[player_stats.player_api_id == player_id]

            #Identify current stats (latest row strictly before the match)
            current_stats = stats[stats.date < date].sort_values(by = 'date', ascending = False)[:1]
            current_stats = current_stats.reset_index(drop = True)
            overall_rating = pd.Series(current_stats.loc[0, "overall_rating"])

        #Rename stat
        names.append("{}_overall_rating".format(player))
        ratings.append(overall_rating)

    #Aggregate stats with a single concat instead of one per player
    player_stats_new = pd.concat(ratings, axis = 1)
    player_stats_new.columns = names
    player_stats_new['match_api_id'] = match_id
    player_stats_new.reset_index(inplace = True, drop = True)

    #Return player stats (.iloc replaces .ix, which recent pandas no longer has)
    return player_stats_new.iloc[0]
def get_fifa_data(matches, player_stats):
    ''' Gets fifa data for all matches.

    Runs get_fifa_stats on every row of matches, handing player_stats to
    each call, and returns what DataFrame.apply assembles from the results.
    '''
    #Row-wise apply; player_stats travels as the extra positional argument
    return matches.apply(get_fifa_stats, axis = 1, args = (player_stats,))
def get_match_label(match):
    ''' Derives a label for a given match.

    The label is taken from the home team's point of view: "Win" when the
    home side scored more, "Draw" on equal goals, "Defeat" otherwise.
    Returns a row holding match_api_id and, when one of the three
    comparisons holds, label.
    '''
    #Goals scored by each side
    home = match['home_team_goal']
    away = match['away_team_goal']

    #One-row frame that carries the result
    frame = pd.DataFrame()
    frame.loc[0, 'match_api_id'] = match['match_api_id']

    #Pick the outcome; stays None when no comparison is true
    if home > away:
        outcome = "Win"
    elif home == away:
        outcome = "Draw"
    elif home < away:
        outcome = "Defeat"
    else:
        outcome = None

    if outcome is not None:
        frame.loc[0, 'label'] = outcome

    return frame.loc[0]
def get_overall_fifa_rankings(fifa, get_overall = False):
    ''' Get overall fifa rankings from fifa data.

    Parameters:
        fifa (pd.DataFrame): fifa data with a match_api_id column.
        get_overall (bool): when true, keep only the columns whose name
            contains 'overall_rating' (plus match_api_id); otherwise keep
            everything except the columns whose name contains 'date_stat'.
    Returns:
        pd.DataFrame: a new frame; fifa itself is not modified.
    '''
    #Check if only overall player stats are desired
    if get_overall:

        #Get overall stats; copy so the assignment below targets an
        #independent frame rather than a slice of fifa
        data = fifa.loc[:, fifa.columns.str.contains('overall_rating')].copy()
        data['match_api_id'] = fifa['match_api_id']
    else:

        #Get all stats except for stat date
        date_cols = fifa.columns[fifa.columns.str.contains('date_stat')]
        data = fifa.drop(date_cols, axis = 1)

    #Return data
    return data
def get_last_matches(matches, date, team, x = 10):
    ''' Get the last x matches of a given team.

    Keeps the rows where team is either the home or the away side, drops
    everything not dated strictly before date, and returns at most x rows
    ordered from most recent to oldest.
    '''
    #Rows in which the team takes part, on either side
    involves_team = (matches['home_team_api_id'] == team) | (matches['away_team_api_id'] == team)
    team_matches = matches[involves_team]

    #Earlier matches only, newest first, capped at x rows
    earlier = team_matches[team_matches.date < date]
    newest_first = earlier.sort_values(by = 'date', ascending = False)
    return newest_first.iloc[:x, :]
def get_last_matches_against_eachother(matches, date, home_team, away_team, x = 10):
    ''' Get the last x matches of two given teams against each other.

    Both fixtures (home_team hosting away_team and the reverse) are
    considered. Only matches dated strictly before date are kept, ordered
    from most recent to oldest and capped at x rows. If x cannot be used as
    a slice bound (TypeError), every earlier meeting is returned instead.
    '''
    #Find matches of both teams
    home_matches = matches[(matches['home_team_api_id'] == home_team) & (matches['away_team_api_id'] == away_team)]
    away_matches = matches[(matches['home_team_api_id'] == away_team) & (matches['away_team_api_id'] == home_team)]
    total_matches = pd.concat([home_matches, away_matches])

    #Earlier meetings, newest first (computed once for both code paths)
    earlier = total_matches[total_matches.date < date].sort_values(by = 'date', ascending = False)

    #Get last x matches
    try:
        last_matches = earlier.iloc[0:x,:]
    except TypeError:
        #Only an unusable slice bound is tolerated; other errors propagate
        last_matches = earlier

    #Check for error in data
    if last_matches.shape[0] > x:
        print("Error in obtaining matches")

    #Return data
    return last_matches
def get_goals(matches, team):
    ''' Get the goals of a specific team from a set of matches.

    Adds the home goals of the matches the team hosted to the away goals
    of the matches it played away, and returns the total as an int.
    '''
    #Goals scored while playing at home
    home_column = matches.home_team_goal
    scored_at_home = home_column[matches.home_team_api_id == team].sum()

    #Goals scored while playing away
    away_column = matches.away_team_goal
    scored_away = away_column[matches.away_team_api_id == team].sum()

    return int(scored_at_home) + int(scored_away)
def get_goals_conceided(matches, team):
    ''' Get the goals conceded by a specific team in a set of matches.

    Adds the home-side goals of the matches the team played away to the
    away-side goals of the matches it hosted, and returns the total as an int.
    '''
    #Goals let in while playing away (scored by the hosting side)
    home_column = matches.home_team_goal
    conceded_away = home_column[matches.away_team_api_id == team].sum()

    #Goals let in while playing at home (scored by the visiting side)
    away_column = matches.away_team_goal
    conceded_at_home = away_column[matches.home_team_api_id == team].sum()

    return int(conceded_away) + int(conceded_at_home)
def get_wins(matches, team):
    ''' Get the number of wins of a specific team from a set of matches.

    Counts the matches the team hosted and won plus the matches it played
    away and won, and returns the total as an int.
    '''
    home_goals = matches.home_team_goal
    away_goals = matches.away_team_goal

    #Masks for victories on each side of the fixture
    won_at_home = (matches.home_team_api_id == team) & (home_goals > away_goals)
    won_away = (matches.away_team_api_id == team) & (away_goals > home_goals)

    #Count the matching rows
    return int(home_goals[won_at_home].count()) + int(away_goals[won_away].count())
def get_match_features(match, matches, x = 10):
    ''' Create match specific features for a given match.

    Parameters:
        match: one match row exposing date, home_team_api_id,
            away_team_api_id, match_api_id, league_id and season.
        matches (pd.DataFrame): all matches, used as history.
        x (int): number of previous matches per team used for the form
            features (goal difference, games won). The head-to-head
            features always use the last 3 meetings.
    Returns:
        A single row with match_api_id, league_id, the goal-difference and
        games-won features for both teams, the head-to-head won/lost counts
        and the season's starting year.
    '''
    #Define variables
    date = match.date
    home_team = match.home_team_api_id
    away_team = match.away_team_api_id

    #Get last x matches of home and away team (honours the x argument)
    matches_home_team = get_last_matches(matches, date, home_team, x = x)
    matches_away_team = get_last_matches(matches, date, away_team, x = x)

    #Get last 3 matches of both teams against each other
    last_matches_against = get_last_matches_against_eachother(matches, date, home_team, away_team, x = 3)

    #Create goal variables
    home_goals = get_goals(matches_home_team, home_team)
    away_goals = get_goals(matches_away_team, away_team)
    home_goals_conceided = get_goals_conceided(matches_home_team, home_team)
    away_goals_conceided = get_goals_conceided(matches_away_team, away_team)

    #Define result data frame
    result = pd.DataFrame()

    #Define ID features
    result.loc[0, 'match_api_id'] = match.match_api_id
    result.loc[0, 'league_id'] = match.league_id

    #Create match features
    result.loc[0, 'home_team_goals_difference'] = home_goals - home_goals_conceided
    result.loc[0, 'away_team_goals_difference'] = away_goals - away_goals_conceided
    result.loc[0, 'games_won_home_team'] = get_wins(matches_home_team, home_team)
    result.loc[0, 'games_won_away_team'] = get_wins(matches_away_team, away_team)
    result.loc[0, 'games_against_won'] = get_wins(last_matches_against, home_team)
    result.loc[0, 'games_against_lost'] = get_wins(last_matches_against, away_team)

    #Add season (first year of a "YYYY/YYYY" string)
    result.loc[0, 'season'] = int(match['season'].split('/')[0])

    #Return match features
    return result.loc[0]
def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal = True, x = 10, all_leagues = True, verbose = True):
    ''' Create and aggregate features and labels for all matches.

    Parameters:
        matches (pd.DataFrame): match data.
        fifa (pd.DataFrame): fifa data, passed to get_overall_fifa_rankings.
        bookkeepers (list): bookkeeper names, passed to get_bookkeeper_data.
        get_overall (bool): forwarded to get_overall_fifa_rankings.
        horizontal (bool): accepted but not forwarded, see the note below.
        x (int): history window forwarded to get_match_features.
        all_leagues (bool): when true, add League_<id> dummy columns.
        verbose (bool): when true, print progress messages.
    Returns:
        pd.DataFrame: match, fifa and bookkeeper features merged with the
        labels on match_api_id, with rows containing NA dropped.
    '''
    #Get fifa stats features
    fifa_stats = get_overall_fifa_rankings(fifa, get_overall)

    if verbose:
        print("Generating match features...")

    #Get match features for all matches; the row argument is named
    #separately so that the window size x is the one passed by the caller
    match_stats = matches.apply(lambda row: get_match_features(row, matches, x = x), axis = 1)

    #Create dummies for league ID feature
    if all_leagues:
        dummies = pd.get_dummies(match_stats['league_id']).rename(columns = lambda league: 'League_' + str(league))
        match_stats = pd.concat([match_stats, dummies], axis = 1)
    #NOTE(review): league_id is dropped here whether or not all_leagues is
    #set; the reviewed source had lost its indentation, so confirm that the
    #drop was not meant to sit inside the if above
    match_stats.drop(['league_id'], inplace = True, axis = 1)

    if verbose:
        print("Generating match labels...")

    #Create match labels
    labels = matches.apply(get_match_label, axis = 1)

    if verbose:
        print("Generating bookkeeper data...")

    #Get bookkeeper quotas for all matches
    #NOTE(review): the horizontal argument is not forwarded; the merge on
    #match_api_id below presumably needs the one-row-per-match layout, so
    #confirm before wiring it through
    bk_data = get_bookkeeper_data(matches, bookkeepers, horizontal = True)
    bk_data['match_api_id'] = matches['match_api_id']

    #Merges features and labels into one frame
    features = pd.merge(match_stats, fifa_stats, on = 'match_api_id', how = 'left')
    features = pd.merge(features, bk_data, on = 'match_api_id', how = 'left')
    feables = pd.merge(features, labels, on = 'match_api_id', how = 'left')

    #Drop NA values
    feables.dropna(inplace = True)

    #Return preprocessed data
    return feables
def convert_odds_to_prob(match_odds):
    ''' Converts bookkeeper odds to probabilities.

    Each of the Win, Draw and Defeat odds is inverted and the three inverses
    are divided by their sum, so the three results add up to 1 per row.
    The match_api_id and bookkeeper columns are carried over unchanged.
    '''
    outcomes = ('Win', 'Draw', 'Defeat')

    #Meta data that travels along with the probabilities
    ids = match_odds.loc[:, 'match_api_id']
    source = match_odds.loc[:, 'bookkeeper']

    #Inverse odds per outcome and their row-wise total
    inverse = {outcome: 1 / match_odds.loc[:, outcome] for outcome in outcomes}
    normaliser = inverse['Win'] + inverse['Draw'] + inverse['Defeat']

    #Assemble the output, meta data first, then the scaled probabilities
    probs = pd.DataFrame()
    probs['match_api_id'] = ids
    probs['bookkeeper'] = source
    for outcome in outcomes:
        probs[outcome] = inverse[outcome] / normaliser

    return probs
def get_bookkeeper_data(matches, bookkeepers, horizontal = True):
    ''' Aggregates bookkeeper data for all matches and bookkeepers.

    Parameters:
        matches (pd.DataFrame): match data with a match_api_id column and,
            per bookkeeper, odds columns whose names contain the bookkeeper
            name (matched with str.contains). The first three matching
            columns are taken as Win, Draw and Defeat, in that order.
        bookkeepers (list): bookkeeper names (strings).
        horizontal (bool): when true, odds are converted to probabilities
            and placed side by side as <bookkeeper>_Win/_Draw/_Defeat
            columns, followed by match_api_id. Otherwise the per-bookkeeper
            frames (Win, Draw, Defeat, bookkeeper, match_api_id) are stacked
            vertically with a fresh index.
    Returns:
        pd.DataFrame: aggregated data; empty when bookkeepers is empty.
    '''
    frames = []

    #Loop through bookkeepers
    for bookkeeper in bookkeepers:

        #Find columns containing data of bookkeeper; copy so that the
        #assignments below do not write into a slice of matches
        temp_data = matches.loc[:, matches.columns.str.contains(bookkeeper)].copy()
        temp_data['bookkeeper'] = str(bookkeeper)
        temp_data['match_api_id'] = matches['match_api_id']

        #Rename odds columns (assigning a new label list instead of mutating
        #the column index in place) and convert to numeric
        cols = list(temp_data.columns)
        cols[:3] = ['Win', 'Draw', 'Defeat']
        temp_data.columns = cols
        for outcome in ('Win', 'Draw', 'Defeat'):
            temp_data[outcome] = pd.to_numeric(temp_data[outcome])

        #Check if data should be aggregated horizontally
        if horizontal:

            #Convert data to probs
            temp_data = convert_odds_to_prob(temp_data)
            temp_data = temp_data.drop(['match_api_id', 'bookkeeper'], axis = 1)

            #Rename columns with bookkeeper names
            prefixed = list(temp_data.columns)
            prefixed[:3] = [bookkeeper + "_" + "Win",
                            bookkeeper + "_" + "Draw",
                            bookkeeper + "_" + "Defeat"]
            temp_data.columns = prefixed

        frames.append(temp_data)

    if not frames:
        return pd.DataFrame()

    if horizontal:
        #Aggregate side by side and add match api id to the result itself
        bk_data = pd.concat(frames, axis = 1)
        bk_data['match_api_id'] = matches['match_api_id']
    else:
        #Aggregate vertically (pd.concat replaces the removed DataFrame.append)
        bk_data = pd.concat(frames, ignore_index = True)

    #Return bookkeeper data
    return bk_data
def get_bookkeeper_probs(matches, bookkeepers, horizontal = False):
    ''' Get bookkeeper data and convert to probabilities for vertical aggregation.

    The bookkeeper data is always requested in the vertical layout; the
    horizontal argument is accepted but not used by this function.
    '''
    #Vertically stacked odds, one block of rows per bookkeeper
    stacked_odds = get_bookkeeper_data(matches, bookkeepers, horizontal = False)

    #Odds turned into probabilities
    return convert_odds_to_prob(stacked_odds)