diff --git a/experiments/03_football.ipynb b/experiments/03_football.ipynb
index 1cc821d..93d638b 100644
--- a/experiments/03_football.ipynb
+++ b/experiments/03_football.ipynb
@@ -25,16 +25,10 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "System version: 3.6.0 |Anaconda 4.3.1 (64-bit)| (default, Dec 23 2016, 12:22:00) \n",
- "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/hoaphumanoid/anaconda3/envs/strata/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
- " \"This module will be removed in 0.20.\", DeprecationWarning)\n"
+ "System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul 2 2016, 17:53:06) \n",
+ "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n",
+ "XGBoost version: 0.6\n",
+ "LightGBM version: 0.2\n"
]
}
],
@@ -61,8 +55,10 @@
"from lightgbm import LGBMClassifier\n",
"from libs.loaders import load_football\n",
"from libs.football import get_fifa_data, create_feables\n",
+ "import pickle\n",
"import pkg_resources\n",
"\n",
+ "os.environ['MOUNT_POINT'] = '/strata'\n",
"print(\"System version: {}\".format(sys.version))\n",
"print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n",
"print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))"
@@ -81,14 +77,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "MOUNT_POINT not found in environment. Defaulting to /fileshare\n",
"(11, 2)\n",
"(25979, 115)\n",
"(11, 3)\n",
"(299, 5)\n",
"(183978, 42)\n",
- "CPU times: user 3.91 s, sys: 412 ms, total: 4.32 s\n",
- "Wall time: 4.34 s\n"
+ "CPU times: user 4.73 s, sys: 728 ms, total: 5.46 s\n",
+ "Wall time: 19.2 s\n"
]
}
],
@@ -421,7 +416,7 @@
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 5,
"metadata": {
"collapsed": false,
"deletable": true,
@@ -610,7 +605,7 @@
"[5 rows x 115 columns]"
]
},
- "execution_count": 68,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -619,17 +614,6 @@
"matches.tail()"
]
},
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "collapsed": false,
- "deletable": true,
- "editable": true
- },
- "outputs": [],
- "source": []
- },
{
"cell_type": "code",
"execution_count": 6,
@@ -656,13 +640,12 @@
" \"away_player_2\", \"away_player_3\", \"away_player_4\", \"away_player_5\", \"away_player_6\",\n",
" \"away_player_7\", \"away_player_8\", \"away_player_9\", \"away_player_10\", \"away_player_11\"]\n",
"match_data = matches.dropna(subset = rows)\n",
- "print(match_data.shape)\n",
- "#match_data = match_data.tail(1500)"
+ "print(match_data.shape)\n"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
@@ -674,21 +657,25 @@
"output_type": "stream",
"text": [
"(21374, 23)\n",
- "CPU times: user 33min 8s, sys: 4.06 s, total: 33min 12s\n",
- "Wall time: 32min 42s\n"
+ "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
+ "Wall time: 2.94 ms\n"
]
}
],
"source": [
"%%time\n",
- "fifa_data = get_fifa_data(match_data, players)\n",
- "print(fifa_data.shape)\n",
- "fifa_data.head()"
+ "fifa_data_filename = 'fifa_data.pk'\n",
+ "if os.path.isfile(fifa_data_filename):\n",
+ " fifa_data = pd.read_pickle(fifa_data_filename)\n",
+ "else:\n",
+ " fifa_data = get_fifa_data(match_data, players)\n",
+ " fifa_data.to_pickle(fifa_data_filename)\n",
+ "print(fifa_data.shape)\n"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
@@ -699,39 +686,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Generating match features...\n",
- "Match features generated in 13.0 minutes\n",
- "Generating match labels...\n",
- "Match labels generated in 1.2 minutes\n",
- "Generating bookkeeper data...\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/hoaphumanoid/anaconda3/envs/strata/lib/python3.6/site-packages/pandas/core/indexing.py:297: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
- " self.obj[key] = _infer_fill_value(value)\n",
- "/home/hoaphumanoid/anaconda3/envs/strata/lib/python3.6/site-packages/pandas/core/indexing.py:477: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
- " self.obj[item] = s\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Bookkeeper data generated in 0.0 minutes\n",
- "(19673, 47)\n",
- "CPU times: user 14min 27s, sys: 5.35 s, total: 14min 32s\n",
- "Wall time: 14min 14s\n"
+ "Generating match features...\n"
]
}
],
@@ -745,243 +700,26 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " match_api_id | \n",
- " home_team_goals_difference | \n",
- " away_team_goals_difference | \n",
- " games_won_home_team | \n",
- " games_won_away_team | \n",
- " games_against_won | \n",
- " games_against_lost | \n",
- " League_1.0 | \n",
- " League_1729.0 | \n",
- " League_4769.0 | \n",
- " ... | \n",
- " away_player_9_overall_rating | \n",
- " away_player_10_overall_rating | \n",
- " away_player_11_overall_rating | \n",
- " B365_Win | \n",
- " B365_Draw | \n",
- " B365_Defeat | \n",
- " BW_Win | \n",
- " BW_Draw | \n",
- " BW_Defeat | \n",
- " label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 493017.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " ... | \n",
- " 70.0 | \n",
- " 68.0 | \n",
- " 63.0 | \n",
- " 0.313804 | \n",
- " 0.276886 | \n",
- " 0.409310 | \n",
- " 0.307825 | \n",
- " 0.279410 | \n",
- " 0.412765 | \n",
- " Win | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 493025.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " ... | \n",
- " 67.0 | \n",
- " 73.0 | \n",
- " 68.0 | \n",
- " 0.327179 | \n",
- " 0.286281 | \n",
- " 0.386540 | \n",
- " 0.290493 | \n",
- " 0.300176 | \n",
- " 0.409331 | \n",
- " Defeat | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 493027.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " ... | \n",
- " 55.0 | \n",
- " 58.0 | \n",
- " 64.0 | \n",
- " 0.672897 | \n",
- " 0.209346 | \n",
- " 0.117757 | \n",
- " 0.672269 | \n",
- " 0.226891 | \n",
- " 0.100840 | \n",
- " Win | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 493034.0 | \n",
- " 1.0 | \n",
- " 2.0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " ... | \n",
- " 74.0 | \n",
- " 70.0 | \n",
- " 69.0 | \n",
- " 0.207407 | \n",
- " 0.259259 | \n",
- " 0.533333 | \n",
- " 0.192717 | \n",
- " 0.274476 | \n",
- " 0.532807 | \n",
- " Win | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 493040.0 | \n",
- " -2.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " ... | \n",
- " 60.0 | \n",
- " 63.0 | \n",
- " 65.0 | \n",
- " 0.535211 | \n",
- " 0.267606 | \n",
- " 0.197183 | \n",
- " 0.565759 | \n",
- " 0.254990 | \n",
- " 0.179250 | \n",
- " Draw | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 47 columns
\n",
- "
"
- ],
- "text/plain": [
- " match_api_id home_team_goals_difference away_team_goals_difference \\\n",
- "0 493017.0 0.0 0.0 \n",
- "1 493025.0 0.0 0.0 \n",
- "2 493027.0 0.0 0.0 \n",
- "3 493034.0 1.0 2.0 \n",
- "4 493040.0 -2.0 0.0 \n",
- "\n",
- " games_won_home_team games_won_away_team games_against_won \\\n",
- "0 0.0 0.0 0.0 \n",
- "1 0.0 0.0 0.0 \n",
- "2 0.0 0.0 0.0 \n",
- "3 1.0 1.0 0.0 \n",
- "4 0.0 0.0 0.0 \n",
- "\n",
- " games_against_lost League_1.0 League_1729.0 League_4769.0 ... \\\n",
- "0 0.0 1 0 0 ... \n",
- "1 0.0 1 0 0 ... \n",
- "2 0.0 1 0 0 ... \n",
- "3 0.0 1 0 0 ... \n",
- "4 0.0 1 0 0 ... \n",
- "\n",
- " away_player_9_overall_rating away_player_10_overall_rating \\\n",
- "0 70.0 68.0 \n",
- "1 67.0 73.0 \n",
- "2 55.0 58.0 \n",
- "3 74.0 70.0 \n",
- "4 60.0 63.0 \n",
- "\n",
- " away_player_11_overall_rating B365_Win B365_Draw B365_Defeat BW_Win \\\n",
- "0 63.0 0.313804 0.276886 0.409310 0.307825 \n",
- "1 68.0 0.327179 0.286281 0.386540 0.290493 \n",
- "2 64.0 0.672897 0.209346 0.117757 0.672269 \n",
- "3 69.0 0.207407 0.259259 0.533333 0.192717 \n",
- "4 65.0 0.535211 0.267606 0.197183 0.565759 \n",
- "\n",
- " BW_Draw BW_Defeat label \n",
- "0 0.279410 0.412765 Win \n",
- "1 0.300176 0.409331 Defeat \n",
- "2 0.226891 0.100840 Win \n",
- "3 0.274476 0.532807 Win \n",
- "4 0.254990 0.179250 Draw \n",
- "\n",
- "[5 rows x 47 columns]"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"feables.head()"
]
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(19673, 45)\n",
- "(19673,)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"features = feables[feables.columns.difference(['match_api_id', 'label'])]\n",
"labs = feables['label']\n",
@@ -991,24 +729,13 @@
},
{
"cell_type": "code",
- "execution_count": 43,
+ "execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(12590, 45)\n",
- "(3148, 45)\n",
- "(3935, 45)\n",
- "19673\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(features, labs, test_size=0.2, random_state=42, stratify=labs)\n",
"X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)\n",
@@ -1021,7 +748,7 @@
},
{
"cell_type": "code",
- "execution_count": 59,
+ "execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
@@ -1039,136 +766,13 @@
},
{
"cell_type": "code",
- "execution_count": 60,
+ "execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[0]\tvalidation_0-merror:0.492058\n",
- "[1]\tvalidation_0-merror:0.487611\n",
- "[2]\tvalidation_0-merror:0.484435\n",
- "[3]\tvalidation_0-merror:0.484435\n",
- "[4]\tvalidation_0-merror:0.481576\n",
- "[5]\tvalidation_0-merror:0.478399\n",
- "[6]\tvalidation_0-merror:0.478399\n",
- "[7]\tvalidation_0-merror:0.48094\n",
- "[8]\tvalidation_0-merror:0.481576\n",
- "[9]\tvalidation_0-merror:0.484117\n",
- "[10]\tvalidation_0-merror:0.484752\n",
- "[11]\tvalidation_0-merror:0.483164\n",
- "[12]\tvalidation_0-merror:0.482529\n",
- "[13]\tvalidation_0-merror:0.482529\n",
- "[14]\tvalidation_0-merror:0.483164\n",
- "[15]\tvalidation_0-merror:0.483799\n",
- "[16]\tvalidation_0-merror:0.482529\n",
- "[17]\tvalidation_0-merror:0.481576\n",
- "[18]\tvalidation_0-merror:0.482211\n",
- "[19]\tvalidation_0-merror:0.480623\n",
- "[20]\tvalidation_0-merror:0.483482\n",
- "[21]\tvalidation_0-merror:0.484435\n",
- "[22]\tvalidation_0-merror:0.485388\n",
- "[23]\tvalidation_0-merror:0.483799\n",
- "[24]\tvalidation_0-merror:0.483799\n",
- "[25]\tvalidation_0-merror:0.483482\n",
- "[26]\tvalidation_0-merror:0.481258\n",
- "[27]\tvalidation_0-merror:0.481258\n",
- "[28]\tvalidation_0-merror:0.481576\n",
- "[29]\tvalidation_0-merror:0.481893\n",
- "[30]\tvalidation_0-merror:0.482846\n",
- "[31]\tvalidation_0-merror:0.481576\n",
- "[32]\tvalidation_0-merror:0.47967\n",
- "[33]\tvalidation_0-merror:0.482211\n",
- "[34]\tvalidation_0-merror:0.48094\n",
- "[35]\tvalidation_0-merror:0.481576\n",
- "[36]\tvalidation_0-merror:0.481258\n",
- "[37]\tvalidation_0-merror:0.48094\n",
- "[38]\tvalidation_0-merror:0.482529\n",
- "[39]\tvalidation_0-merror:0.483164\n",
- "[40]\tvalidation_0-merror:0.48094\n",
- "[41]\tvalidation_0-merror:0.481576\n",
- "[42]\tvalidation_0-merror:0.481893\n",
- "[43]\tvalidation_0-merror:0.482211\n",
- "[44]\tvalidation_0-merror:0.482211\n",
- "[45]\tvalidation_0-merror:0.481893\n",
- "[46]\tvalidation_0-merror:0.482529\n",
- "[47]\tvalidation_0-merror:0.480305\n",
- "[48]\tvalidation_0-merror:0.47967\n",
- "[49]\tvalidation_0-merror:0.479987\n",
- "[50]\tvalidation_0-merror:0.479352\n",
- "[51]\tvalidation_0-merror:0.47967\n",
- "[52]\tvalidation_0-merror:0.480623\n",
- "[53]\tvalidation_0-merror:0.482211\n",
- "[54]\tvalidation_0-merror:0.482846\n",
- "[55]\tvalidation_0-merror:0.481258\n",
- "[56]\tvalidation_0-merror:0.48094\n",
- "[57]\tvalidation_0-merror:0.482529\n",
- "[58]\tvalidation_0-merror:0.482211\n",
- "[59]\tvalidation_0-merror:0.483799\n",
- "[60]\tvalidation_0-merror:0.484117\n",
- "[61]\tvalidation_0-merror:0.485705\n",
- "[62]\tvalidation_0-merror:0.487611\n",
- "[63]\tvalidation_0-merror:0.487294\n",
- "[64]\tvalidation_0-merror:0.486976\n",
- "[65]\tvalidation_0-merror:0.488247\n",
- "[66]\tvalidation_0-merror:0.487294\n",
- "[67]\tvalidation_0-merror:0.487929\n",
- "[68]\tvalidation_0-merror:0.488247\n",
- "[69]\tvalidation_0-merror:0.488882\n",
- "[70]\tvalidation_0-merror:0.487611\n",
- "[71]\tvalidation_0-merror:0.487611\n",
- "[72]\tvalidation_0-merror:0.487611\n",
- "[73]\tvalidation_0-merror:0.487611\n",
- "[74]\tvalidation_0-merror:0.487294\n",
- "[75]\tvalidation_0-merror:0.487294\n",
- "[76]\tvalidation_0-merror:0.486976\n",
- "[77]\tvalidation_0-merror:0.488247\n",
- "[78]\tvalidation_0-merror:0.487294\n",
- "[79]\tvalidation_0-merror:0.486658\n",
- "[80]\tvalidation_0-merror:0.486341\n",
- "[81]\tvalidation_0-merror:0.486341\n",
- "[82]\tvalidation_0-merror:0.486023\n",
- "[83]\tvalidation_0-merror:0.485705\n",
- "[84]\tvalidation_0-merror:0.485705\n",
- "[85]\tvalidation_0-merror:0.487611\n",
- "[86]\tvalidation_0-merror:0.486341\n",
- "[87]\tvalidation_0-merror:0.48507\n",
- "[88]\tvalidation_0-merror:0.484752\n",
- "[89]\tvalidation_0-merror:0.48507\n",
- "[90]\tvalidation_0-merror:0.485388\n",
- "[91]\tvalidation_0-merror:0.484117\n",
- "[92]\tvalidation_0-merror:0.483799\n",
- "[93]\tvalidation_0-merror:0.486976\n",
- "[94]\tvalidation_0-merror:0.486341\n",
- "[95]\tvalidation_0-merror:0.486023\n",
- "[96]\tvalidation_0-merror:0.486023\n",
- "[97]\tvalidation_0-merror:0.485388\n",
- "[98]\tvalidation_0-merror:0.484435\n",
- "[99]\tvalidation_0-merror:0.485705\n",
- "CPU times: user 19.4 s, sys: 12.7 s, total: 32.2 s\n",
- "Wall time: 1.55 s\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,\n",
- " gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=8,\n",
- " min_child_weight=5, missing=None, n_estimators=100, nthread=-1,\n",
- " objective='multi:softprob', reg_alpha=0, reg_lambda=1,\n",
- " scale_pos_weight=2, seed=0, silent=True, subsample=1)"
- ]
- },
- "execution_count": 60,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"%%time\n",
"clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='merror')"
@@ -1176,22 +780,13 @@
},
{
"cell_type": "code",
- "execution_count": 61,
+ "execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Score of XGBClassifier for training set: 0.7590.\n",
- "Score of XGBClassifier for test set: 0.5225.\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"y_pred = clf.predict(X_test)\n",
"print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))\n",
@@ -1200,7 +795,7 @@
},
{
"cell_type": "code",
- "execution_count": 62,
+ "execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
@@ -1221,30 +816,13 @@
},
{
"cell_type": "code",
- "execution_count": 63,
+ "execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'Accuracy': 0.52249047013977123,\n",
- " 'Confusion Matrix': array([[1452, 88, 267],\n",
- " [ 650, 65, 278],\n",
- " [ 532, 64, 539]]),\n",
- " 'F1': 0.46751804585279649,\n",
- " 'Precision': 0.472151244629816,\n",
- " 'Recall': 0.52249047013977123}"
- ]
- },
- "execution_count": 63,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"labels = [\"Win\", \"Draw\", \"Defeat\"]\n",
"report = classification_metrics_multilabel(y_test, y_pred, labels)\n",
@@ -1253,7 +831,7 @@
},
{
"cell_type": "code",
- "execution_count": 64,
+ "execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
@@ -1271,139 +849,13 @@
},
{
"cell_type": "code",
- "execution_count": 65,
+ "execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[1]\tvalid_0's multi_error: 0.520013\n",
- "[2]\tvalid_0's multi_error: 0.524778\n",
- "[3]\tvalid_0's multi_error: 0.513342\n",
- "[4]\tvalid_0's multi_error: 0.511118\n",
- "[5]\tvalid_0's multi_error: 0.5054\n",
- "[6]\tvalid_0's multi_error: 0.50413\n",
- "[7]\tvalid_0's multi_error: 0.499682\n",
- "[8]\tvalid_0's multi_error: 0.506353\n",
- "[9]\tvalid_0's multi_error: 0.505083\n",
- "[10]\tvalid_0's multi_error: 0.506671\n",
- "[11]\tvalid_0's multi_error: 0.506036\n",
- "[12]\tvalid_0's multi_error: 0.499365\n",
- "[13]\tvalid_0's multi_error: 0.499682\n",
- "[14]\tvalid_0's multi_error: 0.501271\n",
- "[15]\tvalid_0's multi_error: 0.496506\n",
- "[16]\tvalid_0's multi_error: 0.499047\n",
- "[17]\tvalid_0's multi_error: 0.498729\n",
- "[18]\tvalid_0's multi_error: 0.499682\n",
- "[19]\tvalid_0's multi_error: 0.499682\n",
- "[20]\tvalid_0's multi_error: 0.498094\n",
- "[21]\tvalid_0's multi_error: 0.498094\n",
- "[22]\tvalid_0's multi_error: 0.499682\n",
- "[23]\tvalid_0's multi_error: 0.497141\n",
- "[24]\tvalid_0's multi_error: 0.498412\n",
- "[25]\tvalid_0's multi_error: 0.499365\n",
- "[26]\tvalid_0's multi_error: 0.503494\n",
- "[27]\tvalid_0's multi_error: 0.502541\n",
- "[28]\tvalid_0's multi_error: 0.501588\n",
- "[29]\tvalid_0's multi_error: 0.499047\n",
- "[30]\tvalid_0's multi_error: 0.500953\n",
- "[31]\tvalid_0's multi_error: 0.501906\n",
- "[32]\tvalid_0's multi_error: 0.502224\n",
- "[33]\tvalid_0's multi_error: 0.500953\n",
- "[34]\tvalid_0's multi_error: 0.500635\n",
- "[35]\tvalid_0's multi_error: 0.502224\n",
- "[36]\tvalid_0's multi_error: 0.502859\n",
- "[37]\tvalid_0's multi_error: 0.500953\n",
- "[38]\tvalid_0's multi_error: 0.5054\n",
- "[39]\tvalid_0's multi_error: 0.506671\n",
- "[40]\tvalid_0's multi_error: 0.507306\n",
- "[41]\tvalid_0's multi_error: 0.5054\n",
- "[42]\tvalid_0's multi_error: 0.506671\n",
- "[43]\tvalid_0's multi_error: 0.50413\n",
- "[44]\tvalid_0's multi_error: 0.503494\n",
- "[45]\tvalid_0's multi_error: 0.502859\n",
- "[46]\tvalid_0's multi_error: 0.5054\n",
- "[47]\tvalid_0's multi_error: 0.506353\n",
- "[48]\tvalid_0's multi_error: 0.505718\n",
- "[49]\tvalid_0's multi_error: 0.506671\n",
- "[50]\tvalid_0's multi_error: 0.506671\n",
- "[51]\tvalid_0's multi_error: 0.505718\n",
- "[52]\tvalid_0's multi_error: 0.50413\n",
- "[53]\tvalid_0's multi_error: 0.504447\n",
- "[54]\tvalid_0's multi_error: 0.50413\n",
- "[55]\tvalid_0's multi_error: 0.502541\n",
- "[56]\tvalid_0's multi_error: 0.502224\n",
- "[57]\tvalid_0's multi_error: 0.500953\n",
- "[58]\tvalid_0's multi_error: 0.501588\n",
- "[59]\tvalid_0's multi_error: 0.50413\n",
- "[60]\tvalid_0's multi_error: 0.502859\n",
- "[61]\tvalid_0's multi_error: 0.504765\n",
- "[62]\tvalid_0's multi_error: 0.50413\n",
- "[63]\tvalid_0's multi_error: 0.50413\n",
- "[64]\tvalid_0's multi_error: 0.502541\n",
- "[65]\tvalid_0's multi_error: 0.501271\n",
- "[66]\tvalid_0's multi_error: 0.5\n",
- "[67]\tvalid_0's multi_error: 0.500635\n",
- "[68]\tvalid_0's multi_error: 0.499047\n",
- "[69]\tvalid_0's multi_error: 0.499682\n",
- "[70]\tvalid_0's multi_error: 0.498729\n",
- "[71]\tvalid_0's multi_error: 0.498412\n",
- "[72]\tvalid_0's multi_error: 0.497141\n",
- "[73]\tvalid_0's multi_error: 0.499682\n",
- "[74]\tvalid_0's multi_error: 0.499682\n",
- "[75]\tvalid_0's multi_error: 0.498729\n",
- "[76]\tvalid_0's multi_error: 0.5\n",
- "[77]\tvalid_0's multi_error: 0.500318\n",
- "[78]\tvalid_0's multi_error: 0.499047\n",
- "[79]\tvalid_0's multi_error: 0.499682\n",
- "[80]\tvalid_0's multi_error: 0.499682\n",
- "[81]\tvalid_0's multi_error: 0.499682\n",
- "[82]\tvalid_0's multi_error: 0.500635\n",
- "[83]\tvalid_0's multi_error: 0.497459\n",
- "[84]\tvalid_0's multi_error: 0.496823\n",
- "[85]\tvalid_0's multi_error: 0.496506\n",
- "[86]\tvalid_0's multi_error: 0.5\n",
- "[87]\tvalid_0's multi_error: 0.498094\n",
- "[88]\tvalid_0's multi_error: 0.498729\n",
- "[89]\tvalid_0's multi_error: 0.497776\n",
- "[90]\tvalid_0's multi_error: 0.497459\n",
- "[91]\tvalid_0's multi_error: 0.495553\n",
- "[92]\tvalid_0's multi_error: 0.497459\n",
- "[93]\tvalid_0's multi_error: 0.498729\n",
- "[94]\tvalid_0's multi_error: 0.498412\n",
- "[95]\tvalid_0's multi_error: 0.497776\n",
- "[96]\tvalid_0's multi_error: 0.499365\n",
- "[97]\tvalid_0's multi_error: 0.497459\n",
- "[98]\tvalid_0's multi_error: 0.497776\n",
- "[99]\tvalid_0's multi_error: 0.49587\n",
- "[100]\tvalid_0's multi_error: 0.497141\n",
- "CPU times: user 40.5 s, sys: 54.7 s, total: 1min 35s\n",
- "Wall time: 4.7 s\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,\n",
- " is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,\n",
- " max_drop=50, min_child_samples=10, min_child_weight=5,\n",
- " min_split_gain=0, n_estimators=100, nthread=-1, num_leaves=255,\n",
- " objective='multiclass', reg_alpha=0, reg_lambda=0,\n",
- " scale_pos_weight=2, seed=0, sigmoid=1.0, silent=True,\n",
- " skip_drop=0.5, subsample=1, subsample_for_bin=50000,\n",
- " subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)"
- ]
- },
- "execution_count": 65,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"%%time\n",
"clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='multi_error')"
@@ -1411,22 +863,13 @@
},
{
"cell_type": "code",
- "execution_count": 66,
+ "execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Score of LGBMClassifier for training set: 0.9998.\n",
- "Score of LGBMClassifier for test set: 0.5116.\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"y_pred = clf.predict(X_test)\n",
"print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))\n",
@@ -1435,30 +878,13 @@
},
{
"cell_type": "code",
- "execution_count": 67,
+ "execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'Accuracy': 0.51156289707750957,\n",
- " 'Confusion Matrix': array([[1387, 147, 273],\n",
- " [ 623, 109, 261],\n",
- " [ 503, 115, 517]]),\n",
- " 'F1': 0.47163912122565715,\n",
- " 'Precision': 0.4694795495800122,\n",
- " 'Recall': 0.51156289707750957}"
- ]
- },
- "execution_count": 67,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"report = classification_metrics_multilabel(y_test, y_pred, labels)\n",
"report"
@@ -1479,7 +905,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true,
+ "collapsed": false,
"deletable": true,
"editable": true
},
@@ -1496,7 +922,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
@@ -1514,7 +940,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
@@ -1537,7 +963,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
@@ -1549,14 +975,14 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
"%%time\n",
"bk_cols = ['B365', 'BW', 'IW', 'LB', 'PS', 'WH', 'SJ', 'VC', 'GB', 'BS']\n",
"bk_cols_selected = ['B365', 'BW'] \n",
- "feables = create_feables(match_data, fifa_data, bk_cols_selected, get_overall = True)\n",
+ "feables = create_feables(match_data, fifa_data, bk_cols_selected, get_overall = True, all_leagues = False)\n",
"print(feables.shape)\n",
"feables.head()"
]
@@ -1565,15 +991,15 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
- "feables_2015_2016 = feables[feables['season'] == '2015/2016']\n",
+ "feables_2015_2016 = feables[feables['season'] == 2015]\n",
"print(feables_2015_2016.shape)\n",
- "feables_2014_2015 = feables[feables['season'] == '2014/2015']\n",
+ "feables_2014_2015 = feables[feables['season'] == 2014]\n",
"print(feables_2014_2015.shape)\n",
- "feables_rest = feables[(feables['season'] != '2014/2015') & (feables['season'] != '2015/2016')]\n",
+ "feables_rest = feables[(feables['season'] != 2014) & (feables['season'] != 2015)]\n",
"print(feables_rest.shape)"
]
},
@@ -1581,7 +1007,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
@@ -1601,11 +1027,11 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
- "feables_up_to_2014_2015 = feables[feables['season'] != '2015/2016']\n",
+ "feables_up_to_2014_2015 = feables[feables['season'] != 2015]\n",
"print(feables_up_to_2014_2015.shape)\n",
"X_train2 = feables_up_to_2014_2015[feables_up_to_2014_2015.columns.difference(['match_api_id', 'label', 'season'])]\n",
"y_train2 = feables_up_to_2014_2015['label']"
@@ -1619,7 +1045,7 @@
},
"outputs": [],
"source": [
- "clf = XGBClassifier(max_depth=8, \n",
+ "clf1 = XGBClassifier(max_depth=8, \n",
" learning_rate=0.1, \n",
" scale_pos_weight=2,\n",
" min_child_weight=5,\n",
@@ -1631,19 +1057,19 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
"%%time\n",
- "clf.fit(X_train, y_train, verbose=True, eval_metric='merror')"
+ "clf1.fit(X_train, y_train, verbose=True, eval_metric='merror')  # clf1 is an XGBClassifier: XGBoost names its multiclass error metric 'merror' ('multi_error' is the LightGBM name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
@@ -1656,7 +1082,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
@@ -1669,7 +1095,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
@@ -1681,7 +1107,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
@@ -1710,19 +1136,19 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
"%%time\n",
- "clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='multi_error')"
+ "clf.fit(X_train, y_train, verbose=True, eval_metric='multi_error')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
@@ -1735,7 +1161,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
@@ -1748,7 +1174,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
@@ -1760,7 +1186,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
diff --git a/experiments/libs/football.py b/experiments/libs/football.py
index 1af0aef..4a13110 100644
--- a/experiments/libs/football.py
+++ b/experiments/libs/football.py
@@ -202,10 +202,13 @@ def get_match_features(match, matches, x = 10):
result.loc[0, 'games_against_won'] = get_wins(last_matches_against, home_team)
result.loc[0, 'games_against_lost'] = get_wins(last_matches_against, away_team)
+ #Add season
+ result.loc[0, 'season'] = int(match['season'].split('/')[0])
+
#Return match features
return result.loc[0]
-def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal = True, x = 10, verbose = True):
+def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal = True, x = 10, all_leagues = True, verbose = True):
''' Create and aggregate features and labels for all matches. '''
#Get fifa stats features
@@ -214,40 +217,29 @@ def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal =
if verbose == True:
print("Generating match features...")
- start = time()
#Get match features for all matches
match_stats = matches.apply(lambda x: get_match_features(x, matches, x = 10), axis = 1)
#Create dummies for league ID feature
- dummies = pd.get_dummies(match_stats['league_id']).rename(columns = lambda x: 'League_' + str(x))
- match_stats = pd.concat([match_stats, dummies], axis = 1)
- match_stats.drop(['league_id'], inplace = True, axis = 1)
-
- end = time()
- if verbose == True:
- print("Match features generated in {:.1f} minutes".format((end - start)/60))
+ if all_leagues:
+ dummies = pd.get_dummies(match_stats['league_id']).rename(columns = lambda x: 'League_' + str(x))
+ match_stats = pd.concat([match_stats, dummies], axis = 1)
+ match_stats.drop(['league_id'], inplace = True, axis = 1)
+
if verbose == True:
print("Generating match labels...")
- start = time()
#Create match labels
labels = matches.apply(get_match_label, axis = 1)
- end = time()
- if verbose == True:
- print("Match labels generated in {:.1f} minutes".format((end - start)/60))
if verbose == True:
print("Generating bookkeeper data...")
- start = time()
#Get bookkeeper quotas for all matches
bk_data = get_bookkeeper_data(matches, bookkeepers, horizontal = True)
bk_data.loc[:,'match_api_id'] = matches.loc[:,'match_api_id']
- end = time()
- if verbose == True:
- print("Bookkeeper data generated in {:.1f} minutes".format((end - start)/60))
#Merges features and labels into one frame
features = pd.merge(match_stats, fifa_stats, on = 'match_api_id', how = 'left')