diff --git a/experiments/03_football.ipynb b/experiments/03_football.ipynb index 1cc821d..93d638b 100644 --- a/experiments/03_football.ipynb +++ b/experiments/03_football.ipynb @@ -25,16 +25,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "System version: 3.6.0 |Anaconda 4.3.1 (64-bit)| (default, Dec 23 2016, 12:22:00) \n", - "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/hoaphumanoid/anaconda3/envs/strata/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", - " \"This module will be removed in 0.20.\", DeprecationWarning)\n" + "System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul 2 2016, 17:53:06) \n", + "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n", + "XGBoost version: 0.6\n", + "LightGBM version: 0.2\n" ] } ], @@ -61,8 +55,10 @@ "from lightgbm import LGBMClassifier\n", "from libs.loaders import load_football\n", "from libs.football import get_fifa_data, create_feables\n", + "import pickle\n", "import pkg_resources\n", "\n", + "os.environ['MOUNT_POINT'] = '/strata'\n", "print(\"System version: {}\".format(sys.version))\n", "print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n", "print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))" @@ -81,14 +77,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "MOUNT_POINT not found in environment. Defaulting to /fileshare\n", "(11, 2)\n", "(25979, 115)\n", "(11, 3)\n", "(299, 5)\n", "(183978, 42)\n", - "CPU times: user 3.91 s, sys: 412 ms, total: 4.32 s\n", - "Wall time: 4.34 s\n" + "CPU times: user 4.73 s, sys: 728 ms, total: 5.46 s\n", + "Wall time: 19.2 s\n" ] } ], @@ -421,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 5, "metadata": { "collapsed": false, "deletable": true, @@ -610,7 +605,7 @@ "[5 rows x 115 columns]" ] }, - "execution_count": 68, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -619,17 +614,6 @@ "matches.tail()" ] }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": 6, @@ -656,13 +640,12 @@ " \"away_player_2\", \"away_player_3\", \"away_player_4\", \"away_player_5\", \"away_player_6\",\n", " \"away_player_7\", \"away_player_8\", \"away_player_9\", \"away_player_10\", \"away_player_11\"]\n", "match_data = matches.dropna(subset = rows)\n", - "print(match_data.shape)\n", - "#match_data = match_data.tail(1500)" + "print(match_data.shape)\n" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "collapsed": false, "deletable": true, @@ -674,21 +657,25 @@ "output_type": "stream", "text": [ "(21374, 23)\n", - "CPU times: user 33min 8s, sys: 4.06 s, total: 33min 12s\n", - "Wall time: 32min 42s\n" + "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n", + "Wall time: 2.94 ms\n" ] } ], "source": [ "%%time\n", - "fifa_data = get_fifa_data(match_data, players)\n", - "print(fifa_data.shape)\n", - "fifa_data.head()" + "fifa_data_filename = 'fifa_data.pk'\n", + "if os.path.isfile(fifa_data_filename):\n", + " fifa_data = pd.read_pickle(fifa_data_filename)\n", + "else:\n", + " fifa_data = get_fifa_data(match_data, players)\n", + " fifa_data.to_pickle(fifa_data_filename)\n", + "print(fifa_data.shape)\n" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "collapsed": false, "deletable": true, @@ -699,39 +686,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Generating match features...\n", - "Match features generated in 13.0 minutes\n", - "Generating match labels...\n", - "Match labels generated in 1.2 minutes\n", - "Generating bookkeeper data...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/hoaphumanoid/anaconda3/envs/strata/lib/python3.6/site-packages/pandas/core/indexing.py:297: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - " self.obj[key] = _infer_fill_value(value)\n", - "/home/hoaphumanoid/anaconda3/envs/strata/lib/python3.6/site-packages/pandas/core/indexing.py:477: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - " self.obj[item] = s\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bookkeeper data generated in 0.0 minutes\n", - "(19673, 47)\n", - "CPU times: user 14min 27s, sys: 5.35 s, total: 14min 32s\n", - "Wall time: 14min 14s\n" + "Generating match features...\n" ] } ], @@ -745,243 +700,26 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
match_api_idhome_team_goals_differenceaway_team_goals_differencegames_won_home_teamgames_won_away_teamgames_against_wongames_against_lostLeague_1.0League_1729.0League_4769.0...away_player_9_overall_ratingaway_player_10_overall_ratingaway_player_11_overall_ratingB365_WinB365_DrawB365_DefeatBW_WinBW_DrawBW_Defeatlabel
0493017.00.00.00.00.00.00.0100...70.068.063.00.3138040.2768860.4093100.3078250.2794100.412765Win
1493025.00.00.00.00.00.00.0100...67.073.068.00.3271790.2862810.3865400.2904930.3001760.409331Defeat
2493027.00.00.00.00.00.00.0100...55.058.064.00.6728970.2093460.1177570.6722690.2268910.100840Win
3493034.01.02.01.01.00.00.0100...74.070.069.00.2074070.2592590.5333330.1927170.2744760.532807Win
4493040.0-2.00.00.00.00.00.0100...60.063.065.00.5352110.2676060.1971830.5657590.2549900.179250Draw
\n", - "

5 rows × 47 columns

\n", - "
" - ], - "text/plain": [ - " match_api_id home_team_goals_difference away_team_goals_difference \\\n", - "0 493017.0 0.0 0.0 \n", - "1 493025.0 0.0 0.0 \n", - "2 493027.0 0.0 0.0 \n", - "3 493034.0 1.0 2.0 \n", - "4 493040.0 -2.0 0.0 \n", - "\n", - " games_won_home_team games_won_away_team games_against_won \\\n", - "0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 \n", - "2 0.0 0.0 0.0 \n", - "3 1.0 1.0 0.0 \n", - "4 0.0 0.0 0.0 \n", - "\n", - " games_against_lost League_1.0 League_1729.0 League_4769.0 ... \\\n", - "0 0.0 1 0 0 ... \n", - "1 0.0 1 0 0 ... \n", - "2 0.0 1 0 0 ... \n", - "3 0.0 1 0 0 ... \n", - "4 0.0 1 0 0 ... \n", - "\n", - " away_player_9_overall_rating away_player_10_overall_rating \\\n", - "0 70.0 68.0 \n", - "1 67.0 73.0 \n", - "2 55.0 58.0 \n", - "3 74.0 70.0 \n", - "4 60.0 63.0 \n", - "\n", - " away_player_11_overall_rating B365_Win B365_Draw B365_Defeat BW_Win \\\n", - "0 63.0 0.313804 0.276886 0.409310 0.307825 \n", - "1 68.0 0.327179 0.286281 0.386540 0.290493 \n", - "2 64.0 0.672897 0.209346 0.117757 0.672269 \n", - "3 69.0 0.207407 0.259259 0.533333 0.192717 \n", - "4 65.0 0.535211 0.267606 0.197183 0.565759 \n", - "\n", - " BW_Draw BW_Defeat label \n", - "0 0.279410 0.412765 Win \n", - "1 0.300176 0.409331 Defeat \n", - "2 0.226891 0.100840 Win \n", - "3 0.274476 0.532807 Win \n", - "4 0.254990 0.179250 Draw \n", - "\n", - "[5 rows x 47 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "feables.head()" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(19673, 45)\n", - "(19673,)\n" - ] - } - ], + "outputs": [], "source": [ "features = feables[feables.columns.difference(['match_api_id', 'label'])]\n", "labs = feables['label']\n", @@ -991,24 +729,13 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(12590, 45)\n", - "(3148, 45)\n", - "(3935, 45)\n", - "19673\n" - ] - } - ], + "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(features, labs, test_size=0.2, random_state=42, stratify=labs)\n", "X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)\n", @@ -1021,7 +748,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": null, "metadata": { "collapsed": false, "deletable": true, @@ -1039,136 +766,13 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0]\tvalidation_0-merror:0.492058\n", - "[1]\tvalidation_0-merror:0.487611\n", - "[2]\tvalidation_0-merror:0.484435\n", - "[3]\tvalidation_0-merror:0.484435\n", - "[4]\tvalidation_0-merror:0.481576\n", - "[5]\tvalidation_0-merror:0.478399\n", - "[6]\tvalidation_0-merror:0.478399\n", - "[7]\tvalidation_0-merror:0.48094\n", - "[8]\tvalidation_0-merror:0.481576\n", - "[9]\tvalidation_0-merror:0.484117\n", - "[10]\tvalidation_0-merror:0.484752\n", - "[11]\tvalidation_0-merror:0.483164\n", - "[12]\tvalidation_0-merror:0.482529\n", - "[13]\tvalidation_0-merror:0.482529\n", - "[14]\tvalidation_0-merror:0.483164\n", - "[15]\tvalidation_0-merror:0.483799\n", - "[16]\tvalidation_0-merror:0.482529\n", - "[17]\tvalidation_0-merror:0.481576\n", - "[18]\tvalidation_0-merror:0.482211\n", - "[19]\tvalidation_0-merror:0.480623\n", - "[20]\tvalidation_0-merror:0.483482\n", - "[21]\tvalidation_0-merror:0.484435\n", - "[22]\tvalidation_0-merror:0.485388\n", - "[23]\tvalidation_0-merror:0.483799\n", - "[24]\tvalidation_0-merror:0.483799\n", - "[25]\tvalidation_0-merror:0.483482\n", - "[26]\tvalidation_0-merror:0.481258\n", - "[27]\tvalidation_0-merror:0.481258\n", - "[28]\tvalidation_0-merror:0.481576\n", - "[29]\tvalidation_0-merror:0.481893\n", - "[30]\tvalidation_0-merror:0.482846\n", - "[31]\tvalidation_0-merror:0.481576\n", - "[32]\tvalidation_0-merror:0.47967\n", - "[33]\tvalidation_0-merror:0.482211\n", - "[34]\tvalidation_0-merror:0.48094\n", - "[35]\tvalidation_0-merror:0.481576\n", - "[36]\tvalidation_0-merror:0.481258\n", - "[37]\tvalidation_0-merror:0.48094\n", - "[38]\tvalidation_0-merror:0.482529\n", - "[39]\tvalidation_0-merror:0.483164\n", - "[40]\tvalidation_0-merror:0.48094\n", - "[41]\tvalidation_0-merror:0.481576\n", - "[42]\tvalidation_0-merror:0.481893\n", - "[43]\tvalidation_0-merror:0.482211\n", - "[44]\tvalidation_0-merror:0.482211\n", - "[45]\tvalidation_0-merror:0.481893\n", - "[46]\tvalidation_0-merror:0.482529\n", - "[47]\tvalidation_0-merror:0.480305\n", - "[48]\tvalidation_0-merror:0.47967\n", - "[49]\tvalidation_0-merror:0.479987\n", - "[50]\tvalidation_0-merror:0.479352\n", - "[51]\tvalidation_0-merror:0.47967\n", - "[52]\tvalidation_0-merror:0.480623\n", - "[53]\tvalidation_0-merror:0.482211\n", - "[54]\tvalidation_0-merror:0.482846\n", - "[55]\tvalidation_0-merror:0.481258\n", - "[56]\tvalidation_0-merror:0.48094\n", - "[57]\tvalidation_0-merror:0.482529\n", - "[58]\tvalidation_0-merror:0.482211\n", - "[59]\tvalidation_0-merror:0.483799\n", - "[60]\tvalidation_0-merror:0.484117\n", - "[61]\tvalidation_0-merror:0.485705\n", - "[62]\tvalidation_0-merror:0.487611\n", - "[63]\tvalidation_0-merror:0.487294\n", - "[64]\tvalidation_0-merror:0.486976\n", - "[65]\tvalidation_0-merror:0.488247\n", - "[66]\tvalidation_0-merror:0.487294\n", - "[67]\tvalidation_0-merror:0.487929\n", - "[68]\tvalidation_0-merror:0.488247\n", - "[69]\tvalidation_0-merror:0.488882\n", - "[70]\tvalidation_0-merror:0.487611\n", - "[71]\tvalidation_0-merror:0.487611\n", - "[72]\tvalidation_0-merror:0.487611\n", - "[73]\tvalidation_0-merror:0.487611\n", - "[74]\tvalidation_0-merror:0.487294\n", - "[75]\tvalidation_0-merror:0.487294\n", - "[76]\tvalidation_0-merror:0.486976\n", - "[77]\tvalidation_0-merror:0.488247\n", - "[78]\tvalidation_0-merror:0.487294\n", - "[79]\tvalidation_0-merror:0.486658\n", - "[80]\tvalidation_0-merror:0.486341\n", - "[81]\tvalidation_0-merror:0.486341\n", - "[82]\tvalidation_0-merror:0.486023\n", - "[83]\tvalidation_0-merror:0.485705\n", - "[84]\tvalidation_0-merror:0.485705\n", - "[85]\tvalidation_0-merror:0.487611\n", - "[86]\tvalidation_0-merror:0.486341\n", - "[87]\tvalidation_0-merror:0.48507\n", - "[88]\tvalidation_0-merror:0.484752\n", - "[89]\tvalidation_0-merror:0.48507\n", - "[90]\tvalidation_0-merror:0.485388\n", - "[91]\tvalidation_0-merror:0.484117\n", - "[92]\tvalidation_0-merror:0.483799\n", - "[93]\tvalidation_0-merror:0.486976\n", - "[94]\tvalidation_0-merror:0.486341\n", - "[95]\tvalidation_0-merror:0.486023\n", - "[96]\tvalidation_0-merror:0.486023\n", - "[97]\tvalidation_0-merror:0.485388\n", - "[98]\tvalidation_0-merror:0.484435\n", - "[99]\tvalidation_0-merror:0.485705\n", - "CPU times: user 19.4 s, sys: 12.7 s, total: 32.2 s\n", - "Wall time: 1.55 s\n" - ] - }, - { - "data": { - "text/plain": [ - "XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,\n", - " gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=8,\n", - " min_child_weight=5, missing=None, n_estimators=100, nthread=-1,\n", - " objective='multi:softprob', reg_alpha=0, reg_lambda=1,\n", - " scale_pos_weight=2, seed=0, silent=True, subsample=1)" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "%%time\n", "clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='merror')" @@ -1176,22 +780,13 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Score of XGBClassifier for training set: 0.7590.\n", - "Score of XGBClassifier for test set: 0.5225.\n" - ] - } - ], + "outputs": [], "source": [ "y_pred = clf.predict(X_test)\n", "print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))\n", @@ -1200,7 +795,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": null, "metadata": { "collapsed": true, "deletable": true, @@ -1221,30 +816,13 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'Accuracy': 0.52249047013977123,\n", - " 'Confusion Matrix': array([[1452, 88, 267],\n", - " [ 650, 65, 278],\n", - " [ 532, 64, 539]]),\n", - " 'F1': 0.46751804585279649,\n", - " 'Precision': 0.472151244629816,\n", - " 'Recall': 0.52249047013977123}" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "labels = [\"Win\", \"Draw\", \"Defeat\"]\n", "report = classification_metrics_multilabel(y_test, y_pred, labels)\n", @@ -1253,7 +831,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": null, "metadata": { "collapsed": false, "deletable": true, @@ -1271,139 +849,13 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1]\tvalid_0's multi_error: 0.520013\n", - "[2]\tvalid_0's multi_error: 0.524778\n", - "[3]\tvalid_0's multi_error: 0.513342\n", - "[4]\tvalid_0's multi_error: 0.511118\n", - "[5]\tvalid_0's multi_error: 0.5054\n", - "[6]\tvalid_0's multi_error: 0.50413\n", - "[7]\tvalid_0's multi_error: 0.499682\n", - "[8]\tvalid_0's multi_error: 0.506353\n", - "[9]\tvalid_0's multi_error: 0.505083\n", - "[10]\tvalid_0's multi_error: 0.506671\n", - "[11]\tvalid_0's multi_error: 0.506036\n", - "[12]\tvalid_0's multi_error: 0.499365\n", - "[13]\tvalid_0's multi_error: 0.499682\n", - "[14]\tvalid_0's multi_error: 0.501271\n", - "[15]\tvalid_0's multi_error: 0.496506\n", - "[16]\tvalid_0's multi_error: 0.499047\n", - "[17]\tvalid_0's multi_error: 0.498729\n", - "[18]\tvalid_0's multi_error: 0.499682\n", - "[19]\tvalid_0's multi_error: 0.499682\n", - "[20]\tvalid_0's multi_error: 0.498094\n", - "[21]\tvalid_0's multi_error: 0.498094\n", - "[22]\tvalid_0's multi_error: 0.499682\n", - "[23]\tvalid_0's multi_error: 0.497141\n", - "[24]\tvalid_0's multi_error: 0.498412\n", - "[25]\tvalid_0's multi_error: 0.499365\n", - "[26]\tvalid_0's multi_error: 0.503494\n", - "[27]\tvalid_0's multi_error: 0.502541\n", - "[28]\tvalid_0's multi_error: 0.501588\n", - "[29]\tvalid_0's multi_error: 0.499047\n", - "[30]\tvalid_0's multi_error: 0.500953\n", - "[31]\tvalid_0's multi_error: 0.501906\n", - "[32]\tvalid_0's multi_error: 0.502224\n", - "[33]\tvalid_0's multi_error: 0.500953\n", - "[34]\tvalid_0's multi_error: 0.500635\n", - "[35]\tvalid_0's multi_error: 0.502224\n", - "[36]\tvalid_0's multi_error: 0.502859\n", - "[37]\tvalid_0's multi_error: 0.500953\n", - "[38]\tvalid_0's multi_error: 0.5054\n", - "[39]\tvalid_0's multi_error: 0.506671\n", - "[40]\tvalid_0's multi_error: 0.507306\n", - "[41]\tvalid_0's multi_error: 0.5054\n", - "[42]\tvalid_0's multi_error: 0.506671\n", - "[43]\tvalid_0's multi_error: 0.50413\n", - "[44]\tvalid_0's multi_error: 0.503494\n", - "[45]\tvalid_0's multi_error: 0.502859\n", - "[46]\tvalid_0's multi_error: 0.5054\n", - "[47]\tvalid_0's multi_error: 0.506353\n", - "[48]\tvalid_0's multi_error: 0.505718\n", - "[49]\tvalid_0's multi_error: 0.506671\n", - "[50]\tvalid_0's multi_error: 0.506671\n", - "[51]\tvalid_0's multi_error: 0.505718\n", - "[52]\tvalid_0's multi_error: 0.50413\n", - "[53]\tvalid_0's multi_error: 0.504447\n", - "[54]\tvalid_0's multi_error: 0.50413\n", - "[55]\tvalid_0's multi_error: 0.502541\n", - "[56]\tvalid_0's multi_error: 0.502224\n", - "[57]\tvalid_0's multi_error: 0.500953\n", - "[58]\tvalid_0's multi_error: 0.501588\n", - "[59]\tvalid_0's multi_error: 0.50413\n", - "[60]\tvalid_0's multi_error: 0.502859\n", - "[61]\tvalid_0's multi_error: 0.504765\n", - "[62]\tvalid_0's multi_error: 0.50413\n", - "[63]\tvalid_0's multi_error: 0.50413\n", - "[64]\tvalid_0's multi_error: 0.502541\n", - "[65]\tvalid_0's multi_error: 0.501271\n", - "[66]\tvalid_0's multi_error: 0.5\n", - "[67]\tvalid_0's multi_error: 0.500635\n", - "[68]\tvalid_0's multi_error: 0.499047\n", - "[69]\tvalid_0's multi_error: 0.499682\n", - "[70]\tvalid_0's multi_error: 0.498729\n", - "[71]\tvalid_0's multi_error: 0.498412\n", - "[72]\tvalid_0's multi_error: 0.497141\n", - "[73]\tvalid_0's multi_error: 0.499682\n", - "[74]\tvalid_0's multi_error: 0.499682\n", - "[75]\tvalid_0's multi_error: 0.498729\n", - "[76]\tvalid_0's multi_error: 0.5\n", - "[77]\tvalid_0's multi_error: 0.500318\n", - "[78]\tvalid_0's multi_error: 0.499047\n", - "[79]\tvalid_0's multi_error: 0.499682\n", - "[80]\tvalid_0's multi_error: 0.499682\n", - "[81]\tvalid_0's multi_error: 0.499682\n", - "[82]\tvalid_0's multi_error: 0.500635\n", - "[83]\tvalid_0's multi_error: 0.497459\n", - "[84]\tvalid_0's multi_error: 0.496823\n", - "[85]\tvalid_0's multi_error: 0.496506\n", - "[86]\tvalid_0's multi_error: 0.5\n", - "[87]\tvalid_0's multi_error: 0.498094\n", - "[88]\tvalid_0's multi_error: 0.498729\n", - "[89]\tvalid_0's multi_error: 0.497776\n", - "[90]\tvalid_0's multi_error: 0.497459\n", - "[91]\tvalid_0's multi_error: 0.495553\n", - "[92]\tvalid_0's multi_error: 0.497459\n", - "[93]\tvalid_0's multi_error: 0.498729\n", - "[94]\tvalid_0's multi_error: 0.498412\n", - "[95]\tvalid_0's multi_error: 0.497776\n", - "[96]\tvalid_0's multi_error: 0.499365\n", - "[97]\tvalid_0's multi_error: 0.497459\n", - "[98]\tvalid_0's multi_error: 0.497776\n", - "[99]\tvalid_0's multi_error: 0.49587\n", - "[100]\tvalid_0's multi_error: 0.497141\n", - "CPU times: user 40.5 s, sys: 54.7 s, total: 1min 35s\n", - "Wall time: 4.7 s\n" - ] - }, - { - "data": { - "text/plain": [ - "LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,\n", - " is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,\n", - " max_drop=50, min_child_samples=10, min_child_weight=5,\n", - " min_split_gain=0, n_estimators=100, nthread=-1, num_leaves=255,\n", - " objective='multiclass', reg_alpha=0, reg_lambda=0,\n", - " scale_pos_weight=2, seed=0, sigmoid=1.0, silent=True,\n", - " skip_drop=0.5, subsample=1, subsample_for_bin=50000,\n", - " subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "%%time\n", "clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='multi_error')" @@ -1411,22 +863,13 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Score of LGBMClassifier for training set: 0.9998.\n", - "Score of LGBMClassifier for test set: 0.5116.\n" - ] - } - ], + "outputs": [], "source": [ "y_pred = clf.predict(X_test)\n", "print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))\n", @@ -1435,30 +878,13 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'Accuracy': 0.51156289707750957,\n", - " 'Confusion Matrix': array([[1387, 147, 273],\n", - " [ 623, 109, 261],\n", - " [ 503, 115, 517]]),\n", - " 'F1': 0.47163912122565715,\n", - " 'Precision': 0.4694795495800122,\n", - " 'Recall': 0.51156289707750957}" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "report = classification_metrics_multilabel(y_test, y_pred, labels)\n", "report" @@ -1479,7 +905,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, + "collapsed": false, "deletable": true, "editable": true }, @@ -1496,7 +922,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -1514,7 +940,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -1537,7 +963,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -1549,14 +975,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ "%%time\n", "bk_cols = ['B365', 'BW', 'IW', 'LB', 'PS', 'WH', 'SJ', 'VC', 'GB', 'BS']\n", "bk_cols_selected = ['B365', 'BW'] \n", - "feables = create_feables(match_data, fifa_data, bk_cols_selected, get_overall = True)\n", + "feables = create_feables(match_data, fifa_data, bk_cols_selected, get_overall = True, all_leagues = False)\n", "print(feables.shape)\n", "feables.head()" ] @@ -1565,15 +991,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "feables_2015_2016 = feables[feables['season'] == '2015/2016']\n", + "feables_2015_2016 = feables[feables['season'] == 2015]\n", "print(feables_2015_2016.shape)\n", - "feables_2014_2015 = feables[feables['season'] == '2014/2015']\n", + "feables_2014_2015 = feables[feables['season'] == 2014]\n", "print(feables_2014_2015.shape)\n", - "feables_rest = feables[(feables['season'] != '2014/2015') & (feables['season'] != '2015/2016')]\n", + "feables_rest = feables[(feables['season'] != 2014) & (feables['season'] != 2015)]\n", "print(feables_rest.shape)" ] }, @@ -1581,7 +1007,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -1601,11 +1027,11 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "feables_up_to_2014_2015 = feables[feables['season'] != '2015/2016']\n", + "feables_up_to_2014_2015 = feables[feables['season'] != 2015]\n", "print(feables_up_to_2014_2015.shape)\n", "X_train2 = feables_up_to_2014_2015[feables_up_to_2014_2015.columns.difference(['match_api_id', 'label', 'season'])]\n", "y_train2 = feables_up_to_2014_2015['label']" @@ -1619,7 +1045,7 @@ }, "outputs": [], "source": [ - "clf = XGBClassifier(max_depth=8, \n", + "clf1 = XGBClassifier(max_depth=8, \n", " learning_rate=0.1, \n", " scale_pos_weight=2,\n", " min_child_weight=5,\n", @@ -1631,19 +1057,19 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ "%%time\n", - "clf.fit(X_train, y_train, verbose=True, eval_metric='merror')" + "clf1.fit(X_train, y_train, verbose=True, eval_metric='multi_error')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -1656,7 +1082,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -1669,7 +1095,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -1681,7 +1107,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -1710,19 +1136,19 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ "%%time\n", - "clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='multi_error')" + "clf.fit(X_train, y_train, verbose=True, eval_metric='multi_error')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -1735,7 +1161,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -1748,7 +1174,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -1760,7 +1186,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ diff --git a/experiments/libs/football.py b/experiments/libs/football.py index 1af0aef..4a13110 100644 --- a/experiments/libs/football.py +++ b/experiments/libs/football.py @@ -202,10 +202,13 @@ def get_match_features(match, matches, x = 10): result.loc[0, 'games_against_won'] = get_wins(last_matches_against, home_team) result.loc[0, 'games_against_lost'] = get_wins(last_matches_against, away_team) + #Add season + result.loc[0, 'season'] = int(match['season'].split('/')[0]) + #Return match features return result.loc[0] -def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal = True, x = 10, verbose = True): +def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal = True, x = 10, all_leagues = True, verbose = True): ''' Create and aggregate features and labels for all matches. ''' #Get fifa stats features @@ -214,40 +217,29 @@ def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal = if verbose == True: print("Generating match features...") - start = time() #Get match features for all matches match_stats = matches.apply(lambda x: get_match_features(x, matches, x = 10), axis = 1) #Create dummies for league ID feature - dummies = pd.get_dummies(match_stats['league_id']).rename(columns = lambda x: 'League_' + str(x)) - match_stats = pd.concat([match_stats, dummies], axis = 1) - match_stats.drop(['league_id'], inplace = True, axis = 1) - - end = time() - if verbose == True: - print("Match features generated in {:.1f} minutes".format((end - start)/60)) + if all_leagues: + dummies = pd.get_dummies(match_stats['league_id']).rename(columns = lambda x: 'League_' + str(x)) + match_stats = pd.concat([match_stats, dummies], axis = 1) + match_stats.drop(['league_id'], inplace = True, axis = 1) + if verbose == True: print("Generating match labels...") - start = time() #Create match labels labels = matches.apply(get_match_label, axis = 1) - end = time() - if verbose == True: - print("Match labels generated in {:.1f} minutes".format((end - start)/60)) if verbose == True: print("Generating bookkeeper data...") - start = time() #Get bookkeeper quotas for all matches bk_data = get_bookkeeper_data(matches, bookkeepers, horizontal = True) bk_data.loc[:,'match_api_id'] = matches.loc[:,'match_api_id'] - end = time() - if verbose == True: - print("Bookkeeper data generated in {:.1f} minutes".format((end - start)/60)) #Merges features and labels into one frame features = pd.merge(match_stats, fifa_stats, on = 'match_api_id', how = 'left')