This commit is contained in:
miguelgfierro 2017-05-21 07:02:04 +00:00
Родитель 2eb40b4bea
Коммит 616dae3bdf
1 изменённых файлов: 293 добавлений и 52 удалений

Просмотреть файл

@ -15,7 +15,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 36,
"metadata": {
"collapsed": false,
"deletable": true,
@ -37,7 +37,7 @@
"import os,sys\n",
"import numpy as np\n",
"import pandas as pd\n",
"from lightgbm.sklearn import LGBMRegressor\n",
"from lightgbm.sklearn import LGBMRegressor, LGBMClassifier\n",
"from xgboost import XGBRegressor\n",
"from sklearn.metrics import (confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, precision_score,\n",
" recall_score)\n",
@ -45,7 +45,6 @@
"from libs.conversion import convert_cols_categorical_to_numeric, convert_related_cols_categorical_to_numeric\n",
"import pkg_resources\n",
"\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
"print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n",
"print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))\n"
@ -1085,7 +1084,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1102,7 +1101,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"metadata": {
"collapsed": false,
"deletable": true,
@ -1179,8 +1178,60 @@
"[64]\tvalid_0's log_loss: 0.527862\n",
"[65]\tvalid_0's log_loss: 0.527307\n",
"[66]\tvalid_0's log_loss: 0.526442\n",
"[67]\tvalid_0's log_loss: 0.526158\n"
"[67]\tvalid_0's log_loss: 0.526158\n",
"[68]\tvalid_0's log_loss: 0.52545\n",
"[69]\tvalid_0's log_loss: 0.525063\n",
"[70]\tvalid_0's log_loss: 0.524413\n",
"[71]\tvalid_0's log_loss: 0.524119\n",
"[72]\tvalid_0's log_loss: 0.523447\n",
"[73]\tvalid_0's log_loss: 0.52282\n",
"[74]\tvalid_0's log_loss: 0.522554\n",
"[75]\tvalid_0's log_loss: 0.522167\n",
"[76]\tvalid_0's log_loss: 0.521837\n",
"[77]\tvalid_0's log_loss: 0.521335\n",
"[78]\tvalid_0's log_loss: 0.521158\n",
"[79]\tvalid_0's log_loss: 0.520606\n",
"[80]\tvalid_0's log_loss: 0.520276\n",
"[81]\tvalid_0's log_loss: 0.51986\n",
"[82]\tvalid_0's log_loss: 0.519674\n",
"[83]\tvalid_0's log_loss: 0.519353\n",
"[84]\tvalid_0's log_loss: 0.519099\n",
"[85]\tvalid_0's log_loss: 0.518659\n",
"[86]\tvalid_0's log_loss: 0.518253\n",
"[87]\tvalid_0's log_loss: 0.518062\n",
"[88]\tvalid_0's log_loss: 0.517721\n",
"[89]\tvalid_0's log_loss: 0.517529\n",
"[90]\tvalid_0's log_loss: 0.517188\n",
"[91]\tvalid_0's log_loss: 0.516802\n",
"[92]\tvalid_0's log_loss: 0.516655\n",
"[93]\tvalid_0's log_loss: 0.516295\n",
"[94]\tvalid_0's log_loss: 0.515957\n",
"[95]\tvalid_0's log_loss: 0.515669\n",
"[96]\tvalid_0's log_loss: 0.515474\n",
"[97]\tvalid_0's log_loss: 0.515313\n",
"[98]\tvalid_0's log_loss: 0.51517\n",
"[99]\tvalid_0's log_loss: 0.514818\n",
"[100]\tvalid_0's log_loss: 0.514533\n",
"CPU times: user 2h 43min 9s, sys: 8min 49s, total: 2h 51min 58s\n",
"Wall time: 18min 45s\n"
]
},
{
"data": {
"text/plain": [
"LGBMRegressor(boosting_type='gbdt', colsample_bytree=0.8, drop_rate=0.1,\n",
" fair_c=1.0, gaussian_eta=1.0, huber_delta=1.0, learning_rate=0.1,\n",
" max_bin=255, max_depth=-1, max_drop=50, min_child_samples=10,\n",
" min_child_weight=30, min_split_gain=0, n_estimators=100, nthread=-1,\n",
" num_leaves=255, objective='regression', poisson_max_delta_step=0.7,\n",
" reg_alpha=0, reg_lambda=0, seed=77, silent=False, skip_drop=0.5,\n",
" subsample=0.8, subsample_for_bin=50000, subsample_freq=1,\n",
" uniform_drop=False, xgboost_dart_mode=False)"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
@ -1213,13 +1264,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3min 33s, sys: 6.73 s, total: 3min 40s\n",
"Wall time: 15 s\n"
]
}
],
"source": [
"%%time\n",
"y_prob_xgb = np.clip(xgb_clf_pipeline.predict(X_test), 0.0001, 0.9999)"
@ -1227,16 +1287,25 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 22,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1h 55min, sys: 14min 30s, total: 2h 9min 31s\n",
"Wall time: 5min 44s\n"
]
}
],
"source": [
"%%time\n",
"y_prob_lgbm = np.clip(lgbm_clf_pipeline.predict_proba(X_test), 0.0001, 0.9999)"
"y_prob_lgbm = np.clip(lgbm_clf_pipeline.predict(X_test), 0.0001, 0.9999)"
]
},
{
@ -1252,7 +1321,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 23,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1273,7 +1342,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 24,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1291,7 +1360,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 25,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1306,7 +1375,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 26,
"metadata": {
"collapsed": false,
"deletable": true,
@ -1320,13 +1389,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 27,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'Recall': 0.63961308830403441, 'F1': 0.69233995199951537, 'Precision': 0.75454095919535258, 'Confusion Matrix': array([[9822644, 2271976],\n",
" [3935131, 6984053]]), 'Accuracy': 0.73028765692103748}\n",
"{'AUC': 0.80363587688230786, 'Log loss': 0.53966634712421813}\n"
]
}
],
"source": [
"report1_xgb = classification_metrics_binary(y_test, y_pred_xgb)\n",
"print(report1_xgb)\n",
@ -1336,13 +1415,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 28,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'Recall': 0.65893742609337835, 'F1': 0.71517074946795001, 'Precision': 0.78189740329600466, 'Confusion Matrix': array([[10087629, 2006991],\n",
" [ 3724125, 7195059]]), 'Accuracy': 0.75097050448504732}\n",
"{'AUC': 0.82629988528878362, 'Log loss': 0.51120057686403264}\n"
]
}
],
"source": [
"report1_lgbm = classification_metrics_binary(y_test, y_pred_lgbm)\n",
"print(report1_lgbm)\n",
@ -1377,7 +1466,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 29,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1392,13 +1481,43 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 30,
"metadata": {
"collapsed": true,
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1287333, 15)\n",
"(5126498, 15)\n",
"(290827, 15)\n",
"(5110527, 15)\n",
"(4995005, 15)\n",
"(5020651, 15)\n",
"(4993587, 15)\n",
"(5078411, 15)\n",
"(5219140, 15)\n",
"(5209326, 15)\n",
"(5301999, 15)\n",
"(5227051, 15)\n",
"(5360018, 15)\n",
"(5481303, 15)\n",
"(5723673, 15)\n",
"(5197860, 15)\n",
"(6375689, 15)\n",
"(6987729, 15)\n",
"(6992838, 15)\n",
"(7003802, 15)\n",
"(7275288, 15)\n",
"CPU times: user 7.08 s, sys: 2.18 s, total: 9.27 s\n",
"Wall time: 9.44 s\n"
]
}
],
"source": [
"%%time\n",
"data_yearly_list = get_data_list_yearly(df_plane_numeric)\n",
@ -1408,13 +1527,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 31,
"metadata": {
"collapsed": true,
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of years: 21\n"
]
}
],
"source": [
"total_subsets = len(data_yearly_list)\n",
"print(\"Number of years: {}\".format(total_subsets))\n",
@ -1423,7 +1550,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 32,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1451,13 +1578,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 33,
"metadata": {
"collapsed": true,
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(16810190, 15)\n",
"CPU times: user 620 ms, sys: 1.13 s, total: 1.75 s\n",
"Wall time: 1.75 s\n"
]
}
],
"source": [
"%%time\n",
"subset_base = generate_subset(data_yearly_list, num_ini)\n",
@ -1466,7 +1603,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 34,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1479,9 +1616,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 38,
"metadata": {
"collapsed": true,
"collapsed": false,
"deletable": true,
"editable": true
},
@ -1498,13 +1635,39 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 39,
"metadata": {
"collapsed": true,
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 25min 38s, sys: 1min 34s, total: 27min 12s\n",
"Wall time: 1min 15s\n"
]
},
{
"data": {
"text/plain": [
"LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.8, drop_rate=0.1,\n",
" is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,\n",
" max_drop=50, min_child_samples=10, min_child_weight=30,\n",
" min_split_gain=0, n_estimators=100, nthread=-1, num_leaves=255,\n",
" objective='binary', reg_alpha=0, reg_lambda=0, scale_pos_weight=1,\n",
" seed=42, sigmoid=1.0, silent=True, skip_drop=0.5, subsample=0.8,\n",
" subsample_for_bin=50000, subsample_freq=1, uniform_drop=False,\n",
" xgboost_dart_mode=False)"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"clf.fit(X_train, y_train)"
@ -1512,7 +1675,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 40,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1535,13 +1698,39 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 41,
"metadata": {
"collapsed": true,
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Predicting year 1992...\n",
"Predicting year 1993...\n",
"Predicting year 1994...\n",
"Predicting year 1995...\n",
"Predicting year 1996...\n",
"Predicting year 1997...\n",
"Predicting year 1998...\n",
"Predicting year 1999...\n",
"Predicting year 2000...\n",
"Predicting year 2001...\n",
"Predicting year 2002...\n",
"Predicting year 2003...\n",
"Predicting year 2004...\n",
"Predicting year 2005...\n",
"Predicting year 2006...\n",
"Predicting year 2007...\n",
"{1992: 0.75635888652686678, 1993: 0.75543231749041317, 1994: 0.74359046560036202, 1995: 0.73113386496625876, 1996: 0.72234411898967354, 1997: 0.71978342508174742, 1998: 0.70525617599675228, 1999: 0.69996761204906399, 2000: 0.68899913031627702, 2001: 0.67335170964518765, 2002: 0.67853347339097247, 2003: 0.68455330866985509, 2004: 0.68012640444413341, 2005: 0.67296453886104612, 2006: 0.66332086486739628, 2007: 0.65060764604782662}\n",
"CPU times: user 7h 17min 43s, sys: 46min 34s, total: 8h 4min 18s\n",
"Wall time: 21min 46s\n"
]
}
],
"source": [
"%%time\n",
"accuracy_dict = predict_accuracy_future_years(clf, data_yearly_list, num_ini)\n",
@ -1571,7 +1760,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 42,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1584,13 +1773,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 43,
"metadata": {
"collapsed": true,
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(69425349, 15)\n",
"CPU times: user 8.86 s, sys: 43.2 s, total: 52.1 s\n",
"Wall time: 53.1 s\n"
]
}
],
"source": [
"%%time\n",
"subset_retrain = generate_subset(data_yearly_list, new_init)\n",
@ -1599,7 +1798,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 44,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1612,7 +1811,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 45,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1631,13 +1830,39 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 46,
"metadata": {
"collapsed": true,
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1h 53min 58s, sys: 12min 27s, total: 2h 6min 25s\n",
"Wall time: 7min 7s\n"
]
},
{
"data": {
"text/plain": [
"LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.8, drop_rate=0.1,\n",
" is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,\n",
" max_drop=50, min_child_samples=10, min_child_weight=30,\n",
" min_split_gain=0, n_estimators=100, nthread=-1, num_leaves=255,\n",
" objective='binary', reg_alpha=0, reg_lambda=0, scale_pos_weight=1,\n",
" seed=42, sigmoid=1.0, silent=True, skip_drop=0.5, subsample=0.8,\n",
" subsample_for_bin=50000, subsample_freq=1, uniform_drop=False,\n",
" xgboost_dart_mode=False)"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"clf_retrain.fit(X_train, y_train)"
@ -1645,13 +1870,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 47,
"metadata": {
"collapsed": true,
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Predicting year 2002...\n",
"Predicting year 2003...\n",
"Predicting year 2004...\n",
"Predicting year 2005...\n",
"Predicting year 2006...\n",
"Predicting year 2007...\n",
"{2002: 0.75089709996036835, 2003: 0.74953483458807357, 2004: 0.72610099790647287, 2005: 0.72036803941404048, 2006: 0.70790022333583957, 2007: 0.6962276957283341}\n",
"CPU times: user 3h 8min 36s, sys: 18min 9s, total: 3h 26min 45s\n",
"Wall time: 9min 47s\n"
]
}
],
"source": [
"%%time\n",
"accuracy_retrain = predict_accuracy_future_years(clf_retrain, data_yearly_list, new_init)\n",