This commit is contained in:
miguelgfierro 2017-05-20 15:25:07 +00:00
Родитель 16683837ea
Коммит d442baf222
1 изменённый файл: 110 добавлений и 49 удалений

Просмотреть файл

@ -15,7 +15,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 66,
"metadata": {
"collapsed": false,
"deletable": true,
@ -27,7 +27,9 @@
"output_type": "stream",
"text": [
"System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul 2 2016, 17:53:06) \n",
"[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n"
"[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n",
"XGBoost version: 0.6\n",
"LightGBM version: 0.1\n"
]
}
],
@ -35,8 +37,8 @@
"import os,sys\n",
"import numpy as np\n",
"import pandas as pd\n",
"from lightgbm.sklearn import LGBMClassifier\n",
"from xgboost import XGBClassifier\n",
"from lightgbm.sklearn import LGBMRegressor\n",
"from xgboost import XGBRegressor\n",
"from sklearn.metrics import (confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, precision_score,\n",
" recall_score, mean_squared_error, mean_absolute_error, r2_score)\n",
"from sklearn.model_selection import train_test_split\n",
@ -44,11 +46,12 @@
"from sklearn.pipeline import Pipeline, FeatureUnion\n",
"from libs.loaders import load_airline\n",
"from libs.conversion import convert_cols_categorical_to_numeric, convert_related_cols_categorical_to_numeric\n",
"import pkg_resources\n",
"\n",
"\n",
"os.environ['MOUNT_POINT'] = '/strata'\n",
"print(\"System version: {}\".format(sys.version))\n",
"print(\"XGBoost version: {}\".format(xgboost.__version__))\n",
"print(\"LightGBM version: {}\".format(lightgbm.__version__))\n"
"print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n",
"print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))\n"
]
},
{
@ -885,21 +888,6 @@
"Let's start with the XGBoost classifier."
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"pipeline_steps = [('scale', StandardScaler())]\n",
"continuous_pipeline = Pipeline(steps=pipeline_steps)\n",
"featurisers = [('continuous', continuous_pipeline)]"
]
},
{
"cell_type": "code",
"execution_count": 33,
@ -910,15 +898,7 @@
},
"outputs": [],
"source": [
"xgb_clf_pipeline = Pipeline(steps=[('features', FeatureUnion(featurisers)),\n",
" ('clf', XGBClassifier(max_depth=8,\n",
" n_estimators=100,\n",
" min_child_weight=30,\n",
" learning_rate=0.1,\n",
" subsample=0.80,\n",
" colsample_bytree=0.80,\n",
" seed=77))])\n",
" \n",
" \n",
"xgb_clf_pipeline = XGBClassifier(max_depth=8,\n",
" n_estimators=100,\n",
" min_child_weight=30,\n",
@ -1078,7 +1058,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 57,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1090,24 +1070,65 @@
" n_estimators=100,\n",
" min_child_weight=30,\n",
" learning_rate=0.1,\n",
" nthread=20,\n",
" subsample=0.80,\n",
" colsample_bytree=0.80,\n",
" seed=77)"
" seed=77,\n",
" silent=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": [
"def loglikelood(y_true, y_pred):\n",
"    \"\"\"Custom evaluation metric for LightGBM that reports the log loss.\n",
"\n",
"    Returns the triple (metric name, metric value, is_bigger_better).\n",
"    The last element is False because a lower log loss is better.\n",
"    \"\"\"\n",
"    return 'log_loss', log_loss(y_true, y_pred), False"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1h 44min 37s, sys: 57min 8s, total: 2h 41min 45s\n",
"Wall time: 9min 1s\n"
]
},
{
"data": {
"text/plain": [
"LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.8, drop_rate=0.1,\n",
" is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,\n",
" max_drop=50, min_child_samples=10, min_child_weight=30,\n",
" min_split_gain=0, n_estimators=100, nthread=-1, num_leaves=255,\n",
" objective='binary', reg_alpha=0, reg_lambda=0, scale_pos_weight=1,\n",
" seed=77, sigmoid=1.0, silent=False, skip_drop=0.5, subsample=0.8,\n",
" subsample_for_bin=50000, subsample_freq=1, uniform_drop=False,\n",
" xgboost_dart_mode=False)"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"lgbm_clf_pipeline.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='rmse')"
"lgbm_clf_pipeline.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric=loglikelood)"
]
},
{
@ -1132,13 +1153,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 38,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3min 14s, sys: 1min 22s, total: 4min 37s\n",
"Wall time: 19.3 s\n"
]
}
],
"source": [
"%%time\n",
"y_prob_xgb = xgb_clf_pipeline.predict_proba(X_test)"
@ -1146,13 +1176,45 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 64,
"metadata": {
"collapsed": true,
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1])"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"yyy = xgb_clf_pipeline.predict(X_test)\n",
"yyy[:10]"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 40min 30s, sys: 28min 28s, total: 1h 8min 58s\n",
"Wall time: 3min 35s\n"
]
}
],
"source": [
"%%time\n",
"y_prob_lgbm = lgbm_clf_pipeline.predict_proba(X_test)"
@ -1171,7 +1233,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 40,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1192,7 +1254,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 41,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1210,7 +1272,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 59,
"metadata": {
"collapsed": true,
"deletable": true,
@ -1219,14 +1281,13 @@
"outputs": [],
"source": [
"def binarize_prediction(y, threshold=0.5):\n",
" threshold_func = lambda x: 0 if x<=threshold else 1\n",
" y_pred = y.map(threshold_func)\n",
" y_pred = np.where(y > threshold, 1, 0)\n",
" return y_pred\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 60,
"metadata": {
"collapsed": false,
"deletable": true,