new pass airline
This commit is contained in:
Родитель
16683837ea
Коммит
d442baf222
|
@ -15,7 +15,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 66,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
|
@ -27,7 +27,9 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul 2 2016, 17:53:06) \n",
|
||||
"[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n"
|
||||
"[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n",
|
||||
"XGBoost version: 0.6\n",
|
||||
"LightGBM version: 0.1\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -35,8 +37,8 @@
|
|||
"import os,sys\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from lightgbm.sklearn import LGBMClassifier\n",
|
||||
"from xgboost import XGBClassifier\n",
|
||||
"from lightgbm.sklearn import LGBMRegressor\n",
|
||||
"from xgboost import XGBRegressor\n",
|
||||
"from sklearn.metrics import (confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, precision_score,\n",
|
||||
" recall_score, mean_squared_error, mean_absolute_error, r2_score)\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
|
@ -44,11 +46,12 @@
|
|||
"from sklearn.pipeline import Pipeline, FeatureUnion\n",
|
||||
"from libs.loaders import load_airline\n",
|
||||
"from libs.conversion import convert_cols_categorical_to_numeric, convert_related_cols_categorical_to_numeric\n",
|
||||
"import pkg_resources\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"os.environ['MOUNT_POINT'] = '/strata'\n",
|
||||
"print(\"System version: {}\".format(sys.version))\n",
|
||||
"print(\"XGBoost version: {}\".format(xgboost.__version__))\n",
|
||||
"print(\"LightGBM version: {}\".format(lightgbm.__version__))\n"
|
||||
"print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n",
|
||||
"print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -885,21 +888,6 @@
|
|||
"Let's start with the XGBoost classifier."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_steps = [('scale', StandardScaler())]\n",
|
||||
"continuous_pipeline = Pipeline(steps=pipeline_steps)\n",
|
||||
"featurisers = [('continuous', continuous_pipeline)]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
|
@ -910,15 +898,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"xgb_clf_pipeline = Pipeline(steps=[('features', FeatureUnion(featurisers)),\n",
|
||||
" ('clf', XGBClassifier(max_depth=8,\n",
|
||||
" n_estimators=100,\n",
|
||||
" min_child_weight=30,\n",
|
||||
" learning_rate=0.1,\n",
|
||||
" subsample=0.80,\n",
|
||||
" colsample_bytree=0.80,\n",
|
||||
" seed=77))])\n",
|
||||
" \n",
|
||||
" \n",
|
||||
"xgb_clf_pipeline = XGBClassifier(max_depth=8,\n",
|
||||
" n_estimators=100,\n",
|
||||
" min_child_weight=30,\n",
|
||||
|
@ -1078,7 +1058,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 57,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"deletable": true,
|
||||
|
@ -1090,24 +1070,65 @@
|
|||
" n_estimators=100,\n",
|
||||
" min_child_weight=30,\n",
|
||||
" learning_rate=0.1,\n",
|
||||
" nthread=20,\n",
|
||||
" subsample=0.80,\n",
|
||||
" colsample_bytree=0.80,\n",
|
||||
" seed=77)"
|
||||
" seed=77,\n",
|
||||
" silent=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def loglikelood(y_true, y_pred):\n",
|
||||
" eval_result = log_loss(y_true, y_pred)\n",
|
||||
" eval_name = 'log_loss'\n",
|
||||
" is_bigger_better = False\n",
|
||||
" return eval_name, eval_result, is_bigger_better"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 58,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 1h 44min 37s, sys: 57min 8s, total: 2h 41min 45s\n",
|
||||
"Wall time: 9min 1s\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.8, drop_rate=0.1,\n",
|
||||
" is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,\n",
|
||||
" max_drop=50, min_child_samples=10, min_child_weight=30,\n",
|
||||
" min_split_gain=0, n_estimators=100, nthread=-1, num_leaves=255,\n",
|
||||
" objective='binary', reg_alpha=0, reg_lambda=0, scale_pos_weight=1,\n",
|
||||
" seed=77, sigmoid=1.0, silent=False, skip_drop=0.5, subsample=0.8,\n",
|
||||
" subsample_for_bin=50000, subsample_freq=1, uniform_drop=False,\n",
|
||||
" xgboost_dart_mode=False)"
|
||||
]
|
||||
},
|
||||
"execution_count": 58,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"lgbm_clf_pipeline.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='rmse')"
|
||||
"lgbm_clf_pipeline.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric=loglikelood)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -1132,13 +1153,22 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 38,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 3min 14s, sys: 1min 22s, total: 4min 37s\n",
|
||||
"Wall time: 19.3 s\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"y_prob_xgb = xgb_clf_pipeline.predict_proba(X_test)"
|
||||
|
@ -1146,13 +1176,45 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 64,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1])"
|
||||
]
|
||||
},
|
||||
"execution_count": 64,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"yyy = xgb_clf_pipeline.predict(X_test)\n",
|
||||
"yyy[:10]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 40min 30s, sys: 28min 28s, total: 1h 8min 58s\n",
|
||||
"Wall time: 3min 35s\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"y_prob_lgbm = lgbm_clf_pipeline.predict_proba(X_test)"
|
||||
|
@ -1171,7 +1233,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 40,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"deletable": true,
|
||||
|
@ -1192,7 +1254,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 41,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"deletable": true,
|
||||
|
@ -1210,7 +1272,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 59,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"deletable": true,
|
||||
|
@ -1219,14 +1281,13 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"def binarize_prediction(y, threshold=0.5):\n",
|
||||
" threshold_func = lambda x: 0 if x<=threshold else 1\n",
|
||||
" y_pred = y.map(threshold_func)\n",
|
||||
" y_pred = np.where(y > threshold, 1, 0)\n",
|
||||
" return y_pred\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 60,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
|
|
Загрузка…
Ссылка в новой задаче