new pass airline

2017-05-20 15:25:07 +00:00 · 2017-05-20 15:25:07 +00:00 · d442baf222
--- a/experiments/exp01_airline.ipynb
+++ b/experiments/exp01_airline.ipynb
@ -15,7 +15,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 66,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -27,7 +27,9 @@
     "output_type": "stream",
     "text": [
      "System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) \n",
-      "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n"
+      "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n",
+      "XGBoost version: 0.6\n",
+      "LightGBM version: 0.1\n"
     ]
    }
   ],
@ -35,8 +37,8 @@
    "import os,sys\n",
    "import numpy as np\n",
    "import pandas as pd\n",
-    "from lightgbm.sklearn import LGBMClassifier\n",
-    "from xgboost import XGBClassifier\n",
+    "from lightgbm.sklearn import LGBMClassifier, LGBMRegressor\n",
+    "from xgboost import XGBClassifier, XGBRegressor\n",
    "from sklearn.metrics import (confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, precision_score,\n",
    "                             recall_score, mean_squared_error, mean_absolute_error, r2_score)\n",
    "from sklearn.model_selection import train_test_split\n",
@ -44,11 +46,12 @@
    "from sklearn.pipeline import Pipeline, FeatureUnion\n",
    "from libs.loaders import load_airline\n",
    "from libs.conversion import convert_cols_categorical_to_numeric, convert_related_cols_categorical_to_numeric\n",
+    "import pkg_resources\n",
+    "\n",
    "\n",
-    "os.environ['MOUNT_POINT'] = '/strata'\n",
    "print(\"System version: {}\".format(sys.version))\n",
-    "print(\"XGBoost version: {}\".format(xgboost.__version__))\n",
-    "print(\"LightGBM version: {}\".format(lightgbm.__version__))\n"
+    "print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n",
+    "print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))\n"
   ]
  },
  {
@ -885,21 +888,6 @@
    "Let's start with the XGBoost classifier."
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "metadata": {
-    "collapsed": true,
-    "deletable": true,
-    "editable": true
-   },
-   "outputs": [],
-   "source": [
-    "pipeline_steps = [('scale', StandardScaler())]\n",
-    "continuous_pipeline = Pipeline(steps=pipeline_steps)\n",
-    "featurisers = [('continuous', continuous_pipeline)]"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": 33,
@ -910,15 +898,7 @@
   },
   "outputs": [],
   "source": [
-    "xgb_clf_pipeline = Pipeline(steps=[('features', FeatureUnion(featurisers)),\n",
-    "                                 ('clf', XGBClassifier(max_depth=8,\n",
-    "                                                      n_estimators=100,\n",
-    "                                                      min_child_weight=30,\n",
-    "                                                      learning_rate=0.1,\n",
-    "                                                      subsample=0.80,\n",
-    "                                                      colsample_bytree=0.80,\n",
-    "                                                      seed=77))])\n",
-    "    \n",
+    "   \n",
    "xgb_clf_pipeline = XGBClassifier(max_depth=8,\n",
    "                                n_estimators=100,\n",
    "                                min_child_weight=30,\n",
@ -1078,7 +1058,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 57,
   "metadata": {
    "collapsed": true,
    "deletable": true,
@ -1090,24 +1070,65 @@
    "                                    n_estimators=100,\n",
    "                                    min_child_weight=30,\n",
    "                                    learning_rate=0.1,\n",
-    "                                    nthread=20,\n",
    "                                    subsample=0.80,\n",
    "                                    colsample_bytree=0.80,\n",
-    "                                    seed=77)"
+    "                                    seed=77,\n",
+    "                                    silent=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
-    "collapsed": true,
-    "deletable": true,
-    "editable": true
+    "collapsed": true
   },
   "outputs": [],
+   "source": [
+    "def loglikelood(y_true, y_pred):\n",
+    "    # Custom eval metric: returns (name, value, is_bigger_better); log loss, so lower is better.\n",
+    "    metric_name = 'log_loss'\n",
+    "    metric_value = log_loss(y_true, y_pred)\n",
+    "    return metric_name, metric_value, False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 1h 44min 37s, sys: 57min 8s, total: 2h 41min 45s\n",
+      "Wall time: 9min 1s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.8, drop_rate=0.1,\n",
+       "        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,\n",
+       "        max_drop=50, min_child_samples=10, min_child_weight=30,\n",
+       "        min_split_gain=0, n_estimators=100, nthread=-1, num_leaves=255,\n",
+       "        objective='binary', reg_alpha=0, reg_lambda=0, scale_pos_weight=1,\n",
+       "        seed=77, sigmoid=1.0, silent=False, skip_drop=0.5, subsample=0.8,\n",
+       "        subsample_for_bin=50000, subsample_freq=1, uniform_drop=False,\n",
+       "        xgboost_dart_mode=False)"
+      ]
+     },
+     "execution_count": 58,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "%%time\n",
-    "lgbm_clf_pipeline.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='rmse')"
+    "lgbm_clf_pipeline.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric=loglikelood)"
   ]
  },
  {
@ -1132,13 +1153,22 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 38,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 3min 14s, sys: 1min 22s, total: 4min 37s\n",
+      "Wall time: 19.3 s\n"
+     ]
+    }
+   ],
   "source": [
    "%%time\n",
    "y_prob_xgb = xgb_clf_pipeline.predict_proba(X_test)"
@ -1146,13 +1176,45 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 64,
   "metadata": {
-    "collapsed": true,
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1])"
+      ]
+     },
+     "execution_count": 64,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "yyy = xgb_clf_pipeline.predict(X_test)\n",
+    "yyy[:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {
+    "collapsed": false,
    "deletable": true,
    "editable": true
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 40min 30s, sys: 28min 28s, total: 1h 8min 58s\n",
+      "Wall time: 3min 35s\n"
+     ]
+    }
+   ],
   "source": [
    "%%time\n",
    "y_prob_lgbm = lgbm_clf_pipeline.predict_proba(X_test)"
@ -1171,7 +1233,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 40,
   "metadata": {
    "collapsed": true,
    "deletable": true,
@ -1192,7 +1254,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 41,
   "metadata": {
    "collapsed": true,
    "deletable": true,
@ -1210,7 +1272,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 59,
   "metadata": {
    "collapsed": true,
    "deletable": true,
@ -1219,14 +1281,13 @@
   "outputs": [],
   "source": [
    "def binarize_prediction(y, threshold=0.5):\n",
-    "    threshold_func = lambda x: 0 if x<=threshold else 1\n",
-    "    y_pred = y.map(threshold_func)\n",
+    "    y_pred = np.where(y > threshold, 1, 0)\n",
    "    return y_pred\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 60,
   "metadata": {
    "collapsed": false,
    "deletable": true,