starting refactoring
This commit is contained in:
Parent
b838012dcf
Commit
2576101088
@@ -15,7 +15,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 36,
+"execution_count": 1,
 "metadata": {
 "collapsed": false,
 "deletable": true,
@@ -43,6 +43,7 @@
 " recall_score)\n",
 "from libs.loaders import load_airline\n",
 "from libs.conversion import convert_cols_categorical_to_numeric, convert_related_cols_categorical_to_numeric\n",
+"from libs.timer import Timer\n",
 "import pkg_resources\n",
 "\n",
 "print(\"System version: {}\".format(sys.version))\n",
@@ -882,6 +883,25 @@
 "X_test, y_test = generate_feables(test)\n"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"### Training \n",
+"Now we are going to create two pipelines, one for XGBoost and one for LightGBM. The two libraries work differently under the hood, so it is difficult to compare them in exactly the same model setting: XGBoost grows its trees depth-wise and controls model complexity with `max_depth`, while LightGBM grows them leaf-wise and controls complexity with `num_leaves`. As a trade-off, we use XGBoost with `max_depth=8`, which can grow at most 2^8 = 256 leaves per tree, and compare it against LightGBM with `num_leaves=255`, the conventional near-equivalent setting."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": true
+},
+"outputs": [],
+"source": [
+"results_dict = dict()"
+]
+},
 {
 "cell_type": "markdown",
 "metadata": {
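As a quick back-of-the-envelope check of the complexity matching described in the new Training cell (the numbers follow directly from the tree geometry):

```python
# A full binary tree of depth d has at most 2**d leaves, so XGBoost with
# max_depth=8 can grow up to 256 leaves per tree; num_leaves=255 keeps
# LightGBM's leaf-wise trees just under that bound.
max_depth = 8
print(2 ** max_depth)      # 256
print(2 ** max_depth - 1)  # 255, the value used for num_leaves
```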
@@ -890,9 +910,6 @@
 "editable": true
 },
 "source": [
-"### Training \n",
-"Now we are going to create two pipelines, one for XGBoost and one for LightGBM. The two libraries work differently under the hood, so it is difficult to compare them in exactly the same model setting: XGBoost grows its trees depth-wise and controls model complexity with `max_depth`, while LightGBM grows them leaf-wise and controls complexity with `num_leaves`. As a trade-off, we use XGBoost with `max_depth=8`, which can grow at most 2^8 = 256 leaves per tree, and compare it against LightGBM with `num_leaves=255`, the conventional near-equivalent setting.\n",
-"\n",
 "Let's start with the XGBoost classifier."
 ]
 },
@@ -907,7 +924,7 @@
 "outputs": [],
 "source": [
 "xgb_clf_pipeline = XGBRegressor(max_depth=8,\n",
-"                                n_estimators=100,\n",
+"                                n_estimators=50,\n",
 "                                min_child_weight=30,\n",
 "                                learning_rate=0.1,\n",
 "                                subsample=0.80,\n",
@@ -1048,8 +1065,21 @@
 }
 ],
 "source": [
-"%%time\n",
-"xgb_clf_pipeline.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='logloss')"
+"with Timer() as t:\n",
+"    xgb_clf_pipeline.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='logloss')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": true
+},
+"outputs": [],
+"source": [
+"results_dict['xgb'] = {\n",
+"    'train_time': t.interval\n",
+"}"
 ]
 },
 {
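The `%%time` magic only prints to stdout, which is presumably why this commit switches to a `Timer` context manager whose elapsed time can be stored in `results_dict`. The implementation of `libs.timer.Timer` is not shown in this diff; a minimal sketch consistent with the `t.interval` usage above might look like:

```python
from timeit import default_timer

class Timer(object):
    """Context manager that records elapsed wall-clock time in .interval."""

    def __enter__(self):
        self._start = default_timer()
        return self

    def __exit__(self, *args):
        self.interval = default_timer() - self._start
```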
@@ -1073,7 +1103,7 @@
 "outputs": [],
 "source": [
 "lgbm_clf_pipeline = LGBMRegressor(num_leaves=255,\n",
-"                                  n_estimators=100,\n",
+"                                  n_estimators=50,\n",
 "                                  min_child_weight=30,\n",
 "                                  learning_rate=0.1,\n",
 "                                  subsample=0.80,\n",
@@ -1235,8 +1265,21 @@
 }
 ],
 "source": [
-"%%time\n",
-"lgbm_clf_pipeline.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric=loglikelood)"
+"with Timer() as t:\n",
+"    lgbm_clf_pipeline.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='logloss')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": true
+},
+"outputs": [],
+"source": [
+"results_dict['lgbm'] = {\n",
+"    'train_time': t.interval\n",
+"}"
 ]
 },
 {
@@ -1281,8 +1324,21 @@
 }
 ],
 "source": [
-"%%time\n",
-"y_prob_xgb = np.clip(xgb_clf_pipeline.predict(X_test), 0.0001, 0.9999)"
+"with Timer() as t:\n",
+"    y_prob_xgb = np.clip(xgb_clf_pipeline.predict(X_test), 0.0001, 0.9999)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": true
+},
+"outputs": [],
+"source": [
+"results_dict['xgb'].update({\n",
+"    'test_time': t.interval\n",
+"})"
 ]
 },
 {
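Since `XGBRegressor` and `LGBMRegressor` are regressors, their raw predictions are not guaranteed to stay inside (0, 1), and log-based metrics are undefined outside that range; that is presumably why the predictions are clipped to [0.0001, 0.9999]. A toy illustration (the values are made up):

```python
import numpy as np
from sklearn.metrics import log_loss

y_true = [0, 1, 1, 0]
raw = np.array([-0.02, 1.03, 0.87, 0.10])  # raw regressor output can leave [0, 1]
y_prob = np.clip(raw, 0.0001, 0.9999)      # valid probabilities, safe to score
print(log_loss(y_true, y_prob))
```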
@@ -1304,8 +1360,21 @@
 }
 ],
 "source": [
-"%%time\n",
-"y_prob_lgbm = np.clip(lgbm_clf_pipeline.predict(X_test), 0.0001, 0.9999)"
+"with Timer() as t:\n",
+"    y_prob_lgbm = np.clip(lgbm_clf_pipeline.predict(X_test), 0.0001, 0.9999)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": true
+},
+"outputs": [],
+"source": [
+"results_dict['lgbm'].update({\n",
+"    'test_time': t.interval\n",
+"})"
 ]
 },
 {
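With the `update()` pattern above, each library accumulates its timings in one nested dict instead of overwriting it, so `results_dict` should end up shaped like `{'xgb': {'train_time': ..., 'test_time': ...}, 'lgbm': {...}}`. A quick way to inspect it:

```python
import json

# Values depend on the run; the shape assumes the update() pattern above.
print(json.dumps(results_dict, indent=2))
```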
@@ -1319,6 +1388,15 @@
 "We are going to obtain some metrics to evaluate the performance of each of the models."
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": true
+},
+"outputs": [],
+"source": []
+},
 {
 "cell_type": "code",
 "execution_count": 23,
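The new evaluation cell is still empty in this revision. A sketch of what it might compute, reusing the names already defined in the notebook (`y_test`, `y_prob_xgb`, `y_prob_lgbm`, `results_dict`); the 0.5 decision threshold is an assumption:

```python
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, log_loss)

# Hypothetical evaluation: threshold the clipped probabilities and record
# the usual binary-classification metrics alongside the timings.
for name, y_prob in [('xgb', y_prob_xgb), ('lgbm', y_prob_lgbm)]:
    y_pred = (y_prob > 0.5).astype(int)
    results_dict[name].update({
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'log_loss': log_loss(y_test, y_prob),
    })
```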
@@ -2676,7 +2676,10 @@
 },
 {
 "cell_type": "markdown",
-"metadata": {},
+"metadata": {
+"deletable": true,
+"editable": true
+},
 "source": [
 "Now let's try with LightGBM"
 ]
@@ -2850,17 +2853,6 @@
 "print(\"Score of {} for training set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_train2, clf.predict(X_train2))))\n",
 "print(\"Score of {} for test set: {:.4f}.\".format(clf.__class__.__name__, accuracy_score(y_test2, y_pred2)))"
 ]
 },
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {
-"collapsed": true,
-"deletable": true,
-"editable": true
-},
-"outputs": [],
-"source": []
-},
 ],
 "metadata": {