Merge pull request #17 from microsoft/journeyman/readme_scenario_and_notebook

Updating README to make it clearer that this tool expects initial con…
Mikayel Mirzoyan 2020-12-08 10:58:17 -08:00 committed by GitHub
Parents 5c149c9239 70d4d1f4c4
Commit 8dc778f901
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 128 additions and 123 deletions

View file

@@ -132,6 +132,8 @@ Create a brain and write Inkling with type definitions that match what the simul
Be sure to specify `noise_percentage` in your Inkling's scenario. Training a brain can benefit from adding noise to the states of an approximated simulator to promote robustness.
> The `episode_start` method in `train_bonsai_main.py` expects the initial conditions of the states defined in `config_model.yml` to match the scenario dictionary passed in. If you want to pass in other variables that the datadrivenmodel tool does not model (other than `noise_percentage`), you'll likely have to modify `train_bonsai_main.py`.
```javascript
lesson `Start Inverted` {
scenario {

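For context, a minimal sketch of the matching that `episode_start` performs, assuming a hypothetical `config_model.yml` layout with state names marked under `IO/feature_name` (the same `config['IO']['feature_name']` access the notebooks below use); the actual logic in `train_bonsai_main.py` may differ:

```python
import yaml

# Sketch only: assumes config_model.yml marks modeled states under IO/feature_name,
# mirroring the config['IO']['feature_name'] lookups used in the notebooks below.
with open("config/config_model.yml") as conf:
    config = yaml.full_load(conf)

modeled_states = [k for k, v in config["IO"]["feature_name"].items() if v == "state"]

def episode_start(scenario: dict) -> dict:
    """Map the Inkling scenario dictionary onto the model's initial conditions."""
    # noise_percentage is handled separately; it is not a modeled state.
    initial_conditions = {k: v for k, v in scenario.items() if k != "noise_percentage"}
    unknown = set(initial_conditions) - set(modeled_states)
    if unknown:
        # Any other scenario key requires modifying train_bonsai_main.py.
        raise KeyError("Scenario keys not modeled by the tool: {}".format(unknown))
    return initial_conditions
```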
View file

@@ -65,7 +65,7 @@
"source": [
"filenames = [\n",
" './csv_data/example_data.csv',\n",
" './csv_data/faulty_data.csv',\n",
" #'./csv_data/faulty_data.csv',\n",
"]"
]
},
@@ -194,7 +194,7 @@
"editable": false,
"nbgrader": {
"cell_type": "code",
"checksum": "1a585e582a1642fb084cb68a7e2a48d2",
"checksum": "68b048bb5cfa3c0035df6604e346159f",
"grade": true,
"grade_id": "cell-075b43e670fe45d3",
"locked": true,
@@ -257,7 +257,7 @@
" check_nan.append(nan_count)\n",
" check_.append(_)\n",
" for i in range(0, df[req_keys].shape[1]):\n",
" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], _[i]))\n"
" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], set(_[i])))\n"
]
},
{
@@ -294,7 +294,7 @@
"editable": false,
"nbgrader": {
"cell_type": "code",
"checksum": "20543c3d75f2615aa84bfbe70d7e7bb9",
"checksum": "150b07c15b65cb50779cdd1507a68942",
"grade": true,
"grade_id": "cell-125c8ee9119954e1",
"locked": true,
@@ -313,7 +313,7 @@
"from env_data_modeler import env_gb_modeler\n",
"\n",
"def plotOutliers(y_set, y_predict_all, outlier_data, config):\n",
" fig = plt.figure()\n",
" fig = plt.figure(figsize=(20, 15))\n",
" numSubPlots = y_set.shape[1]\n",
"\n",
" outlierData = outlier_data['y' + str(0)]\n",
@@ -322,22 +322,26 @@
" if value == 'state':\n",
" dataLabel.append(key)\n",
"\n",
" ax1 = plt.subplot(numSubPlots, 1, 0+1)\n",
" ax1 = plt.subplot(1, 1, 0+1)\n",
" plt.plot(y_set[:,0], label=dataLabel[0], linewidth=1, color = 'blue' )\n",
" plt.plot(y_predict_all[0], label=dataLabel[0], linewidth=1, color = 'black' )\n",
" plt.scatter(outlierData,y_set[outlierData,0], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
" plt.xticks(rotation='horizontal')\n",
" plt.legend(loc='upper right')\n",
" \n",
" for i in range(1,numSubPlots): \n",
" outlierData = outlier_data['y' + str(i)]\n",
"\n",
" ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)\n",
" plt.plot(y_set[:,i], label=dataLabel[i], linewidth=1, color = 'blue' )\n",
" plt.plot(y_predict_all[i], label=dataLabel[i], linewidth=1, color = 'black' )\n",
" plt.scatter(outlierData,y_set[outlierData,i], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
" plt.xticks(rotation='horizontal')\n",
" plt.legend(loc='upper right')\n",
" for i in range(1,numSubPlots, 2):\n",
" fig = plt.figure(figsize=(20, 15))\n",
" outlierData = outlier_data['y' + str(i)]\n",
" for j in range(2):\n",
" try:\n",
" ax2 = plt.subplot(2, 1, j+1, sharex=ax1)\n",
" plt.plot(y_set[:,i+j], label=dataLabel[i+j], linewidth=1, color = 'blue' )\n",
" plt.plot(y_predict_all[i+j], label=dataLabel[i+j], linewidth=1, color = 'black' )\n",
" plt.scatter(outlierData,y_set[outlierData,i+j], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
" plt.xticks(rotation='horizontal')\n",
" plt.legend(loc='upper right')\n",
" except:\n",
" pass\n",
"\n",
" # plt.show()\n",
" \n",
@@ -361,7 +365,7 @@
" return outL[0]\n",
"\n",
"def plotInputs(x_set, y_set, config):\n",
" fig = plt.figure()\n",
" fig = plt.figure(figsize=(20, 15))\n",
" numSubPlots = x_set.shape[1] - y_set.shape[1] ## Num of inputs\n",
" \n",
" dataLabel = []\n",
@@ -458,7 +462,7 @@
"editable": false,
"nbgrader": {
"cell_type": "code",
"checksum": "3555db332f91478d30a5238de94fc487",
"checksum": "24fbb743fbc4886b04ee32c8781baa65",
"grade": true,
"grade_id": "cell-169a850c9712f865",
"locked": true,
@@ -478,7 +482,7 @@
"\n",
"nan_count, _ = hasNaN(df[req_keys])\n",
"for i in range(0, df[req_keys].shape[1]):\n",
" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], _[i]))\n"
" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], set(_[i])))\n"
]
},
{
@@ -518,7 +522,7 @@
"editable": false,
"nbgrader": {
"cell_type": "code",
"checksum": "ef087ec26d29f32bfed74ee5c64359e6",
"checksum": "01679b4490fb6e7dbd1479b687f7159b",
"grade": true,
"grade_id": "cell-8094f7632eb6baee",
"locked": true,
@@ -537,7 +541,7 @@
"from datamodeler import read_env_data\n",
"\n",
"def feature_plots(feature_data, state_space_dim, action_space_dim, config, total_width=0.5):\n",
" fig, ax = plt.subplots()\n",
" fig, ax = plt.subplots(figsize=(20, 15))\n",
"\n",
" colors = plt.rcParams['axes.prop_cycle'].by_key()['color'] \n",
" n_bars = len(feature_data)\n",
@@ -555,6 +559,7 @@
" plt.ylabel('Feature Importance', fontsize=18)\n",
"\n",
" plt.xticks(ticks=range(state_space_dim+action_space_dim), labels=config['IO']['feature_name'].keys())\n",
" ax.tick_params(labelrotation=90)\n",
" plt.show()\n",
"\n",
"state_space_dim = 0\n",
@@ -697,7 +702,7 @@
"editable": false,
"nbgrader": {
"cell_type": "code",
"checksum": "b7459a0ae18cd384ac698611b5dec5b5",
"checksum": "54c75706bf3a6c7a969e702e61c6a3cb",
"grade": true,
"grade_id": "cell-22ed4f13332907f2",
"locked": true,
@@ -716,11 +721,12 @@
"\n",
"with open('config/model_limits.yml') as conf:\n",
" model_limits = yaml.full_load(conf)\n",
" \n",
"fig, axs = plt.subplots(1, len(feature_names), sharey=True)\n",
"for i, f in enumerate(feature_names):\n",
" (n, bins, patches) = axs[i].hist(x_set[:, i], bins=100)\n",
" plt.setp(axs[i], xlabel=feature_names[i])\n"
"\n",
"for j in range(0, len(feature_names), 3):\n",
" fig, axs = plt.subplots(3, 1, figsize=(20,15), sharey=True)\n",
" for i, f in enumerate(feature_names[j:j+3]):\n",
" (n, bins, patches) = axs[i].hist(x_set[:, i], bins=100)\n",
" plt.setp(axs[i], xlabel=feature_names[i])\n"
]
},
{
@@ -734,7 +740,9 @@
"- evaluate region confidence with SME max\n",
"- evaluate region confidence with SME min\n",
"\n",
"We use a Gaussian Mixture Model (GMM) to fit to the data to be able to cluster distributions with means and covariances. We can then sample the GMM with a random state-action pair and evaluate the regions to trust based compared to SME desired limits. "
"We use a Gaussian Mixture Model (GMM) to fit to the data to be able to cluster distributions with means and covariances. We can then test the GMM with a random state-action pair and evaluate the regions to trust based compared to SME desired limits. \n",
"\n",
"> Note: Section C needs improvement, GMMs are probably not helpful here as much as KL-divergence or MMD. "
]
},
{
@@ -745,7 +753,7 @@
"editable": false,
"nbgrader": {
"cell_type": "code",
"checksum": "ed4356f51617c8bb2ded26e1bda412f4",
"checksum": "8d20228298a49dfd38a3419817d8d12a",
"grade": true,
"grade_id": "cell-d880180658ddb8e4",
"locked": true,
@@ -766,22 +774,8 @@
"initial_n_components = pca_data.shape[1]\n",
"\n",
"from sklearn.mixture import GaussianMixture\n",
"\n",
"bic_list = []\n",
"upper_comp = 20\n",
"print('Evaluating best number of components for fitting using GMM...')\n",
"for i in range(initial_n_components, initial_n_components+upper_comp, 2):\n",
" gmm = GaussianMixture(\n",
" n_components=i,\n",
" covariance_type='full',\n",
" random_state=0\n",
" )\n",
" gmm.fit(dfs)\n",
" bic_list.append(gmm.bic(dfs)) \n",
" print('{} of {}...'.format(i, initial_n_components+upper_comp))\n",
" \n",
"n_components = bic_list.index(min(bic_list))+len(feature_names)\n",
"print('picking {} components using Bayesian Information Criterion'.format(n_components))\n",
"n_components = len(feature_names)\n",
"\n",
"gmm = GaussianMixture(\n",
" n_components=n_components,\n",
@@ -933,6 +927,13 @@
"most_likely = max(np.ravel(prob_result))\n",
"assert(most_likely > threshold and most_likely != 1 )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

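For reference, a self-contained sketch of the GMM confidence check described in the notebook above, with synthetic data standing in for the notebook's scaled state-action set `dfs`, hypothetical `feature_names`, and a hypothetical `threshold`; `predict_proba` is one plausible way to produce the `prob_result` the notebook asserts on:

```python
import numpy as np
from sklearn.mixture import GaussianMixture

# Synthetic stand-in for the notebook's scaled state-action data (dfs).
rng = np.random.default_rng(0)
feature_names = ["theta", "alpha", "Vm"]  # hypothetical feature names
dfs = rng.normal(size=(500, len(feature_names)))

# As in the updated notebook: fix n_components to the feature count
# instead of searching with the Bayesian Information Criterion.
n_components = len(feature_names)
gmm = GaussianMixture(n_components=n_components, covariance_type="full", random_state=0)
gmm.fit(dfs)

# Test the GMM with a random state-action pair: predict_proba returns the
# per-component responsibilities, whose maximum acts as a confidence score.
sample = rng.normal(size=(1, len(feature_names)))
prob_result = gmm.predict_proba(sample)
most_likely = max(np.ravel(prob_result))
threshold = 0.5  # hypothetical confidence threshold
print("trusted region" if most_likely > threshold else "low-confidence region")
```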
View file

@@ -21,7 +21,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -59,13 +59,13 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filenames = [\n",
" './csv_data/example_data.csv',\n",
" './csv_data/faulty_data.csv',\n",
" #'./csv_data/faulty_data.csv',\n",
"]"
]
},
@@ -78,7 +78,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {
"nbgrader": {
"grade": true,
@@ -249,7 +249,7 @@
" check_nan.append(nan_count)\n",
" check_.append(_)\n",
" for i in range(0, df[req_keys].shape[1]):\n",
" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], _[i]))\n",
" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], set(_[i])))\n",
"\n",
"### BEGIN HIDDEN TESTS\n",
"for dataset in check_nan:\n",
@@ -311,7 +311,7 @@
"from env_data_modeler import env_gb_modeler\n",
"\n",
"def plotOutliers(y_set, y_predict_all, outlier_data, config):\n",
" fig = plt.figure()\n",
" fig = plt.figure(figsize=(20, 15))\n",
" numSubPlots = y_set.shape[1]\n",
"\n",
" outlierData = outlier_data['y' + str(0)]\n",
@@ -320,22 +320,26 @@
" if value == 'state':\n",
" dataLabel.append(key)\n",
"\n",
" ax1 = plt.subplot(numSubPlots, 1, 0+1)\n",
" ax1 = plt.subplot(1, 1, 0+1)\n",
" plt.plot(y_set[:,0], label=dataLabel[0], linewidth=1, color = 'blue' )\n",
" plt.plot(y_predict_all[0], label=dataLabel[0], linewidth=1, color = 'black' )\n",
" plt.scatter(outlierData,y_set[outlierData,0], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
" plt.xticks(rotation='horizontal')\n",
" plt.legend(loc='upper right')\n",
" \n",
" for i in range(1,numSubPlots): \n",
" outlierData = outlier_data['y' + str(i)]\n",
"\n",
" ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)\n",
" plt.plot(y_set[:,i], label=dataLabel[i], linewidth=1, color = 'blue' )\n",
" plt.plot(y_predict_all[i], label=dataLabel[i], linewidth=1, color = 'black' )\n",
" plt.scatter(outlierData,y_set[outlierData,i], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
" plt.xticks(rotation='horizontal')\n",
" plt.legend(loc='upper right')\n",
" for i in range(1,numSubPlots, 2):\n",
" fig = plt.figure(figsize=(20, 15))\n",
" outlierData = outlier_data['y' + str(i)]\n",
" for j in range(2):\n",
" try:\n",
" ax2 = plt.subplot(2, 1, j+1, sharex=ax1)\n",
" plt.plot(y_set[:,i+j], label=dataLabel[i+j], linewidth=1, color = 'blue' )\n",
" plt.plot(y_predict_all[i+j], label=dataLabel[i+j], linewidth=1, color = 'black' )\n",
" plt.scatter(outlierData,y_set[outlierData,i+j], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
" plt.xticks(rotation='horizontal')\n",
" plt.legend(loc='upper right')\n",
" except:\n",
" pass\n",
"\n",
" # plt.show()\n",
" \n",
@@ -359,7 +363,7 @@
" return outL[0]\n",
"\n",
"def plotInputs(x_set, y_set, config):\n",
" fig = plt.figure()\n",
" fig = plt.figure(figsize=(20, 15))\n",
" numSubPlots = x_set.shape[1] - y_set.shape[1] ## Num of inputs\n",
" \n",
" dataLabel = []\n",
@@ -472,7 +476,7 @@
"\n",
"nan_count, _ = hasNaN(df[req_keys])\n",
"for i in range(0, df[req_keys].shape[1]):\n",
" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], _[i]))\n",
" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], set(_[i])))\n",
"\n",
"### BEGIN HIDDEN TESTS\n",
"for col in nan_count:\n",
@@ -534,7 +538,7 @@
"from datamodeler import read_env_data\n",
"\n",
"def feature_plots(feature_data, state_space_dim, action_space_dim, config, total_width=0.5):\n",
" fig, ax = plt.subplots()\n",
" fig, ax = plt.subplots(figsize=(20, 15))\n",
"\n",
" colors = plt.rcParams['axes.prop_cycle'].by_key()['color'] \n",
" n_bars = len(feature_data)\n",
@@ -552,6 +556,7 @@
" plt.ylabel('Feature Importance', fontsize=18)\n",
"\n",
" plt.xticks(ticks=range(state_space_dim+action_space_dim), labels=config['IO']['feature_name'].keys())\n",
" ax.tick_params(labelrotation=90)\n",
" plt.show()\n",
"\n",
"state_space_dim = 0\n",
@@ -705,11 +710,12 @@
"\n",
"with open('config/model_limits.yml') as conf:\n",
" model_limits = yaml.full_load(conf)\n",
" \n",
"fig, axs = plt.subplots(1, len(feature_names), sharey=True)\n",
"for i, f in enumerate(feature_names):\n",
" (n, bins, patches) = axs[i].hist(x_set[:, i], bins=100)\n",
" plt.setp(axs[i], xlabel=feature_names[i])\n",
"\n",
"for j in range(0, len(feature_names), 3):\n",
" fig, axs = plt.subplots(3, 1, figsize=(20,15), sharey=True)\n",
" for i, f in enumerate(feature_names[j:j+3]):\n",
" (n, bins, patches) = axs[i].hist(x_set[:, i], bins=100)\n",
" plt.setp(axs[i], xlabel=feature_names[i])\n",
"\n",
"### BEGIN HIDDEN TESTS\n",
"for key in config['IO']['feature_name'].keys():\n",
@@ -730,7 +736,9 @@
"- evaluate region confidence with SME max\n",
"- evaluate region confidence with SME min\n",
"\n",
"We use a Gaussian Mixture Model (GMM) to fit to the data to be able to cluster distributions with means and covariances. We can then sample the GMM with a random state-action pair and evaluate the regions to trust based compared to SME desired limits. "
"We use a Gaussian Mixture Model (GMM) to fit to the data to be able to cluster distributions with means and covariances. We can then test the GMM with a random state-action pair and evaluate the regions to trust based compared to SME desired limits. \n",
"\n",
"> Note: Section C needs improvement, GMMs are probably not helpful here as much as KL-divergence or MMD. "
]
},
{
@@ -758,22 +766,8 @@
"initial_n_components = pca_data.shape[1]\n",
"\n",
"from sklearn.mixture import GaussianMixture\n",
"\n",
"bic_list = []\n",
"upper_comp = 20\n",
"print('Evaluating best number of components for fitting using GMM...')\n",
"for i in range(initial_n_components, initial_n_components+upper_comp, 2):\n",
" gmm = GaussianMixture(\n",
" n_components=i,\n",
" covariance_type='full',\n",
" random_state=0\n",
" )\n",
" gmm.fit(dfs)\n",
" bic_list.append(gmm.bic(dfs)) \n",
" print('{} of {}...'.format(i, initial_n_components+upper_comp))\n",
" \n",
"n_components = bic_list.index(min(bic_list))+len(feature_names)\n",
"print('picking {} components using Bayesian Information Criterion'.format(n_components))\n",
"n_components = len(feature_names)\n",
"\n",
"gmm = GaussianMixture(\n",
" n_components=n_components,\n",
@@ -909,6 +903,13 @@
"most_likely = max(np.ravel(prob_result))\n",
"assert(most_likely > threshold and most_likely != 1 )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

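A hedged sketch of the per-column check that the `hasNaN` helper above appears to perform, including the `set()` de-duplication this commit adds to the print; the notebook's real helper may collect its issue markers differently:

```python
import numpy as np
import pandas as pd

def hasNaN(df: pd.DataFrame):
    """Sketch of the notebook's hasNaN: returns a 1 x n_cols array of NaN
    counts plus, per column, a list of issue markers (assumed here to be
    the offending row indices)."""
    nan_count = df.isna().sum().to_numpy().reshape(1, -1)
    issues = [df.index[df[col].isna()].tolist() for col in df.columns]
    return nan_count, issues

# Hypothetical two-column frame with a few NaNs.
df = pd.DataFrame({"theta": [0.1, np.nan, 0.2, np.nan],
                   "alpha": [1.0, 2.0, np.nan, 3.0]})
req_keys = ["theta", "alpha"]

nan_count, _ = hasNaN(df[req_keys])
for i in range(0, df[req_keys].shape[1]):
    # set() collapses repeated issue markers, as in the updated print above.
    print('Detected {} NaN and the following issues in column, {}: {}'.format(
        int(nan_count[0, i]), req_keys[i], set(_[i])))
```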
View file

@@ -21,7 +21,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -59,13 +59,13 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filenames = [\n",
" './csv_data/example_data.csv',\n",
" './csv_data/faulty_data.csv',\n",
" #'./csv_data/faulty_data.csv',\n",
"]"
]
},
@@ -78,7 +78,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {
"nbgrader": {
"grade": true,
@@ -249,7 +249,7 @@
" check_nan.append(nan_count)\n",
" check_.append(_)\n",
" for i in range(0, df[req_keys].shape[1]):\n",
" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], _[i]))\n",
" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], set(_[i])))\n",
"\n",
"### BEGIN HIDDEN TESTS\n",
"for dataset in check_nan:\n",
@@ -311,7 +311,7 @@
"from env_data_modeler import env_gb_modeler\n",
"\n",
"def plotOutliers(y_set, y_predict_all, outlier_data, config):\n",
" fig = plt.figure()\n",
" fig = plt.figure(figsize=(20, 15))\n",
" numSubPlots = y_set.shape[1]\n",
"\n",
" outlierData = outlier_data['y' + str(0)]\n",
@@ -320,22 +320,26 @@
" if value == 'state':\n",
" dataLabel.append(key)\n",
"\n",
" ax1 = plt.subplot(numSubPlots, 1, 0+1)\n",
" ax1 = plt.subplot(1, 1, 0+1)\n",
" plt.plot(y_set[:,0], label=dataLabel[0], linewidth=1, color = 'blue' )\n",
" plt.plot(y_predict_all[0], label=dataLabel[0], linewidth=1, color = 'black' )\n",
" plt.scatter(outlierData,y_set[outlierData,0], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
" plt.xticks(rotation='horizontal')\n",
" plt.legend(loc='upper right')\n",
" \n",
" for i in range(1,numSubPlots): \n",
" outlierData = outlier_data['y' + str(i)]\n",
"\n",
" ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)\n",
" plt.plot(y_set[:,i], label=dataLabel[i], linewidth=1, color = 'blue' )\n",
" plt.plot(y_predict_all[i], label=dataLabel[i], linewidth=1, color = 'black' )\n",
" plt.scatter(outlierData,y_set[outlierData,i], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
" plt.xticks(rotation='horizontal')\n",
" plt.legend(loc='upper right')\n",
" for i in range(1,numSubPlots, 2):\n",
" fig = plt.figure(figsize=(20, 15))\n",
" outlierData = outlier_data['y' + str(i)]\n",
" for j in range(2):\n",
" try:\n",
" ax2 = plt.subplot(2, 1, j+1, sharex=ax1)\n",
" plt.plot(y_set[:,i+j], label=dataLabel[i+j], linewidth=1, color = 'blue' )\n",
" plt.plot(y_predict_all[i+j], label=dataLabel[i+j], linewidth=1, color = 'black' )\n",
" plt.scatter(outlierData,y_set[outlierData,i+j], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
" plt.xticks(rotation='horizontal')\n",
" plt.legend(loc='upper right')\n",
" except:\n",
" pass\n",
"\n",
" # plt.show()\n",
" \n",
@@ -359,7 +363,7 @@
" return outL[0]\n",
"\n",
"def plotInputs(x_set, y_set, config):\n",
" fig = plt.figure()\n",
" fig = plt.figure(figsize=(20, 15))\n",
" numSubPlots = x_set.shape[1] - y_set.shape[1] ## Num of inputs\n",
" \n",
" dataLabel = []\n",
@@ -472,7 +476,7 @@
"\n",
"nan_count, _ = hasNaN(df[req_keys])\n",
"for i in range(0, df[req_keys].shape[1]):\n",
" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], _[i]))\n",
" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], set(_[i])))\n",
"\n",
"### BEGIN HIDDEN TESTS\n",
"for col in nan_count:\n",
@@ -534,7 +538,7 @@
"from datamodeler import read_env_data\n",
"\n",
"def feature_plots(feature_data, state_space_dim, action_space_dim, config, total_width=0.5):\n",
" fig, ax = plt.subplots()\n",
" fig, ax = plt.subplots(figsize=(20, 15))\n",
"\n",
" colors = plt.rcParams['axes.prop_cycle'].by_key()['color'] \n",
" n_bars = len(feature_data)\n",
@@ -552,6 +556,7 @@
" plt.ylabel('Feature Importance', fontsize=18)\n",
"\n",
" plt.xticks(ticks=range(state_space_dim+action_space_dim), labels=config['IO']['feature_name'].keys())\n",
" ax.tick_params(labelrotation=90)\n",
" plt.show()\n",
"\n",
"state_space_dim = 0\n",
@@ -705,11 +710,12 @@
"\n",
"with open('config/model_limits.yml') as conf:\n",
" model_limits = yaml.full_load(conf)\n",
" \n",
"fig, axs = plt.subplots(1, len(feature_names), sharey=True)\n",
"for i, f in enumerate(feature_names):\n",
" (n, bins, patches) = axs[i].hist(x_set[:, i], bins=100)\n",
" plt.setp(axs[i], xlabel=feature_names[i])\n",
"\n",
"for j in range(0, len(feature_names), 3):\n",
" fig, axs = plt.subplots(3, 1, figsize=(20,15), sharey=True)\n",
" for i, f in enumerate(feature_names[j:j+3]):\n",
" (n, bins, patches) = axs[i].hist(x_set[:, i], bins=100)\n",
" plt.setp(axs[i], xlabel=feature_names[i])\n",
"\n",
"### BEGIN HIDDEN TESTS\n",
"for key in config['IO']['feature_name'].keys():\n",
@@ -730,7 +736,9 @@
"- evaluate region confidence with SME max\n",
"- evaluate region confidence with SME min\n",
"\n",
"We use a Gaussian Mixture Model (GMM) to fit to the data to be able to cluster distributions with means and covariances. We can then sample the GMM with a random state-action pair and evaluate the regions to trust based compared to SME desired limits. "
"We use a Gaussian Mixture Model (GMM) to fit to the data to be able to cluster distributions with means and covariances. We can then test the GMM with a random state-action pair and evaluate the regions to trust based compared to SME desired limits. \n",
"\n",
"> Note: Section C needs improvement, GMMs are probably not helpful here as much as KL-divergence or MMD. "
]
},
{
@@ -758,22 +766,8 @@
"initial_n_components = pca_data.shape[1]\n",
"\n",
"from sklearn.mixture import GaussianMixture\n",
"\n",
"bic_list = []\n",
"upper_comp = 20\n",
"print('Evaluating best number of components for fitting using GMM...')\n",
"for i in range(initial_n_components, initial_n_components+upper_comp, 2):\n",
" gmm = GaussianMixture(\n",
" n_components=i,\n",
" covariance_type='full',\n",
" random_state=0\n",
" )\n",
" gmm.fit(dfs)\n",
" bic_list.append(gmm.bic(dfs)) \n",
" print('{} of {}...'.format(i, initial_n_components+upper_comp))\n",
" \n",
"n_components = bic_list.index(min(bic_list))+len(feature_names)\n",
"print('picking {} components using Bayesian Information Criterion'.format(n_components))\n",
"n_components = len(feature_names)\n",
"\n",
"gmm = GaussianMixture(\n",
" n_components=n_components,\n",
@@ -909,6 +903,13 @@
"most_likely = max(np.ravel(prob_result))\n",
"assert(most_likely > threshold and most_likely != 1 )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {