Merge pull request #17 from microsoft/journeyman/readme_scenario_and_notebook

Updating README to make it clearer that this tool expects initial con…

This commit is contained in:
Commit 8dc778f901
@@ -132,6 +132,8 @@ Create a brain and write Inkling with type definitions that match what the simul
 Be sure to specify `noise_percentage` in your Inkling scenario. Training a brain can benefit from adding noise to the states of an approximated simulator to promote robustness.
 
+> The episode_start in `train_bonsai_main.py` expects the initial conditions of the states defined in `config_model.yml` to match the scenario dictionary passed in. If you want to pass in other variables that are not modeled by the datadrivenmodel tool (except for noise_percentage), you'll likely have to modify `train_bonsai_main.py`.
+
 ```javascript
 lesson `Start Inverted` {
     scenario {
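The added note pins down a simple contract: every key in the scenario dictionary, except `noise_percentage`, must name a state whose initial condition is defined in `config_model.yml`. Below is a minimal sketch of that contract; `apply_scenario` and the state names are hypothetical illustrations, not the actual `train_bonsai_main.py` code.

```python
def apply_scenario(scenario: dict, states: dict) -> float:
    """Copy Inkling scenario initial conditions onto the model states.

    Returns noise_percentage, the one scenario key that is not expected
    to name a modeled state.
    """
    noise_percentage = scenario.get("noise_percentage", 0.0)
    for name, value in scenario.items():
        if name == "noise_percentage":
            continue
        if name not in states:
            # Unmodeled scenario variables require modifying train_bonsai_main.py.
            raise KeyError(f"scenario key '{name}' is not a state in config_model.yml")
        states[name] = value
    return noise_percentage

# Made-up state names, just to exercise the helper.
noise = apply_scenario({"theta": 0.0, "noise_percentage": 5},
                       {"theta": 1.2, "alpha": 0.0})
```
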
@@ -65,7 +65,7 @@
 "source": [
 "filenames = [\n",
 " './csv_data/example_data.csv',\n",
-" './csv_data/faulty_data.csv',\n",
+" #'./csv_data/faulty_data.csv',\n",
 "]"
 ]
 },
@@ -194,7 +194,7 @@
 "editable": false,
 "nbgrader": {
 "cell_type": "code",
-"checksum": "1a585e582a1642fb084cb68a7e2a48d2",
+"checksum": "68b048bb5cfa3c0035df6604e346159f",
 "grade": true,
 "grade_id": "cell-075b43e670fe45d3",
 "locked": true,
@@ -257,7 +257,7 @@
 " check_nan.append(nan_count)\n",
 " check_.append(_)\n",
 " for i in range(0, df[req_keys].shape[1]):\n",
-" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], _[i]))\n"
+" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], set(_[i])))\n"
 ]
 },
 {
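The only functional change in this cell wraps `_[i]` in `set()`, so a column's repeated issue values are reported once instead of once per offending row. A toy illustration (the issue strings are made up):

```python
issues = ['negative', 'negative', 'out_of_range', 'negative']
print(set(issues))  # {'negative', 'out_of_range'} -- duplicates collapsed
```
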
@@ -294,7 +294,7 @@
 "editable": false,
 "nbgrader": {
 "cell_type": "code",
-"checksum": "20543c3d75f2615aa84bfbe70d7e7bb9",
+"checksum": "150b07c15b65cb50779cdd1507a68942",
 "grade": true,
 "grade_id": "cell-125c8ee9119954e1",
 "locked": true,
@@ -313,7 +313,7 @@
 "from env_data_modeler import env_gb_modeler\n",
 "\n",
 "def plotOutliers(y_set, y_predict_all, outlier_data, config):\n",
-" fig = plt.figure()\n",
+" fig = plt.figure(figsize=(20, 15))\n",
 " numSubPlots = y_set.shape[1]\n",
 "\n",
 " outlierData = outlier_data['y' + str(0)]\n",
@@ -322,22 +322,26 @@
 " if value == 'state':\n",
 " dataLabel.append(key)\n",
 "\n",
-" ax1 = plt.subplot(numSubPlots, 1, 0+1)\n",
+" ax1 = plt.subplot(1, 1, 0+1)\n",
 " plt.plot(y_set[:,0], label=dataLabel[0], linewidth=1, color = 'blue' )\n",
 " plt.plot(y_predict_all[0], label=dataLabel[0], linewidth=1, color = 'black' )\n",
 " plt.scatter(outlierData,y_set[outlierData,0], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
 " plt.xticks(rotation='horizontal')\n",
 " plt.legend(loc='upper right')\n",
 " \n",
-" for i in range(1,numSubPlots): \n",
-" outlierData = outlier_data['y' + str(i)]\n",
-"\n",
-" ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)\n",
-" plt.plot(y_set[:,i], label=dataLabel[i], linewidth=1, color = 'blue' )\n",
-" plt.plot(y_predict_all[i], label=dataLabel[i], linewidth=1, color = 'black' )\n",
-" plt.scatter(outlierData,y_set[outlierData,i], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
-" plt.xticks(rotation='horizontal')\n",
-" plt.legend(loc='upper right')\n",
+" for i in range(1,numSubPlots, 2):\n",
+" fig = plt.figure(figsize=(20, 15))\n",
+" outlierData = outlier_data['y' + str(i)]\n",
+" for j in range(2):\n",
+" try:\n",
+" ax2 = plt.subplot(2, 1, j+1, sharex=ax1)\n",
+" plt.plot(y_set[:,i+j], label=dataLabel[i+j], linewidth=1, color = 'blue' )\n",
+" plt.plot(y_predict_all[i+j], label=dataLabel[i+j], linewidth=1, color = 'black' )\n",
+" plt.scatter(outlierData,y_set[outlierData,i+j], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
+" plt.xticks(rotation='horizontal')\n",
+" plt.legend(loc='upper right')\n",
+" except:\n",
+" pass\n",
 "\n",
 " # plt.show()\n",
 " \n",
@@ -361,7 +365,7 @@
 " return outL[0]\n",
 "\n",
 "def plotInputs(x_set, y_set, config):\n",
-" fig = plt.figure()\n",
+" fig = plt.figure(figsize=(20, 15))\n",
 " numSubPlots = x_set.shape[1] - y_set.shape[1] ## Num of inputs\n",
 " \n",
 " dataLabel = []\n",
@@ -458,7 +462,7 @@
 "editable": false,
 "nbgrader": {
 "cell_type": "code",
-"checksum": "3555db332f91478d30a5238de94fc487",
+"checksum": "24fbb743fbc4886b04ee32c8781baa65",
 "grade": true,
 "grade_id": "cell-169a850c9712f865",
 "locked": true,
@@ -478,7 +482,7 @@
 "\n",
 "nan_count, _ = hasNaN(df[req_keys])\n",
 "for i in range(0, df[req_keys].shape[1]):\n",
-" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], _[i]))\n"
+" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], set(_[i])))\n"
 ]
 },
 {
@@ -518,7 +522,7 @@
 "editable": false,
 "nbgrader": {
 "cell_type": "code",
-"checksum": "ef087ec26d29f32bfed74ee5c64359e6",
+"checksum": "01679b4490fb6e7dbd1479b687f7159b",
 "grade": true,
 "grade_id": "cell-8094f7632eb6baee",
 "locked": true,
@@ -537,7 +541,7 @@
 "from datamodeler import read_env_data\n",
 "\n",
 "def feature_plots(feature_data, state_space_dim, action_space_dim, config, total_width=0.5):\n",
-" fig, ax = plt.subplots()\n",
+" fig, ax = plt.subplots(figsize=(20, 15))\n",
 "\n",
 " colors = plt.rcParams['axes.prop_cycle'].by_key()['color'] \n",
 " n_bars = len(feature_data)\n",
@@ -555,6 +559,7 @@
 " plt.ylabel('Feature Importance', fontsize=18)\n",
 "\n",
 " plt.xticks(ticks=range(state_space_dim+action_space_dim), labels=config['IO']['feature_name'].keys())\n",
+" ax.tick_params(labelrotation=90)\n",
 " plt.show()\n",
 "\n",
 "state_space_dim = 0\n",
@@ -697,7 +702,7 @@
 "editable": false,
 "nbgrader": {
 "cell_type": "code",
-"checksum": "b7459a0ae18cd384ac698611b5dec5b5",
+"checksum": "54c75706bf3a6c7a969e702e61c6a3cb",
 "grade": true,
 "grade_id": "cell-22ed4f13332907f2",
 "locked": true,
@@ -716,11 +721,12 @@
 "\n",
 "with open('config/model_limits.yml') as conf:\n",
 " model_limits = yaml.full_load(conf)\n",
-" \n",
-"fig, axs = plt.subplots(1, len(feature_names), sharey=True)\n",
-"for i, f in enumerate(feature_names):\n",
-" (n, bins, patches) = axs[i].hist(x_set[:, i], bins=100)\n",
-" plt.setp(axs[i], xlabel=feature_names[i])\n"
+"\n",
+"for j in range(0, len(feature_names), 3):\n",
+" fig, axs = plt.subplots(3, 1, figsize=(20,15), sharey=True)\n",
+" for i, f in enumerate(feature_names[j:j+3]):\n",
+" (n, bins, patches) = axs[i].hist(x_set[:, i], bins=100)\n",
+" plt.setp(axs[i], xlabel=feature_names[i])\n"
 ]
 },
 {
@@ -734,7 +740,9 @@
 "- evaluate region confidence with SME max\n",
 "- evaluate region confidence with SME min\n",
 "\n",
-"We fit a Gaussian Mixture Model (GMM) to the data to cluster distributions with means and covariances. We can then sample the GMM with a random state-action pair and evaluate which regions to trust compared against the SME's desired limits. "
+"We fit a Gaussian Mixture Model (GMM) to the data to cluster distributions with means and covariances. We can then test the GMM with a random state-action pair and evaluate which regions to trust compared against the SME's desired limits. \n",
+"\n",
+"> Note: Section C needs improvement; GMMs are probably not as helpful here as KL-divergence or MMD. "
 ]
 },
 {
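The reworded cell says the GMM is *tested* with a random state-action pair rather than sampled from. Below is a minimal sketch of that kind of check with scikit-learn, on synthetic stand-in data; the shapes and component count are illustrative, not the notebook's actual values.

```python
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
data = rng.normal(size=(500, 2))  # stand-in for scaled state-action pairs

# Same estimator family and settings the notebook uses.
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0)
gmm.fit(data)

# Score a random state-action pair: a low log-likelihood flags a region the
# training data does not cover, where the approximated sim is less trustworthy.
query = rng.normal(size=(1, 2))
print(gmm.score_samples(query))
```
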
@@ -745,7 +753,7 @@
 "editable": false,
 "nbgrader": {
 "cell_type": "code",
-"checksum": "ed4356f51617c8bb2ded26e1bda412f4",
+"checksum": "8d20228298a49dfd38a3419817d8d12a",
 "grade": true,
 "grade_id": "cell-d880180658ddb8e4",
 "locked": true,
@@ -766,22 +774,8 @@
 "initial_n_components = pca_data.shape[1]\n",
 "\n",
 "from sklearn.mixture import GaussianMixture\n",
 "\n",
-"bic_list = []\n",
-"upper_comp = 20\n",
-"print('Evaluating best number of components for fitting using GMM...')\n",
-"for i in range(initial_n_components, initial_n_components+upper_comp, 2):\n",
-" gmm = GaussianMixture(\n",
-" n_components=i,\n",
-" covariance_type='full',\n",
-" random_state=0\n",
-" )\n",
-" gmm.fit(dfs)\n",
-" bic_list.append(gmm.bic(dfs)) \n",
-" print('{} of {}...'.format(i, initial_n_components+upper_comp))\n",
-" \n",
-"n_components = bic_list.index(min(bic_list))+len(feature_names)\n",
-"print('picking {} components using Bayesian Information Criterion'.format(n_components))\n",
+"n_components = len(feature_names)\n",
 "\n",
 "gmm = GaussianMixture(\n",
 " n_components=n_components,\n",
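For reference, the deleted loop was doing Bayesian Information Criterion (BIC) model selection; the commit pins `n_components` to `len(feature_names)` instead. A compact sketch of BIC selection on synthetic data (the candidate range and data are illustrative): indexing the candidate list directly sidesteps the recovery arithmetic the removed code attempted with `bic_list.index(min(bic_list))`, which did not account for its step of 2.

```python
import numpy as np
from sklearn.mixture import GaussianMixture

data = np.random.default_rng(0).normal(size=(500, 4))

candidates = list(range(1, 11))
bics = [
    GaussianMixture(n_components=k, covariance_type='full', random_state=0)
    .fit(data)
    .bic(data)
    for k in candidates
]
best_k = candidates[int(np.argmin(bics))]  # lowest BIC wins
print('best n_components by BIC:', best_k)
```
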
@@ -933,6 +927,13 @@
 "most_likely = max(np.ravel(prob_result))\n",
 "assert(most_likely > threshold and most_likely != 1 )"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": []
+}
 ],
 "metadata": {

@@ -21,7 +21,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -59,13 +59,13 @@
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
 "filenames = [\n",
 " './csv_data/example_data.csv',\n",
-" './csv_data/faulty_data.csv',\n",
+" #'./csv_data/faulty_data.csv',\n",
 "]"
 ]
 },
@@ -78,7 +78,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": null,
 "metadata": {
 "nbgrader": {
 "grade": true,
@@ -249,7 +249,7 @@
 " check_nan.append(nan_count)\n",
 " check_.append(_)\n",
 " for i in range(0, df[req_keys].shape[1]):\n",
-" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], _[i]))\n",
+" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], set(_[i])))\n",
 "\n",
 "### BEGIN HIDDEN TESTS\n",
 "for dataset in check_nan:\n",
@@ -311,7 +311,7 @@
 "from env_data_modeler import env_gb_modeler\n",
 "\n",
 "def plotOutliers(y_set, y_predict_all, outlier_data, config):\n",
-" fig = plt.figure()\n",
+" fig = plt.figure(figsize=(20, 15))\n",
 " numSubPlots = y_set.shape[1]\n",
 "\n",
 " outlierData = outlier_data['y' + str(0)]\n",
@@ -320,22 +320,26 @@
 " if value == 'state':\n",
 " dataLabel.append(key)\n",
 "\n",
-" ax1 = plt.subplot(numSubPlots, 1, 0+1)\n",
+" ax1 = plt.subplot(1, 1, 0+1)\n",
 " plt.plot(y_set[:,0], label=dataLabel[0], linewidth=1, color = 'blue' )\n",
 " plt.plot(y_predict_all[0], label=dataLabel[0], linewidth=1, color = 'black' )\n",
 " plt.scatter(outlierData,y_set[outlierData,0], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
 " plt.xticks(rotation='horizontal')\n",
 " plt.legend(loc='upper right')\n",
 " \n",
-" for i in range(1,numSubPlots): \n",
-" outlierData = outlier_data['y' + str(i)]\n",
-"\n",
-" ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)\n",
-" plt.plot(y_set[:,i], label=dataLabel[i], linewidth=1, color = 'blue' )\n",
-" plt.plot(y_predict_all[i], label=dataLabel[i], linewidth=1, color = 'black' )\n",
-" plt.scatter(outlierData,y_set[outlierData,i], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
-" plt.xticks(rotation='horizontal')\n",
-" plt.legend(loc='upper right')\n",
+" for i in range(1,numSubPlots, 2):\n",
+" fig = plt.figure(figsize=(20, 15))\n",
+" outlierData = outlier_data['y' + str(i)]\n",
+" for j in range(2):\n",
+" try:\n",
+" ax2 = plt.subplot(2, 1, j+1, sharex=ax1)\n",
+" plt.plot(y_set[:,i+j], label=dataLabel[i+j], linewidth=1, color = 'blue' )\n",
+" plt.plot(y_predict_all[i+j], label=dataLabel[i+j], linewidth=1, color = 'black' )\n",
+" plt.scatter(outlierData,y_set[outlierData,i+j], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)\n",
+" plt.xticks(rotation='horizontal')\n",
+" plt.legend(loc='upper right')\n",
+" except:\n",
+" pass\n",
 "\n",
 " # plt.show()\n",
 " \n",
@@ -359,7 +363,7 @@
 " return outL[0]\n",
 "\n",
 "def plotInputs(x_set, y_set, config):\n",
-" fig = plt.figure()\n",
+" fig = plt.figure(figsize=(20, 15))\n",
 " numSubPlots = x_set.shape[1] - y_set.shape[1] ## Num of inputs\n",
 " \n",
 " dataLabel = []\n",
@@ -472,7 +476,7 @@
 "\n",
 "nan_count, _ = hasNaN(df[req_keys])\n",
 "for i in range(0, df[req_keys].shape[1]):\n",
-" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], _[i]))\n",
+" print('Detected {} NaN and the following issues in column, {}: {}'.format(int(nan_count[0, i]), req_keys[i], set(_[i])))\n",
 "\n",
 "### BEGIN HIDDEN TESTS\n",
 "for col in nan_count:\n",
@@ -534,7 +538,7 @@
 "from datamodeler import read_env_data\n",
 "\n",
 "def feature_plots(feature_data, state_space_dim, action_space_dim, config, total_width=0.5):\n",
-" fig, ax = plt.subplots()\n",
+" fig, ax = plt.subplots(figsize=(20, 15))\n",
 "\n",
 " colors = plt.rcParams['axes.prop_cycle'].by_key()['color'] \n",
 " n_bars = len(feature_data)\n",
@@ -552,6 +556,7 @@
 " plt.ylabel('Feature Importance', fontsize=18)\n",
 "\n",
 " plt.xticks(ticks=range(state_space_dim+action_space_dim), labels=config['IO']['feature_name'].keys())\n",
+" ax.tick_params(labelrotation=90)\n",
 " plt.show()\n",
 "\n",
 "state_space_dim = 0\n",
@@ -705,11 +710,12 @@
 "\n",
 "with open('config/model_limits.yml') as conf:\n",
 " model_limits = yaml.full_load(conf)\n",
-" \n",
-"fig, axs = plt.subplots(1, len(feature_names), sharey=True)\n",
-"for i, f in enumerate(feature_names):\n",
-" (n, bins, patches) = axs[i].hist(x_set[:, i], bins=100)\n",
-" plt.setp(axs[i], xlabel=feature_names[i])\n",
+"\n",
+"for j in range(0, len(feature_names), 3):\n",
+" fig, axs = plt.subplots(3, 1, figsize=(20,15), sharey=True)\n",
+" for i, f in enumerate(feature_names[j:j+3]):\n",
+" (n, bins, patches) = axs[i].hist(x_set[:, i], bins=100)\n",
+" plt.setp(axs[i], xlabel=feature_names[i])\n",
 "\n",
 "### BEGIN HIDDEN TESTS\n",
 "for key in config['IO']['feature_name'].keys():\n",
@@ -730,7 +736,9 @@
 "- evaluate region confidence with SME max\n",
 "- evaluate region confidence with SME min\n",
 "\n",
-"We fit a Gaussian Mixture Model (GMM) to the data to cluster distributions with means and covariances. We can then sample the GMM with a random state-action pair and evaluate which regions to trust compared against the SME's desired limits. "
+"We fit a Gaussian Mixture Model (GMM) to the data to cluster distributions with means and covariances. We can then test the GMM with a random state-action pair and evaluate which regions to trust compared against the SME's desired limits. \n",
+"\n",
+"> Note: Section C needs improvement; GMMs are probably not as helpful here as KL-divergence or MMD. "
 ]
 },
 {
@@ -758,22 +766,8 @@
 "initial_n_components = pca_data.shape[1]\n",
 "\n",
 "from sklearn.mixture import GaussianMixture\n",
 "\n",
-"bic_list = []\n",
-"upper_comp = 20\n",
-"print('Evaluating best number of components for fitting using GMM...')\n",
-"for i in range(initial_n_components, initial_n_components+upper_comp, 2):\n",
-" gmm = GaussianMixture(\n",
-" n_components=i,\n",
-" covariance_type='full',\n",
-" random_state=0\n",
-" )\n",
-" gmm.fit(dfs)\n",
-" bic_list.append(gmm.bic(dfs)) \n",
-" print('{} of {}...'.format(i, initial_n_components+upper_comp))\n",
-" \n",
-"n_components = bic_list.index(min(bic_list))+len(feature_names)\n",
-"print('picking {} components using Bayesian Information Criterion'.format(n_components))\n",
+"n_components = len(feature_names)\n",
 "\n",
 "gmm = GaussianMixture(\n",
 " n_components=n_components,\n",
@@ -909,6 +903,13 @@
 "most_likely = max(np.ravel(prob_result))\n",
 "assert(most_likely > threshold and most_likely != 1 )"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": []
+}
 ],
 "metadata": {