This commit is contained in:
Gustavo Pabon 2022-06-14 22:54:25 +00:00
Родитель 74946cffa2
Коммит ced8867fb6
14 изменённых файлов: 1500 добавлений и 1542 удалений

Просмотреть файл

@ -0,0 +1,651 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# iJungle Tutorial Training Pipeline Example\r\n",
"\r\n",
"*TODO: Summary of the iJungle technique* \r\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"import iJungle\n",
"from azureml.core import Workspace, Datastore, Dataset, Experiment, Environment, ScriptRunConfig\n",
"import pandas as pd\n",
"import os\n",
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice\n",
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.pipeline.core import Pipeline\n",
"from azureml.pipeline.steps import PythonScriptStep, HyperDriveStep, HyperDriveStepRun\n",
"from azureml.data import OutputFileDatasetConfig\n",
"\n",
"print(\"iJungle version:\", iJungle.__version__)"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "iJungle version: 0.1.73\n"
}
],
"execution_count": 1,
"metadata": {
"gather": {
"logged": 1655239329501
}
}
},
{
"cell_type": "markdown",
"source": [
"# 1. Parameters definition"
],
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
}
},
{
"cell_type": "code",
"source": [
"cluster_name = \"cluster4\"\r\n",
"environment_name = \"ijungle-training-env\"\r\n",
"working_datastore_name = \"workspaceblobstore\"\r\n",
"training_dataset_name = \"ijungle-trainining-dataset\"\r\n",
"test_dataset_name = \"ijungle-test-dataset\"\r\n",
"y_test_dataset_name = \"ijungle-y-test-dataset\"\r\n",
"index_feature = 'index'\r\n",
"pipeline_name = \"ijungle-training-pipeline\"\r\n",
"subsample_list = [4096, 2048, 1024, 512]\r\n",
"trees_list = [500, 100, 20, 10]\r\n",
"train_expected_m = 50000\r\n",
"overhead_expected_m = 50000\r\n",
"\r\n"
],
"outputs": [],
"execution_count": 2,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655239329609
}
}
},
{
"cell_type": "markdown",
"source": [
"# 2. Preparation of cluster, environment and run configuration"
],
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
}
},
{
"cell_type": "code",
"source": [
"\r\n",
"ws = Workspace.from_config()\r\n",
"\r\n",
"# Verify that cluster does not exist already\r\n",
"try:\r\n",
" pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)\r\n",
" print('Found existing cluster, use it.')\r\n",
"except:\r\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)\r\n",
" pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)\r\n",
"\r\n",
"# Creation of environment\r\n",
"new_env = Environment(environment_name)\r\n",
"packages = CondaDependencies.create(\r\n",
" conda_packages=['pip'],\r\n",
" pip_packages=['azureml-defaults','scikit-learn','pandas','pyarrow'])\r\n",
"\r\n",
"# Add iJungle library\r\n",
"\r\n",
"whl_filename = \"../dist/iJungle-\"+iJungle.__version__+\"-py3-none-any.whl\"\r\n",
"\r\n",
"whl_url = Environment.add_private_pip_wheel(workspace=ws,file_path = whl_filename, exist_ok=True)\r\n",
"packages.add_pip_package(whl_url)\r\n",
"\r\n",
"\r\n",
"# Add the dependencies to the environment\r\n",
"new_env.python.conda_dependencies = packages\r\n",
"\r\n",
"# Register the environment \r\n",
"new_env.register(workspace=ws)\r\n",
"registered_env = Environment.get(ws, environment_name)\r\n",
"\r\n",
"# Create a new runconfig object for the pipeline\r\n",
"pipeline_run_config = RunConfiguration()\r\n",
"\r\n",
"# Use the compute you created above. \r\n",
"pipeline_run_config.target = pipeline_cluster\r\n",
"\r\n",
"# Assign the environment to the run configuration\r\n",
"pipeline_run_config.environment = registered_env\r\n",
"\r\n",
"print (\"Run configuration created.\")\r\n"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Found existing cluster, use it.\nRun configuration created.\n"
}
],
"execution_count": 3,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655239330859
}
}
},
{
"cell_type": "markdown",
"source": [
"\n",
"# 3. Data preparation and dataset registration\n",
"\n",
"*TODO: description of the data*\n",
"\n",
"1. Use the following data in this repository *TODO: KDD url to download the files*\n",
" - kddcup.names\n",
" - kddcup.data.gz\n",
" - corrected.gz"
],
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
}
},
{
"cell_type": "code",
"source": [
"## Move to data directory\n",
"os.chdir(os.path.dirname(os.path.abspath('__file__'))+'/../data')\n",
"\n",
"## Generate DataFrame with kdd data(csv format)\n",
"names = list(pd.read_csv('kddcup.names',sep=':', header=None)[0])\n",
"df = pd.read_csv('kddcup.data.gz', header=None, names=names)\n",
"df_test = pd.read_csv('corrected.gz', header=None, names=names)\n",
"\n",
"print(\"Shape of raw data:\", df.shape)\n",
"print(\"Shape of test data:\", df_test.shape)\n",
"\n",
"# Keep only the entries whose service is http\n",
"df = df[df.service == 'http']\n",
"df_test = df_test[df_test.service == 'http']\n",
"print(\"Shape of filtered train data:\", df.shape)\n",
"print(\"Shape of filtered test data:\", df_test.shape)\n",
"\n",
"# Preparation of labels\n",
"y_train = df.pop('label')\n",
"y_test = df_test.pop('label')\n",
"y_train = pd.Series([1 if val == 'normal.' else -1 for val in y_train], name=\"y\")\n",
"y_test = pd.Series([1 if val == 'normal.' else -1 for val in y_test], name=\"y\")\n",
"print(\"Shape of train labels:\", y_train.shape)\n",
"print(\"Shape of test labels:\", y_test.shape)\n",
"\n",
"# Final preparation of training and testing data\n",
"df.drop(['service'], axis=1, inplace=True)\n",
"df_test.drop(['service'], axis=1, inplace=True)\n",
"\n",
"cat_columns = ['protocol_type', 'flag']\n",
"\n",
"for col in cat_columns:\n",
" df_test[col] = df_test[col].astype('category')\n",
" df[col] = df[col].astype('category')\n",
"\n",
"cat_columns = df.select_dtypes(['category']).columns\n",
"df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)\n",
"\n",
"cat_columns = df_test.select_dtypes(['category']).columns\n",
"df_test[cat_columns] = df_test[cat_columns].apply(lambda x: x.cat.codes)\n",
"\n",
"df.reset_index(inplace=True)\n",
"df_test.reset_index(inplace=True)\n",
"df_y_test = y_test.reset_index()\n",
"\n",
"print(\"Shape of train data:\", df.shape)\n",
"print(\"Shape of test data:\", df_test.shape)\n",
"print(\"Shape of y-test data:\", df_y_test.shape)\n",
"\n",
"datastore = Datastore.get(ws, working_datastore_name)\n",
"\n",
"print(\"Registering training dataset ...\")\n",
"train_dataset = Dataset.Tabular.register_pandas_dataframe(df, datastore, training_dataset_name)\n",
"\n",
"print(\"Registering testing dataset ...\")\n",
"test_dataset = Dataset.Tabular.register_pandas_dataframe(df_test, datastore, test_dataset_name)\n",
"\n",
"print(\"Registering y-testing dataset ...\")\n",
"y_test_dataset = Dataset.Tabular.register_pandas_dataframe(df_y_test, datastore, y_test_dataset_name)\n"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Shape of raw data: (4898431, 42)\nShape of test data: (311029, 42)\nShape of filtered train data: (623091, 42)\nShape of filtered test data: (41237, 42)\nShape of train labels: (623091,)\nShape of test labels: (41237,)\nShape of train data: (623091, 41)\nShape of test data: (41237, 41)\nShape of y-test data: (41237, 2)\nRegistering training dataset ...\nValidating arguments.\nArguments validated.\nSuccessfully obtained datastore reference and path.\nUploading file to managed-dataset/bd6448b8-8c72-4b54-be45-cd4e6ab5e212/\nSuccessfully uploaded file to datastore.\nCreating and registering a new dataset.\nSuccessfully created and registered a new dataset.\nRegistering testing dataset ...\nValidating arguments.\nArguments validated.\nSuccessfully obtained datastore reference and path.\nUploading file to managed-dataset/8d4f60e5-0127-4d71-8720-b0290a26ebce/\nSuccessfully uploaded file to datastore.\nCreating and registering a new dataset.\nSuccessfully created and registered a new dataset.\nRegistering y-testing dataset ...\nValidating arguments.\nArguments validated.\nSuccessfully obtained datastore reference and path.\nUploading file to managed-dataset/7fd41a9d-8dd1-4ebd-9a69-26cccf32c490/\nSuccessfully uploaded file to datastore.\nCreating and registering a new dataset.\nSuccessfully created and registered a new dataset.\n"
}
],
"execution_count": 4,
"metadata": {
"collapsed": true,
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655239359512
}
}
},
{
"cell_type": "markdown",
"source": [
"# 4. Creation of training pipeline"
],
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
}
},
{
"cell_type": "code",
"source": [
"# Get the training dataset\r\n",
"train_ds = ws.datasets.get(training_dataset_name)\r\n",
"\r\n",
"# Intermediate data\r\n",
"dataprep_output = OutputFileDatasetConfig(\r\n",
" name=\"processed_data\", \r\n",
" destination=(\r\n",
" ws.datastores.get(working_datastore_name), \r\n",
" \"invoices/{run-id}/{output-name}\")\r\n",
").as_upload()\r\n",
"\r\n",
"# Step 1, Run the data prep script\r\n",
"prep_step = PythonScriptStep(\r\n",
" name = \"Feature engineering Step\",\r\n",
" source_directory = \"../scripts\",\r\n",
" script_name = \"feat_eng.py\",\r\n",
" arguments = [\r\n",
" '--input-data', train_ds.as_named_input('input'),\r\n",
" '--prepped-data', dataprep_output,\r\n",
" '--index-feature', index_feature, \r\n",
" '--training', 'True' \r\n",
" ],\r\n",
" outputs=[dataprep_output],\r\n",
" compute_target = pipeline_cluster,\r\n",
" runconfig = pipeline_run_config,\r\n",
" allow_reuse = False\r\n",
")\r\n",
"\r\n",
"# Initial definition of the pipeline steps\r\n",
"pipeline_steps = [prep_step]\r\n"
],
"outputs": [],
"execution_count": 5,
"metadata": {
"gather": {
"logged": 1655239360524
}
}
},
{
"cell_type": "code",
"source": [
"# Next Step, run the training script\r\n",
"\r\n",
"dataprep_input = dataprep_output.as_input()\r\n",
"node_count = int(pipeline_cluster.serialize()['properties']['properties']['scaleSettings']['maxNodeCount'])\r\n",
"\r\n",
"model_output_dir = OutputFileDatasetConfig(\r\n",
" name=\"model_output\", \r\n",
" destination=(\r\n",
" ws.datastores.get(working_datastore_name), \r\n",
" \"invoices/{run-id}/{output-name}\")\r\n",
").as_upload()\r\n",
"\r\n",
"script_config = ScriptRunConfig(\r\n",
" source_directory=\"../scripts\",\r\n",
" script=\"training.py\",\r\n",
" arguments = [\r\n",
" '--training-folder', dataprep_input,\r\n",
" '--max-subsample-size', max(subsample_list),\r\n",
" '--model-output', model_output_dir,\r\n",
" '--id-feat', index_feature,\r\n",
" '--train-expected-m', train_expected_m\r\n",
" ],\r\n",
" run_config = pipeline_run_config\r\n",
")\r\n",
"\r\n",
"params = GridParameterSampling(\r\n",
" {\r\n",
" '--trees': choice(trees_list),\r\n",
" '--subsample-size' : choice(subsample_list)\r\n",
" }\r\n",
")\r\n",
"\r\n",
"hyperdrive_config = HyperDriveConfig(\r\n",
" run_config = script_config, \r\n",
" hyperparameter_sampling = params, \r\n",
" policy = None, \r\n",
" primary_metric_name = 'Dummy', \r\n",
" primary_metric_goal = PrimaryMetricGoal.MAXIMIZE, \r\n",
" max_total_runs = len(trees_list)*len(subsample_list), \r\n",
" max_concurrent_runs = node_count\r\n",
") \r\n",
"\r\n",
"train_step = HyperDriveStep(\r\n",
" name = \"iJungle Trainining Step\", \r\n",
" hyperdrive_config = hyperdrive_config, \r\n",
" inputs=[dataprep_input],\r\n",
" outputs=[model_output_dir],\r\n",
" allow_reuse=False\r\n",
")\r\n",
"\r\n",
"pipeline_steps.append(train_step)"
],
"outputs": [],
"execution_count": 6,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655239360652
}
}
},
{
"cell_type": "code",
"source": [
"# Next step, overhead dataset calculation\r\n",
"\r\n",
"overhead_ds_output = OutputFileDatasetConfig(\r\n",
" name=\"overhead_ds_output\", \r\n",
" destination=(\r\n",
" ws.datastores.get(working_datastore_name), \r\n",
" \"invoices/{run-id}/{output-name}\")\r\n",
").as_upload()\r\n",
"\r\n",
"overhead_ds_step = PythonScriptStep(\r\n",
" name = \"Overhead Dataset Step\",\r\n",
" source_directory = \"../scripts\",\r\n",
" script_name = \"overhead_ds.py\",\r\n",
" arguments = [\r\n",
" '--input-data', dataprep_input,\r\n",
" '--overhead-data', overhead_ds_output,\r\n",
" '--overhead-expected-m', overhead_expected_m\r\n",
" ],\r\n",
" inputs=[dataprep_input],\r\n",
" outputs=[overhead_ds_output],\r\n",
" compute_target = pipeline_cluster,\r\n",
" runconfig = pipeline_run_config,\r\n",
" allow_reuse = False\r\n",
")\r\n",
"pipeline_steps.append(overhead_ds_step)\r\n"
],
"outputs": [],
"execution_count": 7,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655239360833
}
}
},
{
"cell_type": "code",
"source": [
"# Next step, run the overhead script\r\n",
"\r\n",
"model_input_dir = model_output_dir.as_input()\r\n",
"overhead_ds_input = overhead_ds_output.as_input()\r\n",
"\r\n",
"overhead_output = OutputFileDatasetConfig(\r\n",
" name=\"overhead_output\", \r\n",
" destination=(\r\n",
" ws.datastores.get(working_datastore_name), \r\n",
" \"invoices/{run-id}/{output-name}\")\r\n",
").as_upload()\r\n",
"\r\n",
"script_config = ScriptRunConfig(\r\n",
" source_directory=\"../scripts\",\r\n",
" script=\"overhead.py\",\r\n",
" arguments = [\r\n",
" '--overhead-folder', overhead_ds_input,\r\n",
" '--model-input', model_input_dir,\r\n",
" '--overhead-output', overhead_output,\r\n",
" '--id-feat', index_feature\r\n",
" ],\r\n",
" run_config = pipeline_run_config\r\n",
")\r\n",
"\r\n",
"params = GridParameterSampling(\r\n",
" {\r\n",
" '--trees': choice(trees_list),\r\n",
" '--subsample-size' : choice(subsample_list)\r\n",
" }\r\n",
")\r\n",
"\r\n",
"hyperdrive_config = HyperDriveConfig(\r\n",
" run_config = script_config, \r\n",
" hyperparameter_sampling = params, \r\n",
" policy = None, \r\n",
" primary_metric_name = 'Dummy', \r\n",
" primary_metric_goal = PrimaryMetricGoal.MAXIMIZE, \r\n",
" max_total_runs = len(trees_list)*len(subsample_list), \r\n",
" max_concurrent_runs = node_count\r\n",
") \r\n",
"\r\n",
"overhead_step = HyperDriveStep(\r\n",
" name = \"iJungle Overhead Step\", \r\n",
" hyperdrive_config = hyperdrive_config, \r\n",
" inputs=[overhead_ds_input, model_input_dir],\r\n",
" outputs=[overhead_output],\r\n",
" allow_reuse=False\r\n",
")\r\n",
"\r\n",
"pipeline_steps.append(overhead_step)"
],
"outputs": [],
"execution_count": 8,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655239360929
}
}
},
{
"cell_type": "code",
"source": [
"# Next steps, find the representative iForest\r\n",
"\r\n",
"overhead_input = overhead_output.as_input()\r\n",
"\r\n",
"best_iforest_step = PythonScriptStep(\r\n",
" name = \"Best iForest Step\",\r\n",
" source_directory = \"../scripts\",\r\n",
" script_name = \"best_iforest.py\",\r\n",
" arguments = [\r\n",
" '--overhead-input', overhead_input,\r\n",
" '--subsample-list', str(subsample_list),\r\n",
" '--trees-list', str(trees_list)\r\n",
" ],\r\n",
" inputs=[overhead_input],\r\n",
" compute_target = pipeline_cluster,\r\n",
" runconfig = pipeline_run_config,\r\n",
" allow_reuse = False\r\n",
")\r\n",
"pipeline_steps.append(best_iforest_step)"
],
"outputs": [],
"execution_count": 9,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655239361026
}
}
},
{
"cell_type": "code",
"source": [
"# Construct the pipeline\r\n",
"pipeline = Pipeline(workspace=ws, steps=pipeline_steps)\r\n",
"print(\"Pipeline is built.\")\r\n",
"\r\n",
"# Create an experiment and run the pipeline\r\n",
"experiment = Experiment(workspace=ws, name = pipeline_name)\r\n",
"pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)\r\n",
"print(\"Pipeline submitted for execution.\")\r\n"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Pipeline is built.\nCreated step Feature engineering Step [1dab3f3a][da82670e-3f7e-4be0-88f6-507debe692f6], (This step will run and generate new outputs)\nCreated step iJungle Trainining Step [6b78089a][3c5b501f-987f-4dea-adae-e9613855b04c], (This step will run and generate new outputs)Created step Overhead Dataset Step [03970c0c][d8d83657-f3f1-4806-97c6-f40170b8f0b0], (This step will run and generate new outputs)\n\nCreated step iJungle Overhead Step [5e5520f5][074c153a-7a82-4e83-96da-6e62b1d5cedb], (This step will run and generate new outputs)\nCreated step Best iForest Step [26f37c61][5c436f27-c6b8-42b9-a0bd-5b91a3ea06eb], (This step will run and generate new outputs)\nSubmitted PipelineRun d571febb-ba82-4190-874a-4823dd9e978d\nLink to Azure Machine Learning Portal: https://ml.azure.com/runs/d571febb-ba82-4190-874a-4823dd9e978d?wsid=/subscriptions/d412dac0-d902-4cfb-b2f9-19dea115f7ff/resourcegroups/rg-dv-aidnaanomaly-corp-eus2/workspaces/wsmldvanomaly&tid=973ba820-4a58-4246-84bf-170e50b3152a\nPipeline submitted for execution.\n"
}
],
"execution_count": 10,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655239374520
}
}
},
{
"cell_type": "code",
"source": [],
"outputs": [],
"execution_count": null,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
}
}
],
"metadata": {
"kernel_info": {
"name": "python38-azureml"
},
"kernelspec": {
"name": "python38-azureml",
"language": "python",
"display_name": "Python 3.8 - AzureML"
},
"language_info": {
"name": "python",
"version": "3.8.5",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"nteract": {
"version": "nteract-front-end@1.0.0"
},
"microsoft": {
"host": {
"AzureML": {
"notebookHasBeenCompleted": true
}
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}

Просмотреть файл

@ -0,0 +1,402 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# DO NOT START THIS NOTEBOOK UNTIL THE PIPELINE CREATED IN THE PREVIOUS STEP, THE IJUNGLE TRAINING PIPELINE, IS IN \"COMPLETED\" STATUS.\r\n",
"\r\n",
"# iJungle Inference pipeline"
],
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
}
},
{
"cell_type": "code",
"source": [
"from azureml.core import Workspace, Environment, Experiment, ScriptRunConfig\r\n",
"from azureml.core.compute import ComputeTarget\r\n",
"from azureml.core.conda_dependencies import CondaDependencies\r\n",
"from azureml.core.runconfig import RunConfiguration\r\n",
"from azureml.pipeline.core import Pipeline\r\n",
"from azureml.pipeline.steps import PythonScriptStep, ParallelRunConfig, ParallelRunStep\r\n",
"from azureml.data import OutputFileDatasetConfig"
],
"outputs": [],
"execution_count": 1,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655246932882
}
}
},
{
"cell_type": "code",
"source": [
"cluster_name = \"cluster4\"\r\n",
"environment_name = \"ijungle-inference-env\"\r\n",
"input_dataset_name=\"ijungle-test-dataset\"\r\n",
"working_datastore_name=\"workspaceblobstore\"\r\n",
"output_datastore_name=\"workspaceblobstore\"\r\n",
"output_path=\"iJungle/results/\"\r\n",
"pipeline_name=\"ijungle-inference-pipeline\"\r\n",
"\r\n",
"index_feature = 'index'\r\n",
"anomaly_score = -.8"
],
"outputs": [],
"execution_count": 2,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655246933174
}
}
},
{
"cell_type": "code",
"source": [
"ws = Workspace.from_config()\r\n",
"pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)\r\n",
"print('Cluster configured to execute the pipeline:',cluster_name)"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Cluster configured to execute the pipeline: cluster4\n"
}
],
"execution_count": 3,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655246934825
}
}
},
{
"cell_type": "code",
"source": [
"new_env = Environment(environment_name)\r\n",
"packages = CondaDependencies.create(\r\n",
" conda_packages=['pip'],\r\n",
" pip_packages=['azureml-defaults','azureml-interpret','scikit-learn','pandas','pyarrow'])\r\n",
"\r\n",
"# Add the dependencies to the environment\r\n",
"new_env.python.conda_dependencies = packages\r\n",
"\r\n",
"# Register the environment \r\n",
"new_env.register(workspace=ws)\r\n",
"registered_env = Environment.get(ws, environment_name)\r\n",
"\r\n",
"# Create a new runconfig object for the pipeline\r\n",
"pipeline_run_config = RunConfiguration()\r\n",
"\r\n",
"# Use the compute you created above. \r\n",
"pipeline_run_config.target = pipeline_cluster\r\n",
"\r\n",
"# Assign the environment to the run configuration\r\n",
"pipeline_run_config.environment = registered_env\r\n",
"\r\n",
"print (\"Run configuration created.\")\r\n"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Run configuration created.\n"
}
],
"execution_count": 4,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655246935455
}
}
},
{
"cell_type": "code",
"source": [
"# Get the inference dataset\r\n",
"inference_ds = ws.datasets.get(input_dataset_name)\r\n",
"\r\n",
"# Intermediate data\r\n",
"dataprep_output = OutputFileDatasetConfig(\r\n",
" name=\"processed_data\", \r\n",
" destination=(\r\n",
" ws.datastores.get(working_datastore_name), \r\n",
" \"invoices/{run-id}/{output-name}\")\r\n",
").as_upload()\r\n",
"\r\n",
"# Step 1, Run the data prep script\r\n",
"prep_step = PythonScriptStep(\r\n",
" name = \"Inference data preparation Step\",\r\n",
" source_directory = \"../scripts\",\r\n",
" script_name = \"feat_eng.py\",\r\n",
" arguments = [\r\n",
" '--input-data', inference_ds.as_named_input('input'),\r\n",
" '--prepped-data', dataprep_output,\r\n",
" '--index-feature', index_feature,\r\n",
" '--training', 'False', \r\n",
" ],\r\n",
" outputs=[dataprep_output],\r\n",
" compute_target = pipeline_cluster,\r\n",
" runconfig = pipeline_run_config,\r\n",
" allow_reuse = False\r\n",
")\r\n",
"\r\n",
"# Initial definition of the pipeline steps\r\n",
"pipeline_steps = [prep_step]\r\n"
],
"outputs": [],
"execution_count": 5,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655246936585
}
}
},
{
"cell_type": "code",
"source": [
"# Next Step, run the inferencing script\r\n",
"\r\n",
"node_count = int(pipeline_cluster.serialize()['properties']['properties']['scaleSettings']['maxNodeCount'])\r\n",
"\r\n",
"dataprep_input = dataprep_output.read_parquet_files().as_input(\"inference_data\")\r\n",
"\r\n",
"inference_output_dir = OutputFileDatasetConfig(\r\n",
" name=\"inference_output\", \r\n",
" destination=(\r\n",
" ws.datastores.get(working_datastore_name), \r\n",
" \"invoices/{run-id}/{output-name}\")\r\n",
").as_upload()\r\n",
"\r\n",
"inference_step = PythonScriptStep(\r\n",
" name = \"Inference Step\",\r\n",
" source_directory = \"../scripts\",\r\n",
" script_name = \"inference.py\",\r\n",
" arguments = [\r\n",
" '--input', dataprep_input,\r\n",
" '--output', inference_output_dir,\r\n",
" '--feat-id', index_feature\r\n",
" ],\r\n",
" inputs=[dataprep_input],\r\n",
" outputs=[inference_output_dir],\r\n",
" compute_target = pipeline_cluster,\r\n",
" runconfig = pipeline_run_config,\r\n",
" allow_reuse = False\r\n",
")\r\n",
"\r\n",
"pipeline_steps.append(inference_step)\r\n"
],
"outputs": [],
"execution_count": 6,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655246938497
}
}
},
{
"cell_type": "code",
"source": [
"# Next step, explainability\r\n",
"\r\n",
"interpret_input = inference_output_dir.read_parquet_files().as_input(\"interpret_input\")\r\n",
"\r\n",
"interpret_output_dir = OutputFileDatasetConfig(\r\n",
" name=\"interpret_output\", \r\n",
" destination=(\r\n",
" ws.datastores.get(output_datastore_name), \r\n",
" output_path)\r\n",
").as_upload()\r\n",
"\r\n",
"\r\n",
"interpret_step = PythonScriptStep(\r\n",
" name = \"Explainability Step\",\r\n",
" source_directory = \"../scripts\",\r\n",
" script_name = \"interpret.py\",\r\n",
" arguments = [\r\n",
" '--input', interpret_input,\r\n",
" '--dataprep', dataprep_input,\r\n",
" '--output', interpret_output_dir,\r\n",
" '--index-id', index_feature,\r\n",
" '--anomaly-score', anomaly_score\r\n",
" ],\r\n",
" inputs=[ interpret_input, dataprep_input],\r\n",
" outputs=[interpret_output_dir],\r\n",
" compute_target = pipeline_cluster,\r\n",
" runconfig = pipeline_run_config,\r\n",
" allow_reuse = False\r\n",
")\r\n",
"pipeline_steps.append(interpret_step)"
],
"outputs": [
{
"output_type": "error",
"ename": "NameError",
"evalue": "name 'dataprep_output_outliers' is not defined",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-7-d0664c56f92d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0minterpret_input\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minference_output_dir\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_parquet_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"interpret_input\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mdataprep_input_outliers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdataprep_output_outliers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_parquet_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"dataprep_input_outliers\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m interpret_output_dir = OutputFileDatasetConfig(\n",
"\u001b[0;31mNameError\u001b[0m: name 'dataprep_output_outliers' is not defined"
]
}
],
"execution_count": 7,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655246938617
}
}
},
{
"cell_type": "code",
"source": [
"# Construct the pipeline\r\n",
"pipeline = Pipeline(workspace=ws, steps=pipeline_steps)\r\n",
"print(\"Pipeline is built.\")\r\n",
"\r\n",
"# Create an experiment and run the pipeline\r\n",
"experiment = Experiment(workspace=ws, name = pipeline_name)\r\n",
"pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)\r\n",
"print(\"Pipeline submitted for execution.\")\r\n"
],
"outputs": [],
"execution_count": null,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1655246938658
}
}
},
{
"cell_type": "code",
"source": [],
"outputs": [],
"execution_count": null,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
}
}
],
"metadata": {
"kernelspec": {
"name": "python38-azureml",
"language": "python",
"display_name": "Python 3.8 - AzureML"
},
"language_info": {
"name": "python",
"version": "3.8.5",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"kernel_info": {
"name": "python38-azureml"
},
"microsoft": {
"host": {
"AzureML": {
"notebookHasBeenCompleted": true
}
}
},
"nteract": {
"version": "nteract-front-end@1.0.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -1,4 +0,0 @@
jupyter
pandas
scikit-learn
matplotlib

Просмотреть файл

@ -1,27 +0,0 @@
import os
import numpy as np
from azureml.core import Model
import joblib
#import argparse
#parser = argparse.ArgumentParser()
#parser.add_argument("--id-feature", type=str, dest='id_feature', help='ID Freature')
#args = parser.parse_args()
#id_feat = str(args.id_feature)
#print('id feature', id_feat)
def init():
    """Load the registered ``best_iforest.pkl`` model into the module-level ``model``.

    Runs when the pipeline step is initialized; ``run`` reads the global
    ``model`` that is assigned here.
    """
    global model
    # Resolve the registered model's path and deserialize it in a single expression.
    model = joblib.load(Model.get_model_path('best_iforest.pkl'))
def run(mini_batch):
    """Score one mini-batch with the module-level ``model``.

    ``mini_batch`` is re-indexed in place on the ``Van_Stock_Proposal_Detail_Id``
    column (so the caller's object is modified), then passed to the model's
    ``predict`` and ``score_samples`` methods.

    Returns a list of ``(index value, prediction, score)`` tuples, one per row.
    """
    # In-place re-index: the id column leaves the feature columns before scoring.
    mini_batch.set_index('Van_Stock_Proposal_Detail_Id', inplace=True)
    row_ids = [*mini_batch.index]

    # Same call order as before: predictions first, then anomaly scores.
    predictions = model.predict(mini_batch).tolist()
    scores = model.score_samples(mini_batch).tolist()

    return [(row_id, label, value) for row_id, label, value in zip(row_ids, predictions, scores)]

Просмотреть файл

@ -1,56 +0,0 @@
# -*- coding: utf-8 -*-
from azureml.core import Run
import argparse
import numpy as np
import iJungle
# Handle to the current Azure ML run; used below to log parameters and metrics.
run = Run.get_context()

print("iJungle version:", iJungle.__version__)
run.log('iJungle_version', iJungle.__version__)

parser = argparse.ArgumentParser()

# Input Data
# NOTE: --input-data is parsed but not read below; the data itself comes from
# the run's named input dataset 'training_data'.
parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset')
parser.add_argument("--id-feature", type=str, dest='id_feature', help='ID Freature')
parser.add_argument("--max-subsample-size", type=int, dest='max_sss', help='Max subsample size')
parser.add_argument("--train-size", type=float, dest='train_size', help='Train size')

# Hyper parameters
parser.add_argument('--trees', type=int, dest='trees', default=100, help='Number of trees')
parser.add_argument('--subsample-size', type=int, dest='subsample_size', default=8192, help='Subsample size')

# Add arguments to args collection
args = parser.parse_args()

id_feat = str(args.id_feature)
print('id feature', id_feat)

# Log Hyperparameter values
# Built-in int()/float() replace the np.int/np.float aliases, which were
# deprecated in NumPy 1.20 and removed in NumPy 1.24 (AttributeError there).
trees = int(args.trees)
subsample_size = int(args.subsample_size)
print('trees', trees)
print('subsample_size', subsample_size)
run.log('trees', trees)
run.log('subsample_size', subsample_size)

# Other parameters
max_sss = int(args.max_sss)
train_size = float(args.train_size)
print("Max subsample size", max_sss)
print("Train size", train_size)
run.log('max_sss', max_sss)
run.log('train_size', train_size)

# Load training data
print("Loading Data...")
df = run.input_datasets['training_data'].to_pandas_dataframe()  # Get the training data from the estimator input
df.set_index(id_feat, inplace=True)

print("Starting training ...")
model_filename = iJungle.model_train_fun(df, trees, subsample_size, train_size, max_sss)
print(model_filename)

# Log dummy metric (constant 0.0 under the name 'Dummy')
run.log('Dummy', float(0))

run.complete()

56
scripts/best_iforest.py Normal file
Просмотреть файл

@ -0,0 +1,56 @@
# Selects the best iForest among the models trained for every
# (subsample_size, trees) combination and registers it as 'best_iforest.pkl'.
from azureml.core import Run, Model
import argparse
import ast
import pandas as pd
import os
import iJungle
import joblib

run = Run.get_context()

parser = argparse.ArgumentParser()

# Input Data
parser.add_argument('--overhead-input', type=str, dest='overhead_input', help='Overhead input')
parser.add_argument('--subsample-list', type=str, dest='subsample_list')
parser.add_argument('--trees-list', type=str, dest='trees_list')

# Add arguments to args collection
args = parser.parse_args()

overhead_input = args.overhead_input
print("Overhead input", overhead_input)

# The lists arrive as their Python literal text (e.g. "[4096, 2048]").
# ast.literal_eval parses literals only, so unlike eval() it cannot execute
# arbitrary code passed on the command line.
subsample_list = ast.literal_eval(args.subsample_list)
print("subsample_list", subsample_list)
trees_list = ast.literal_eval(args.trees_list)
print("trees_list", trees_list)

# Load the per-combination results registered by the overhead step
print("Loading Models...")
results_dic = {}
for subsample_size in subsample_list:
    results_dic[str(subsample_size)] = {}
    for trees in trees_list:
        model_name = 'iJungle_light_' + str(trees) + '_' + str(subsample_size) + '_results'
        print(model_name)
        model_path = Model.get_model_path(model_name)
        print(model_path)
        results_dic[str(subsample_size)][str(trees)] = joblib.load(model_path)
results = pd.DataFrame(results_dic)

# Calculating best iForest
print("Calculating best iForest ...")
best_subsample_size, best_trees, best_iF_k = iJungle.best_iforest_params(results)

# Build the name from the *best* parameters. The original used the loop
# variables `trees`/`subsample_size`, which always hold the last grid entry.
model_name = 'iJungle_light_' + str(best_trees) + '_' + str(best_subsample_size)
print("Best iForest model name:", model_name)
model_path = Model.get_model_path(model_name)
print("Loading best iFor_list from ", model_path)
iFor_list = joblib.load(model_path)
model = iFor_list[best_iF_k]
print("Model selected!")

print("Registering model...")
best_model_name = 'best_iforest.pkl'
# Make sure the local output directory exists before dumping into it.
os.makedirs(iJungle._MODEL_DIR, exist_ok=True)
best_model_path = os.path.join(iJungle._MODEL_DIR, best_model_name)
joblib.dump(model, best_model_path)
Model.register(workspace=run.experiment.workspace, model_path=best_model_path, model_name=best_model_name)

run.complete()

65
scripts/feat_eng.py Normal file
Просмотреть файл

@ -0,0 +1,65 @@
# Feature engineering step: standard-scales every feature column.
# In training mode a scaler is fitted and registered per feature; otherwise
# the previously registered scaler is loaded and applied.
import os
import argparse
import pandas as pd
from azureml.core import Run, Model
from sklearn import preprocessing
import joblib
import numpy as np

LOCAL_MODEL_PATH = 'outputs'
# Single prefix shared by the register and load paths. The original
# registered 'ijungle_scaler_model_*' but tried to load
# 'invoices_scaler_model_*', so inference could never find the scalers.
SCALER_MODEL_PREFIX = 'ijungle_scaler_model_'

parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='train_dataset_id')
parser.add_argument('--prepped-data', type=str, dest='prepped_data')
parser.add_argument('--index-feature', type=str, dest='index_feature')
parser.add_argument('--training', type=str, dest='training')

print('Loading parameters ...')
args = parser.parse_args()
save_folder = args.prepped_data
index_feature = args.index_feature
# bool("False") is True, so the flag text is parsed explicitly. A missing or
# empty flag still means "not training", as before.
training = str(args.training or '').strip().lower() in ('true', '1', 'yes', 'y')
print('save_folder', save_folder)
print('index_feature', index_feature)
print('training', training)

run = Run.get_context()

print("Loading data ...")
df = run.input_datasets['input'].to_pandas_dataframe()
print(df)
print("Shape of data:",df.shape)

print("Setting index ...")
df.set_index(index_feature, inplace=True)

if training:
    # joblib.dump does not create missing directories.
    os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)

for feat in df.columns:
    model_name = SCALER_MODEL_PREFIX + feat
    if training:
        print("Training and registering scaler for feature:", feat)
        scaler = preprocessing.StandardScaler()
        scaler.fit(df[[feat]])
        file_name = os.path.join(LOCAL_MODEL_PATH, model_name + '.pkl')
        joblib.dump(value=scaler, filename=file_name)
        Model.register(
            workspace=run.experiment.workspace,
            model_path = file_name,
            model_name = model_name
        )
    else:
        print("Applying scaler for feature:", feat)
        model_path = Model.get_model_path(model_name)
        scaler = joblib.load(model_path)
    # Scale in both modes so training and inference data share one scale.
    df[feat] = scaler.transform(df[[feat]]).reshape(df.shape[0])

print("Reseting index ...")
df.reset_index(inplace=True)

print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'prepped.parquet')
df.to_parquet(save_path, index=False)

run.complete()

51
scripts/inference.py Normal file
Просмотреть файл

@ -0,0 +1,51 @@
# Inference step: scores the input dataset with the registered
# 'best_iforest.pkl' model and writes predictions and scores to parquet.
import os
import argparse
import pandas as pd
from azureml.core import Run, Model
from sklearn import preprocessing
import joblib
import numpy as np

parser = argparse.ArgumentParser()
parser.add_argument("--input", type=str, dest='input_')
parser.add_argument("--output", type=str, dest='output_')
parser.add_argument("--feat-id", type=str, dest='feat_id')

run = Run.get_context()

print("Reading parameters ...")
args = parser.parse_args()
output_dir = args.output_
feat_id = args.feat_id
print("Output dir:", output_dir)
print("feat_id:", feat_id)

print("Loading data ...")
df = run.input_datasets['inference_data'].to_pandas_dataframe()
df.set_index(feat_id, inplace=True)
print(df)

print("Loading model ...")
model_name = 'best_iforest.pkl'
model_path = Model.get_model_path(model_name)
model = joblib.load(model_path)
print("model", model)

# Per-column null check, printed for diagnostics only.
print(df.isnull().any())

print("Making predictions ...")
y_pred = model.predict(df)
scores = model.score_samples(df)
# Rows predicted as -1 are the ones counted as anomalies here.
print("Number of anomalies: ", len(y_pred[y_pred==-1]))

print("Generating outputs ...")
df_out = pd.DataFrame()
df_out[feat_id] = df.index.values
df_out['pred'] = y_pred
df_out['score'] = scores

# Create the output folder first, as feat_eng.py does, so writing does not
# fail on a missing directory.
os.makedirs(output_dir, exist_ok=True)
save_path = os.path.join(output_dir,'results.parquet')
df_out.to_parquet(save_path, index=False)

run.complete()

108
scripts/interpret.py Normal file
Просмотреть файл

@ -0,0 +1,108 @@
# Interpretation step: for every record whose score is at or below the
# anomaly threshold, attaches the three ranked local-explanation features and
# writes the result to a timestamped CSV.
import os
import argparse
import pandas as pd
from azureml.core import Run, Model
from sklearn import preprocessing
import joblib
import numpy as np
from interpret.ext.blackbox import TabularExplainer
import time

parser = argparse.ArgumentParser()
parser.add_argument("--input", type=str, dest='input', help='inference dataset')
parser.add_argument("--dataprep", type=str, dest='dataprep', help='dataset')
parser.add_argument('--output', type=str, dest='output_dir', help='Folder for results')
parser.add_argument('--index-id', type=str, dest='index_id')
parser.add_argument('--anomaly-score', type=str, dest='anomaly_score')

run = Run.get_context()

print("Loading arguments ...")
args = parser.parse_args()
output_dir = args.output_dir
index_id = args.index_id
anomaly_score = float(args.anomaly_score)
print("outpur_dir", output_dir)
print("index_id", index_id)
print("anomaly_score", anomaly_score)

print("Loading results ...")
df = run.input_datasets["interpret_input"].to_pandas_dataframe()
print(df)

print("Selecting anomalies ...")
df = df[df['score']<=anomaly_score]
print(df)

ls_anomalies = []

if df.shape[0] > 0:
    df_100 = df.sort_values("score", ascending=True).copy()
    print(df_100)

    print("Loading inference data ...")
    df_inf = run.input_datasets["inference_data"].to_pandas_dataframe()
    print(df_inf)

    print("Creating input to interpret ...")
    result_ids = df_100[index_id].values
    df_inf = df_inf.loc[df_inf[index_id].isin(result_ids),:].copy()
    X_explain = df_inf.set_index(index_id)
    print(X_explain)

    print("Loading model ...")
    # Same name that best_iforest.py registers and inference.py loads. The
    # original 'Invoices_best_iforest.pkl' is not registered by this pipeline.
    model_name = 'best_iforest.pkl'
    model_path = Model.get_model_path(model_name)
    model = joblib.load(model_path)
    print("model", model)

    print("Creating explanations ...")
    tab_explainer = TabularExplainer(model, X_explain)
    print(tab_explainer)

    # Get local explanations
    local_tab_explanation = tab_explainer.explain_local(X_explain)

    # Get ranked feature names and importance values
    local_tab_features = local_tab_explanation.get_ranked_local_names()
    local_tab_importance = local_tab_explanation.get_ranked_local_values()

    ls_explanations = []
    for i in range(len(X_explain.index)):
        detail_id = X_explain.index[i]
        # Index by row i. The original always read entry [0], giving every
        # record the same explanation.
        # NOTE(review): assumes the ranked lists are indexed per row (as for
        # a model without predict_proba) -- confirm against the explainer.
        feat3, feat2, feat1 = tuple(local_tab_features[i][-3:])
        score3, score2, score1 = tuple(local_tab_importance[i][-3:])
        ls_explanations.append({
            index_id:detail_id,
            'Interpretation_Feature_1':feat1,
            'Interpretation_Score_1':score1,
            'Interpretation_Feature_2':feat2,
            'Interpretation_Score_2':score2,
            'Interpretation_Feature_3':feat3,
            'Interpretation_Score_3':score3,
        })
    df_explanations = pd.DataFrame(ls_explanations)
    print(df_explanations)

    df_results_exp = df_100.merge(df_explanations, on=index_id, how='left')
    print("Explanations:")
    print(df_results_exp)
    ls_anomalies.append(df_results_exp)

print("Consolidating all anomalies ...")
if ls_anomalies:
    df_anomalies = pd.concat(ls_anomalies)
    df_anomalies.sort_values(['score','Interpretation_Score_1'], ascending=True, inplace=True)
else:
    # pd.concat([]) raises ValueError; with no anomalies, emit the (empty)
    # filtered frame so the step still produces an output file.
    df_anomalies = df.copy()
print(df_anomalies)

# NOTE(review): the original merged extra columns from `df_sp` here, but
# `df_sp` is never defined in this script, so that line raised NameError.
# The merge is skipped until the source of those columns is identified
# (possibly the unused --dataprep argument -- confirm).

print("Saving anomalies ...")
os.makedirs(output_dir, exist_ok=True)
timestr = time.strftime("%Y%m%d%H%M%S")
df_anomalies.to_csv(os.path.join(output_dir,'anomalies_'+timestr+'.csv'), index=False)

run.complete()

Просмотреть файл

@ -1,29 +1,34 @@
# -*- coding: utf-8 -*-
# Overhead step: argument parsing for the overhead data folder, the trained
# model input folder, the output folder and the hyperparameters.
from azureml.core import Model, Run
from azureml.core import Run, Model
import argparse
import pandas as pd
import numpy as np
import iJungle
import joblib
import os
import iJungle
import shutil
run = Run.get_context()
print("iJungle version:", iJungle.__version__)
run.log('iJungle_version', iJungle.__version__)
parser = argparse.ArgumentParser()
# Input Data
parser.add_argument("--input-data", type=str, dest='input_data', help='Overhead dataset')
parser.add_argument("--id-feature", type=str, dest='id_feature', help='ID Freature')
parser.add_argument("--overhead-folder", type=str, dest='overhead_folder', help='overhead data folder')
parser.add_argument("--model-input", type=str, dest='model_input', help='model input folder')
parser.add_argument("--overhead-output", type=str, dest='overhead_output', help='overhead output folder')
parser.add_argument("--id-feat", type=str, dest='id_feat')
# Hyper parameters
parser.add_argument('--trees', type=int, dest='trees', default=100, help='Number of trees')
parser.add_argument('--subsample-size', type=int, dest='subsample_size', default=8192, help='Subsample size')
# Add arguments to args collection
args = parser.parse_args()
id_feat = str(args.id_feature)
print('id feature', id_feat)
overhead_folder = args.overhead_folder
print("Overhead folder", overhead_folder)
model_input = args.model_input
print("Model input", model_input)
overhead_output = args.overhead_output
print("Overhead output", overhead_output)
# Log Hyperparameter values
# Builtin int instead of np.int: the alias was removed in NumPy 1.24 and
# raises AttributeError there.
trees = int(args.trees)
@ -33,10 +38,18 @@ print('subsample_size', subsample_size)
# Record the hyperparameters of this run.
run.log('trees', trees)
run.log('subsample_size', subsample_size)
# Other parameters: name of the column that is used as the DataFrame index below.
id_feat = args.id_feat
print("id_feat", id_feat)
run.log('id_feat', id_feat)
# Load overhead data
print("Loading Data...")
W = run.input_datasets['overhead_data'].to_pandas_dataframe() # Read the overhead data from the run's named dataset input
# W is then reassigned from 'W.parquet' in the overhead folder, so the
# parquet file is what the following lines actually use.
load_path = os.path.join(overhead_folder,'W.parquet')
W = pd.read_parquet(load_path)
W.set_index(id_feat, inplace=True)
print("Overhead Data loaded. Shape:", W.shape)
# Load iFor_list pickle
print("Loading pickle...")
@ -54,7 +67,21 @@ results_filename = os.path.join(iJungle._MODEL_DIR, model_name + '_results.pkl')
# Persist the overhead results locally, register them as a model and copy
# them to the step's output folder.
print("Writing results:", results_filename)
joblib.dump(value=results, filename=results_filename)
model_name = 'iJungle_light_' + str(trees) + '_' + str(subsample_size) + '_results'
print(model_name)
print('Registering model...')
Model.register(
    workspace=run.experiment.workspace,
    model_path = results_filename,
    model_name = model_name,
    properties={
        'trees':trees,
        'subsample_size':subsample_size})
# Log dummy metric (plain float: np.float was removed in NumPy 1.24)
run.log('Dummy', 0.0)
# shutil.copy does not create missing directories.
os.makedirs(overhead_output, exist_ok=True)
shutil.copy(results_filename, os.path.join(overhead_output, model_name + '.pkl'))
run.complete()

43
scripts/overhead_ds.py Normal file
Просмотреть файл

@ -0,0 +1,43 @@
# Overhead dataset step: draws the overhead sample W from the prepped data
# and stores it as 'W.parquet' for the downstream overhead step.
from azureml.core import Run
import argparse
import pandas as pd
import os
import iJungle

run = Run.get_context()

parser = argparse.ArgumentParser()

# Input Data
parser.add_argument("--input-data", type=str, dest='prepped_data', help='Prepped data')
parser.add_argument('--overhead-data', type=str, dest='overhead_data', help='Overhead data')
parser.add_argument('--overhead-expected-m', type=str, dest='overhead_expected_m')

# Add arguments to args collection
args = parser.parse_args()

prepped_data = args.prepped_data
print("Prepped folder", prepped_data)
overhead_data = args.overhead_data
# The label used to read "Model input", a copy-paste slip that made logs misleading.
print("Overhead data folder", overhead_data)
overhead_expected_m = int(args.overhead_expected_m)
print("overhead_expected_m", overhead_expected_m)

# Load prepped data
print("Loading Data...")
load_path = os.path.join(prepped_data,'prepped.parquet')
df = pd.read_parquet(load_path)
print("Data loaded. Shape:", df.shape)

# Overhead sample size calculation: fraction of records, capped at 1
n_records = df.shape[0]
if n_records == 0:
    # Fail with a clear message instead of a ZeroDivisionError below.
    raise ValueError("Prepped dataset is empty: " + load_path)
overhead_size = min(1,overhead_expected_m/n_records)
print("Overhead size", overhead_size)
run.log('overhead_size', overhead_size)

W = iJungle.select_overhead_data(df, overhead_size=overhead_size)
print("Overhead shape", W.shape)

print("Saving Data...")
# to_parquet does not create missing directories.
os.makedirs(overhead_data, exist_ok=True)
save_path = os.path.join(overhead_data,'W.parquet')
W.to_parquet(save_path, index=False)

run.complete()

83
scripts/training.py Normal file
Просмотреть файл

@ -0,0 +1,83 @@
# Training step: trains the iJungle model list for one (trees,
# subsample_size) pair, registers it and copies it to the step's output folder.
from azureml.core import Run, Model
import argparse
import pandas as pd
import numpy as np
import joblib
import os
import iJungle
import shutil

run = Run.get_context()

parser = argparse.ArgumentParser()

# Input Data
parser.add_argument("--training-folder", type=str, dest='training_folder', help='training data folder')
parser.add_argument("--max-subsample-size", type=int, dest='max_sss', help='Max subsample size')
parser.add_argument("--model-output", type=str, dest='model_output', help='model output folder')
parser.add_argument("--id-feat", type=str, dest='id_feat')
parser.add_argument("--train-expected-m", type=str, dest='train_expected_m')

# Hyper parameters
parser.add_argument('--trees', type=int, dest='trees', default=100, help='Number of trees')
parser.add_argument('--subsample-size', type=int, dest='subsample_size', default=8192, help='Subsample size')

# Add arguments to args collection
args = parser.parse_args()

training_folder = args.training_folder
print("Training folder", training_folder)
model_output = args.model_output
print("Model output", model_output)
id_feat = args.id_feat
print("id_feat", id_feat)
train_expected_m = int(args.train_expected_m)
print("train_expected_m", train_expected_m)

# Log Hyperparameter values.
# Builtin int/float replace np.int/np.float, which were removed in
# NumPy 1.24 and raise AttributeError there.
trees = int(args.trees)
subsample_size = int(args.subsample_size)
print('trees', trees)
print('subsample_size', subsample_size)
run.log('trees', trees)
run.log('subsample_size', subsample_size)

# Other parameters
max_sss = int(args.max_sss)
print("Max subsample size", max_sss)
run.log('max_sss', max_sss)

# Load training data
print("Loading Data...")
load_path = os.path.join(training_folder,'prepped.parquet')
df = pd.read_parquet(load_path)
df.set_index(id_feat, inplace=True)

# Train sample size calculation: fraction of records, capped at 1
n_records = df.shape[0]
if n_records == 0:
    # Fail with a clear message instead of a ZeroDivisionError below.
    raise ValueError("Training dataset is empty: " + load_path)
train_size = min(1,train_expected_m/n_records)
print("Train size", train_size)
run.log('train_size', train_size)

print("Starting training ...")
model_filename = iJungle.model_train_fun(df, trees, subsample_size, train_size, max_sss)
print(model_filename)

model_name = 'iJungle_light_' + str(trees) + '_' + str(subsample_size)
print(model_name)
model_path = os.path.join(iJungle._MODEL_DIR, model_filename)

print('Registering model...')
Model.register(
    workspace=run.experiment.workspace,
    model_path = model_path,
    model_name = model_name,
    properties={
        'trees':trees,
        'subsample_size':subsample_size})

# Log dummy metric
run.log('Dummy', 0.0)

# shutil.copy does not create missing directories.
os.makedirs(model_output, exist_ok=True)
shutil.copy(model_path, os.path.join(model_output,model_name + '.pkl'))

run.complete()

Просмотреть файл

@ -1,2 +1,2 @@
# Package version string; the pipeline scripts print and log it at start-up.
# The second assignment overrides the first.
__version__ = '0.1.73'
__version__ = '0.2.0'
# Local directory the scripts join with model file names when dumping/loading with joblib.
_MODEL_DIR = 'outputs'