pipeline creation

2022-06-14 22:54:25 +00:00 · 2022-06-14 22:54:25 +00:00 · ced8867fb6
--- a/notebooks/01-iJungle-tutorial-training-pipeline.ipynb
+++ b/notebooks/01-iJungle-tutorial-training-pipeline.ipynb
@ -0,0 +1,651 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# iJungle Tutorial Training Pipeline Example\r\n",
+        "\r\n",
+        "*TODO: Summary of the iJungle technique* \r\n"
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import iJungle\n",
+        "from azureml.core import Workspace, Datastore, Dataset, Experiment, Environment, ScriptRunConfig\n",
+        "import pandas as pd\n",
+        "import os\n",
+        "from azureml.core.compute import  ComputeTarget, AmlCompute\n",
+        "from azureml.core.conda_dependencies import CondaDependencies\n",
+        "from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice\n",
+        "from azureml.core.runconfig import RunConfiguration\n",
+        "from azureml.pipeline.core import Pipeline\n",
+        "from azureml.pipeline.steps import PythonScriptStep, HyperDriveStep, HyperDriveStepRun\n",
+        "from azureml.data import OutputFileDatasetConfig\n",
+        "\n",
+        "print(\"iJungle version:\", iJungle.__version__)"
+      ],
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": "iJungle version: 0.1.73\n"
+        }
+      ],
+      "execution_count": 1,
+      "metadata": {
+        "gather": {
+          "logged": 1655239329501
+        }
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# 1. Parameters definition"
+      ],
+      "metadata": {
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "cluster_name = \"cluster4\"\r\n",
+        "environment_name = \"ijungle-training-env\"\r\n",
+        "working_datastore_name = \"workspaceblobstore\"\r\n",
+        "training_dataset_name = \"ijungle-trainining-dataset\"\r\n",
+        "test_dataset_name = \"ijungle-test-dataset\"\r\n",
+        "y_test_dataset_name = \"ijungle-y-test-dataset\"\r\n",
+        "index_feature = 'index'\r\n",
+        "pipeline_name = \"ijungle-training-pipeline\"\r\n",
+        "subsample_list = [4096, 2048, 1024, 512]\r\n",
+        "trees_list = [500, 100, 20, 10]\r\n",
+        "train_expected_m = 50000\r\n",
+        "overhead_expected_m = 50000\r\n",
+        "\r\n"
+      ],
+      "outputs": [],
+      "execution_count": 2,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655239329609
+        }
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# 2. Preparation of cluster, environment and run configuration"
+      ],
+      "metadata": {
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "\r\n",
+        "ws = Workspace.from_config()\r\n",
+        "\r\n",
+        "# Reuse the cluster if it already exists; otherwise create it\r\n",
+        "try:\r\n",
+        "    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)\r\n",
+        "    print('Found existing cluster, use it.')\r\n",
+        "except:\r\n",
+        "    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)\r\n",
+        "    pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)\r\n",
+        "\r\n",
+        "# Creation of environment\r\n",
+        "new_env = Environment(environment_name)\r\n",
+        "packages = CondaDependencies.create(\r\n",
+        "    conda_packages=['pip'],\r\n",
+        "    pip_packages=['azureml-defaults','scikit-learn','pandas','pyarrow'])\r\n",
+        "\r\n",
+        "# Add iJungle library\r\n",
+        "\r\n",
+        "whl_filename = \"../dist/iJungle-\"+iJungle.__version__+\"-py3-none-any.whl\"\r\n",
+        "\r\n",
+        "whl_url = Environment.add_private_pip_wheel(workspace=ws,file_path = whl_filename, exist_ok=True)\r\n",
+        "packages.add_pip_package(whl_url)\r\n",
+        "\r\n",
+        "\r\n",
+        "# Add the dependencies to the environment\r\n",
+        "new_env.python.conda_dependencies = packages\r\n",
+        "\r\n",
+        "# Register the environment \r\n",
+        "new_env.register(workspace=ws)\r\n",
+        "registered_env = Environment.get(ws, environment_name)\r\n",
+        "\r\n",
+        "# Create a new runconfig object for the pipeline\r\n",
+        "pipeline_run_config = RunConfiguration()\r\n",
+        "\r\n",
+        "# Use the compute you created above. \r\n",
+        "pipeline_run_config.target = pipeline_cluster\r\n",
+        "\r\n",
+        "# Assign the environment to the run configuration\r\n",
+        "pipeline_run_config.environment = registered_env\r\n",
+        "\r\n",
+        "print (\"Run configuration created.\")\r\n"
+      ],
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": "Found existing cluster, use it.\nRun configuration created.\n"
+        }
+      ],
+      "execution_count": 3,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655239330859
+        }
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "\n",
+        "# 3. Data preparation and dataset registration\n",
+        "\n",
+        "*TODO: description of the data*\n",
+        "\n",
+        "1. Use the following data in this repository *TODO: KDD url to download the files*\n",
+        "    - kddcup.names\n",
+        "    - kddcup.data.gz\n",
+        "    - corrected.gz"
+      ],
+      "metadata": {
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "## Move to data directory\n",
+        "os.chdir(os.path.dirname(os.path.abspath('__file__'))+'/../data')\n",
+        "\n",
+        "## Generate DataFrame with kdd data(csv format)\n",
+        "names = list(pd.read_csv('kddcup.names',sep=':', header=None)[0])\n",
+        "df = pd.read_csv('kddcup.data.gz', header=None, names=names)\n",
+        "df_test = pd.read_csv('corrected.gz', header=None, names=names)\n",
+        "\n",
+        "print(\"Shape of raw data:\", df.shape)\n",
+        "print(\"Shape of test data:\", df_test.shape)\n",
+        "\n",
+        "# Keep only the entries whose service is http\n",
+        "df = df[df.service == 'http']\n",
+        "df_test = df_test[df_test.service == 'http']\n",
+        "print(\"Shape of filtered train data:\", df.shape)\n",
+        "print(\"Shape of filtered test data:\", df_test.shape)\n",
+        "\n",
+        "# Preparation of labels\n",
+        "y_train = df.pop('label')\n",
+        "y_test = df_test.pop('label')\n",
+        "y_train = pd.Series([1 if val == 'normal.' else -1 for val in y_train], name=\"y\")\n",
+        "y_test = pd.Series([1 if val == 'normal.' else -1 for val in y_test], name=\"y\")\n",
+        "print(\"Shape of train labels:\", y_train.shape)\n",
+        "print(\"Shape of test labels:\", y_test.shape)\n",
+        "\n",
+        "# Final preparation of training and testing data\n",
+        "df.drop(['service'], axis=1, inplace=True)\n",
+        "df_test.drop(['service'], axis=1, inplace=True)\n",
+        "\n",
+        "cat_columns = ['protocol_type', 'flag']\n",
+        "\n",
+        "for col in cat_columns:\n",
+        "    df_test[col] = df_test[col].astype('category')\n",
+        "    df[col] = df[col].astype('category')\n",
+        "\n",
+        "cat_columns = df.select_dtypes(['category']).columns\n",
+        "df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)\n",
+        "\n",
+        "cat_columns = df_test.select_dtypes(['category']).columns\n",
+        "df_test[cat_columns] = df_test[cat_columns].apply(lambda x: x.cat.codes)\n",
+        "\n",
+        "df.reset_index(inplace=True)\n",
+        "df_test.reset_index(inplace=True)\n",
+        "df_y_test = y_test.reset_index()\n",
+        "\n",
+        "print(\"Shape of train data:\", df.shape)\n",
+        "print(\"Shape of test data:\", df_test.shape)\n",
+        "print(\"Shape of y-test data:\", df_y_test.shape)\n",
+        "\n",
+        "datastore = Datastore.get(ws, working_datastore_name)\n",
+        "\n",
+        "print(\"Registering training dataset ...\")\n",
+        "train_dataset = Dataset.Tabular.register_pandas_dataframe(df, datastore, training_dataset_name)\n",
+        "\n",
+        "print(\"Registering testing dataset ...\")\n",
+        "test_dataset = Dataset.Tabular.register_pandas_dataframe(df_test, datastore, test_dataset_name)\n",
+        "\n",
+        "print(\"Registering y-testing dataset ...\")\n",
+        "y_test_dataset = Dataset.Tabular.register_pandas_dataframe(df_y_test, datastore, y_test_dataset_name)\n"
+      ],
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": "Shape of raw data: (4898431, 42)\nShape of test data: (311029, 42)\nShape of filtered train data: (623091, 42)\nShape of filtered test data: (41237, 42)\nShape of train labels: (623091,)\nShape of test labels: (41237,)\nShape of train data: (623091, 41)\nShape of test data: (41237, 41)\nShape of y-test data: (41237, 2)\nRegistering training dataset ...\nValidating arguments.\nArguments validated.\nSuccessfully obtained datastore reference and path.\nUploading file to managed-dataset/bd6448b8-8c72-4b54-be45-cd4e6ab5e212/\nSuccessfully uploaded file to datastore.\nCreating and registering a new dataset.\nSuccessfully created and registered a new dataset.\nRegistering testing dataset ...\nValidating arguments.\nArguments validated.\nSuccessfully obtained datastore reference and path.\nUploading file to managed-dataset/8d4f60e5-0127-4d71-8720-b0290a26ebce/\nSuccessfully uploaded file to datastore.\nCreating and registering a new dataset.\nSuccessfully created and registered a new dataset.\nRegistering y-testing dataset ...\nValidating arguments.\nArguments validated.\nSuccessfully obtained datastore reference and path.\nUploading file to managed-dataset/7fd41a9d-8dd1-4ebd-9a69-26cccf32c490/\nSuccessfully uploaded file to datastore.\nCreating and registering a new dataset.\nSuccessfully created and registered a new dataset.\n"
+        }
+      ],
+      "execution_count": 4,
+      "metadata": {
+        "collapsed": true,
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655239359512
+        }
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# 4. Creation of training pipeline"
+      ],
+      "metadata": {
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Get the training dataset\r\n",
+        "train_ds = ws.datasets.get(training_dataset_name)\r\n",
+        "\r\n",
+        "# Intermediate data\r\n",
+        "dataprep_output = OutputFileDatasetConfig(\r\n",
+        "    name=\"processed_data\", \r\n",
+        "    destination=(\r\n",
+        "        ws.datastores.get(working_datastore_name), \r\n",
+        "        \"invoices/{run-id}/{output-name}\")\r\n",
+        ").as_upload()\r\n",
+        "\r\n",
+        "# Step 1, Run the data prep script\r\n",
+        "prep_step = PythonScriptStep(\r\n",
+        "    name = \"Feature engineering Step\",\r\n",
+        "    source_directory = \"../scripts\",\r\n",
+        "    script_name = \"feat_eng.py\",\r\n",
+        "    arguments = [\r\n",
+        "        '--input-data', train_ds.as_named_input('input'),\r\n",
+        "        '--prepped-data', dataprep_output,\r\n",
+        "        '--index-feature', index_feature,    \r\n",
+        "        '--training', 'True'    \r\n",
+        "    ],\r\n",
+        "    outputs=[dataprep_output],\r\n",
+        "    compute_target = pipeline_cluster,\r\n",
+        "    runconfig = pipeline_run_config,\r\n",
+        "    allow_reuse = False\r\n",
+        ")\r\n",
+        "\r\n",
+        "# Initial definition of the pipeline steps\r\n",
+        "pipeline_steps = [prep_step]\r\n"
+      ],
+      "outputs": [],
+      "execution_count": 5,
+      "metadata": {
+        "gather": {
+          "logged": 1655239360524
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Next Step, run the training script\r\n",
+        "\r\n",
+        "dataprep_input = dataprep_output.as_input()\r\n",
+        "node_count = int(pipeline_cluster.serialize()['properties']['properties']['scaleSettings']['maxNodeCount'])\r\n",
+        "\r\n",
+        "model_output_dir = OutputFileDatasetConfig(\r\n",
+        "    name=\"model_output\", \r\n",
+        "    destination=(\r\n",
+        "        ws.datastores.get(working_datastore_name), \r\n",
+        "        \"invoices/{run-id}/{output-name}\")\r\n",
+        ").as_upload()\r\n",
+        "\r\n",
+        "script_config = ScriptRunConfig(\r\n",
+        "    source_directory=\"../scripts\",\r\n",
+        "    script=\"training.py\",\r\n",
+        "    arguments = [\r\n",
+        "        '--training-folder', dataprep_input,\r\n",
+        "        '--max-subsample-size', max(subsample_list),\r\n",
+        "        '--model-output', model_output_dir,\r\n",
+        "        '--id-feat', index_feature,\r\n",
+        "        '--train-expected-m', train_expected_m\r\n",
+        "    ],\r\n",
+        "    run_config = pipeline_run_config\r\n",
+        ")\r\n",
+        "\r\n",
+        "params = GridParameterSampling(\r\n",
+        "    {\r\n",
+        "        '--trees': choice(trees_list),\r\n",
+        "        '--subsample-size' : choice(subsample_list)\r\n",
+        "    }\r\n",
+        ")\r\n",
+        "\r\n",
+        "hyperdrive_config = HyperDriveConfig(\r\n",
+        "    run_config = script_config, \r\n",
+        "    hyperparameter_sampling = params, \r\n",
+        "    policy = None, \r\n",
+        "    primary_metric_name = 'Dummy', \r\n",
+        "    primary_metric_goal = PrimaryMetricGoal.MAXIMIZE, \r\n",
+        "    max_total_runs = len(trees_list)*len(subsample_list), \r\n",
+        "    max_concurrent_runs = node_count\r\n",
+        ") \r\n",
+        "\r\n",
+        "train_step = HyperDriveStep(\r\n",
+        "    name = \"iJungle Trainining Step\", \r\n",
+        "    hyperdrive_config = hyperdrive_config, \r\n",
+        "    inputs=[dataprep_input],\r\n",
+        "    outputs=[model_output_dir],\r\n",
+        "    allow_reuse=False\r\n",
+        ")\r\n",
+        "\r\n",
+        "pipeline_steps.append(train_step)"
+      ],
+      "outputs": [],
+      "execution_count": 6,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655239360652
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Next step, overhead dataset calculation\r\n",
+        "\r\n",
+        "overhead_ds_output = OutputFileDatasetConfig(\r\n",
+        "    name=\"overhead_ds_output\", \r\n",
+        "    destination=(\r\n",
+        "        ws.datastores.get(working_datastore_name), \r\n",
+        "        \"invoices/{run-id}/{output-name}\")\r\n",
+        ").as_upload()\r\n",
+        "\r\n",
+        "overhead_ds_step = PythonScriptStep(\r\n",
+        "    name = \"Overhead Dataset Step\",\r\n",
+        "    source_directory = \"../scripts\",\r\n",
+        "    script_name = \"overhead_ds.py\",\r\n",
+        "    arguments = [\r\n",
+        "        '--input-data', dataprep_input,\r\n",
+        "        '--overhead-data', overhead_ds_output,\r\n",
+        "        '--overhead-expected-m', overhead_expected_m\r\n",
+        "    ],\r\n",
+        "    inputs=[dataprep_input],\r\n",
+        "    outputs=[overhead_ds_output],\r\n",
+        "    compute_target = pipeline_cluster,\r\n",
+        "    runconfig = pipeline_run_config,\r\n",
+        "    allow_reuse = False\r\n",
+        ")\r\n",
+        "pipeline_steps.append(overhead_ds_step)\r\n"
+      ],
+      "outputs": [],
+      "execution_count": 7,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655239360833
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Next step, run the overhead script\r\n",
+        "\r\n",
+        "model_input_dir = model_output_dir.as_input()\r\n",
+        "overhead_ds_input = overhead_ds_output.as_input()\r\n",
+        "\r\n",
+        "overhead_output = OutputFileDatasetConfig(\r\n",
+        "    name=\"overhead_output\", \r\n",
+        "    destination=(\r\n",
+        "        ws.datastores.get(working_datastore_name), \r\n",
+        "        \"invoices/{run-id}/{output-name}\")\r\n",
+        ").as_upload()\r\n",
+        "\r\n",
+        "script_config = ScriptRunConfig(\r\n",
+        "    source_directory=\"../scripts\",\r\n",
+        "    script=\"overhead.py\",\r\n",
+        "    arguments = [\r\n",
+        "        '--overhead-folder', overhead_ds_input,\r\n",
+        "        '--model-input', model_input_dir,\r\n",
+        "        '--overhead-output', overhead_output,\r\n",
+        "        '--id-feat', index_feature\r\n",
+        "        ],\r\n",
+        "    run_config = pipeline_run_config\r\n",
+        ")\r\n",
+        "\r\n",
+        "params = GridParameterSampling(\r\n",
+        "    {\r\n",
+        "        '--trees': choice(trees_list),\r\n",
+        "        '--subsample-size' : choice(subsample_list)\r\n",
+        "    }\r\n",
+        ")\r\n",
+        "\r\n",
+        "hyperdrive_config = HyperDriveConfig(\r\n",
+        "    run_config = script_config, \r\n",
+        "    hyperparameter_sampling = params, \r\n",
+        "    policy = None, \r\n",
+        "    primary_metric_name = 'Dummy', \r\n",
+        "    primary_metric_goal = PrimaryMetricGoal.MAXIMIZE, \r\n",
+        "    max_total_runs = len(trees_list)*len(subsample_list), \r\n",
+        "    max_concurrent_runs = node_count\r\n",
+        ") \r\n",
+        "\r\n",
+        "overhead_step = HyperDriveStep(\r\n",
+        "    name = \"iJungle Overhead Step\", \r\n",
+        "    hyperdrive_config = hyperdrive_config, \r\n",
+        "    inputs=[overhead_ds_input, model_input_dir],\r\n",
+        "    outputs=[overhead_output],\r\n",
+        "    allow_reuse=False\r\n",
+        ")\r\n",
+        "\r\n",
+        "pipeline_steps.append(overhead_step)"
+      ],
+      "outputs": [],
+      "execution_count": 8,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655239360929
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Next step, find the representative iForest\r\n",
+        "\r\n",
+        "overhead_input = overhead_output.as_input()\r\n",
+        "\r\n",
+        "best_iforest_step = PythonScriptStep(\r\n",
+        "    name = \"Best iForest Step\",\r\n",
+        "    source_directory = \"../scripts\",\r\n",
+        "    script_name = \"best_iforest.py\",\r\n",
+        "    arguments = [\r\n",
+        "        '--overhead-input', overhead_input,\r\n",
+        "        '--subsample-list', str(subsample_list),\r\n",
+        "        '--trees-list', str(trees_list)\r\n",
+        "    ],\r\n",
+        "    inputs=[overhead_input],\r\n",
+        "    compute_target = pipeline_cluster,\r\n",
+        "    runconfig = pipeline_run_config,\r\n",
+        "    allow_reuse = False\r\n",
+        ")\r\n",
+        "pipeline_steps.append(best_iforest_step)"
+      ],
+      "outputs": [],
+      "execution_count": 9,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655239361026
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Construct the pipeline\r\n",
+        "pipeline = Pipeline(workspace=ws, steps=pipeline_steps)\r\n",
+        "print(\"Pipeline is built.\")\r\n",
+        "\r\n",
+        "# Create an experiment and run the pipeline\r\n",
+        "experiment = Experiment(workspace=ws, name = pipeline_name)\r\n",
+        "pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)\r\n",
+        "print(\"Pipeline submitted for execution.\")\r\n"
+      ],
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": "Pipeline is built.\nCreated step Feature engineering Step [1dab3f3a][da82670e-3f7e-4be0-88f6-507debe692f6], (This step will run and generate new outputs)\nCreated step iJungle Trainining Step [6b78089a][3c5b501f-987f-4dea-adae-e9613855b04c], (This step will run and generate new outputs)Created step Overhead Dataset Step [03970c0c][d8d83657-f3f1-4806-97c6-f40170b8f0b0], (This step will run and generate new outputs)\n\nCreated step iJungle Overhead Step [5e5520f5][074c153a-7a82-4e83-96da-6e62b1d5cedb], (This step will run and generate new outputs)\nCreated step Best iForest Step [26f37c61][5c436f27-c6b8-42b9-a0bd-5b91a3ea06eb], (This step will run and generate new outputs)\nSubmitted PipelineRun d571febb-ba82-4190-874a-4823dd9e978d\nLink to Azure Machine Learning Portal: https://ml.azure.com/runs/d571febb-ba82-4190-874a-4823dd9e978d?wsid=/subscriptions/d412dac0-d902-4cfb-b2f9-19dea115f7ff/resourcegroups/rg-dv-aidnaanomaly-corp-eus2/workspaces/wsmldvanomaly&tid=973ba820-4a58-4246-84bf-170e50b3152a\nPipeline submitted for execution.\n"
+        }
+      ],
+      "execution_count": 10,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655239374520
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        }
+      }
+    }
+  ],
+  "metadata": {
+    "kernel_info": {
+      "name": "python38-azureml"
+    },
+    "kernelspec": {
+      "name": "python38-azureml",
+      "language": "python",
+      "display_name": "Python 3.8 - AzureML"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.8.5",
+      "mimetype": "text/x-python",
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "pygments_lexer": "ipython3",
+      "nbconvert_exporter": "python",
+      "file_extension": ".py"
+    },
+    "nteract": {
+      "version": "nteract-front-end@1.0.0"
+    },
+    "microsoft": {
+      "host": {
+        "AzureML": {
+          "notebookHasBeenCompleted": true
+        }
+      }
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 4
+}
--- a/notebooks/02-iJungle-tutorial-inference-pipeline.ipynb
+++ b/notebooks/02-iJungle-tutorial-inference-pipeline.ipynb
@ -0,0 +1,402 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# DO NOT START THIS NOTEBOOK UNTIL THE PIPELINE CREATED IN THE PREVIOUS STEP, THE IJUNGLE TRAINING PIPELINE, IS IN \"COMPLETE\" STATUS.\r\n",
+        "\r\n",
+        "# iJungle Inference pipeline"
+      ],
+      "metadata": {
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from azureml.core import Workspace, Environment, Experiment, ScriptRunConfig\r\n",
+        "from azureml.core.compute import ComputeTarget\r\n",
+        "from azureml.core.conda_dependencies import CondaDependencies\r\n",
+        "from azureml.core.runconfig import RunConfiguration\r\n",
+        "from azureml.pipeline.core import Pipeline\r\n",
+        "from azureml.pipeline.steps import PythonScriptStep, ParallelRunConfig, ParallelRunStep\r\n",
+        "from azureml.data import OutputFileDatasetConfig"
+      ],
+      "outputs": [],
+      "execution_count": 1,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655246932882
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "cluster_name = \"cluster4\"\r\n",
+        "environment_name = \"ijungle-inference-env\"\r\n",
+        "input_dataset_name=\"ijungle-test-dataset\"\r\n",
+        "working_datastore_name=\"workspaceblobstore\"\r\n",
+        "output_datastore_name=\"workspaceblobstore\"\r\n",
+        "output_path=\"iJungle/results/\"\r\n",
+        "pipeline_name=\"ijungle-inference-pipeline\"\r\n",
+        "\r\n",
+        "index_feature = 'index'\r\n",
+        "anomaly_score = -.8"
+      ],
+      "outputs": [],
+      "execution_count": 2,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655246933174
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "ws = Workspace.from_config()\r\n",
+        "pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)\r\n",
+        "print('Cluster configured to execute the pipeline:',cluster_name)"
+      ],
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": "Cluster configured to execute the pipeline: cluster4\n"
+        }
+      ],
+      "execution_count": 3,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655246934825
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "new_env = Environment(environment_name)\r\n",
+        "packages = CondaDependencies.create(\r\n",
+        "    conda_packages=['pip'],\r\n",
+        "    pip_packages=['azureml-defaults','azureml-interpret','scikit-learn','pandas','pyarrow'])\r\n",
+        "\r\n",
+        "# Add the dependencies to the environment\r\n",
+        "new_env.python.conda_dependencies = packages\r\n",
+        "\r\n",
+        "# Register the environment \r\n",
+        "new_env.register(workspace=ws)\r\n",
+        "registered_env = Environment.get(ws, environment_name)\r\n",
+        "\r\n",
+        "# Create a new runconfig object for the pipeline\r\n",
+        "pipeline_run_config = RunConfiguration()\r\n",
+        "\r\n",
+        "# Use the compute you created above. \r\n",
+        "pipeline_run_config.target = pipeline_cluster\r\n",
+        "\r\n",
+        "# Assign the environment to the run configuration\r\n",
+        "pipeline_run_config.environment = registered_env\r\n",
+        "\r\n",
+        "print (\"Run configuration created.\")\r\n"
+      ],
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": "Run configuration created.\n"
+        }
+      ],
+      "execution_count": 4,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655246935455
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Get the inference dataset\r\n",
+        "inference_ds = ws.datasets.get(input_dataset_name)\r\n",
+        "\r\n",
+        "# Intermediate data\r\n",
+        "dataprep_output = OutputFileDatasetConfig(\r\n",
+        "    name=\"processed_data\", \r\n",
+        "    destination=(\r\n",
+        "        ws.datastores.get(working_datastore_name), \r\n",
+        "        \"invoices/{run-id}/{output-name}\")\r\n",
+        ").as_upload()\r\n",
+        "\r\n",
+        "# Step 1, Run the data prep script\r\n",
+        "prep_step = PythonScriptStep(\r\n",
+        "    name = \"Inference data preparation Step\",\r\n",
+        "    source_directory = \"../scripts\",\r\n",
+        "    script_name = \"feat_eng.py\",\r\n",
+        "    arguments = [\r\n",
+        "        '--input-data', inference_ds.as_named_input('input'),\r\n",
+        "        '--prepped-data', dataprep_output,\r\n",
+        "        '--index-feature', index_feature,\r\n",
+        "        '--training', 'False',        \r\n",
+        "    ],\r\n",
+        "    outputs=[dataprep_output],\r\n",
+        "    compute_target = pipeline_cluster,\r\n",
+        "    runconfig = pipeline_run_config,\r\n",
+        "    allow_reuse = False\r\n",
+        ")\r\n",
+        "\r\n",
+        "# Initial definition of the pipeline steps\r\n",
+        "pipeline_steps = [prep_step]\r\n"
+      ],
+      "outputs": [],
+      "execution_count": 5,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655246936585
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Next Step, run the inferencing script\r\n",
+        "\r\n",
+        "node_count = int(pipeline_cluster.serialize()['properties']['properties']['scaleSettings']['maxNodeCount'])\r\n",
+        "\r\n",
+        "dataprep_input = dataprep_output.read_parquet_files().as_input(\"inference_data\")\r\n",
+        "\r\n",
+        "inference_output_dir = OutputFileDatasetConfig(\r\n",
+        "    name=\"inference_output\", \r\n",
+        "    destination=(\r\n",
+        "        ws.datastores.get(working_datastore_name), \r\n",
+        "        \"invoices/{run-id}/{output-name}\")\r\n",
+        ").as_upload()\r\n",
+        "\r\n",
+        "inference_step = PythonScriptStep(\r\n",
+        "    name = \"Inference Step\",\r\n",
+        "    source_directory = \"../scripts\",\r\n",
+        "    script_name = \"inference.py\",\r\n",
+        "    arguments = [\r\n",
+        "        '--input', dataprep_input,\r\n",
+        "        '--output', inference_output_dir,\r\n",
+        "        '--feat-id', index_feature\r\n",
+        "    ],\r\n",
+        "    inputs=[dataprep_input],\r\n",
+        "    outputs=[inference_output_dir],\r\n",
+        "    compute_target = pipeline_cluster,\r\n",
+        "    runconfig = pipeline_run_config,\r\n",
+        "    allow_reuse = False\r\n",
+        ")\r\n",
+        "\r\n",
+        "pipeline_steps.append(inference_step)\r\n"
+      ],
+      "outputs": [],
+      "execution_count": 6,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655246938497
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Next step, explainability\r\n",
+        "\r\n",
+        "interpret_input = inference_output_dir.read_parquet_files().as_input(\"interpret_input\")\r\n",
+        "\r\n",
+        "interpret_output_dir = OutputFileDatasetConfig(\r\n",
+        "    name=\"interpret_output\", \r\n",
+        "    destination=(\r\n",
+        "        ws.datastores.get(output_datastore_name), \r\n",
+        "        output_path)\r\n",
+        ").as_upload()\r\n",
+        "\r\n",
+        "\r\n",
+        "interpret_step = PythonScriptStep(\r\n",
+        "    name = \"Explainability Step\",\r\n",
+        "    source_directory = \"../scripts\",\r\n",
+        "    script_name = \"interpret.py\",\r\n",
+        "    arguments = [\r\n",
+        "        '--input', interpret_input,\r\n",
+        "        '--dataprep', dataprep_input,\r\n",
+        "        '--output', interpret_output_dir,\r\n",
+        "        '--index-id', index_feature,\r\n",
+        "        '--anomaly-score', anomaly_score\r\n",
+        "    ],\r\n",
+        "    inputs=[ interpret_input, dataprep_input],\r\n",
+        "    outputs=[interpret_output_dir],\r\n",
+        "    compute_target = pipeline_cluster,\r\n",
+        "    runconfig = pipeline_run_config,\r\n",
+        "    allow_reuse = False\r\n",
+        ")\r\n",
+        "pipeline_steps.append(interpret_step)"
+      ],
+      "outputs": [
+        {
+          "output_type": "error",
+          "ename": "NameError",
+          "evalue": "name 'dataprep_output_outliers' is not defined",
+          "traceback": [
+            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+            "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+            "\u001b[0;32m<ipython-input-7-d0664c56f92d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0minterpret_input\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minference_output_dir\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_parquet_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"interpret_input\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mdataprep_input_outliers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdataprep_output_outliers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_parquet_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"dataprep_input_outliers\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m interpret_output_dir = OutputFileDatasetConfig(\n",
+            "\u001b[0;31mNameError\u001b[0m: name 'dataprep_output_outliers' is not defined"
+          ]
+        }
+      ],
+      "execution_count": 7,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655246938617
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Construct the pipeline\r\n",
+        "pipeline = Pipeline(workspace=ws, steps=pipeline_steps)\r\n",
+        "print(\"Pipeline is built.\")\r\n",
+        "\r\n",
+        "# Create an experiment and run the pipeline\r\n",
+        "experiment = Experiment(workspace=ws, name = pipeline_name)\r\n",
+        "pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)\r\n",
+        "print(\"Pipeline submitted for execution.\")\r\n"
+      ],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        },
+        "gather": {
+          "logged": 1655246938658
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        }
+      }
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "name": "python38-azureml",
+      "language": "python",
+      "display_name": "Python 3.8 - AzureML"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.8.5",
+      "mimetype": "text/x-python",
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "pygments_lexer": "ipython3",
+      "nbconvert_exporter": "python",
+      "file_extension": ".py"
+    },
+    "kernel_info": {
+      "name": "python38-azureml"
+    },
+    "microsoft": {
+      "host": {
+        "AzureML": {
+          "notebookHasBeenCompleted": true
+        }
+      }
+    },
+    "nteract": {
+      "version": "nteract-front-end@1.0.0"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+}
--- a/notebooks/iJungle-tutorial.ipynb
+++ b/notebooks/iJungle-tutorial.ipynb
--- a/notebooks/requirements.txt
+++ b/notebooks/requirements.txt
@ -1,4 +0,0 @@
-jupyter
-pandas
-scikit-learn
-matplotlib
--- a/operation/parallel_inference.py
+++ b/operation/parallel_inference.py
@ -1,27 +0,0 @@
-import os
-import numpy as np
-from azureml.core import Model
-import joblib
-#import argparse
-
-#parser = argparse.ArgumentParser()
-#parser.add_argument("--id-feature", type=str, dest='id_feature', help='ID Freature')
-#args = parser.parse_args()
-#id_feat = str(args.id_feature)
-#print('id feature',  id_feat)
-
-
-def init():
-    # Runs when the pipeline step is initialized
-    global model
-
-    # load the model
-    model_path = Model.get_model_path('best_iforest.pkl')
-    model = joblib.load(model_path)
-    
-def run(mini_batch):
-    mini_batch.set_index('Van_Stock_Proposal_Detail_Id', inplace=True)
-    index_list = list(mini_batch.index)
-    y_pred = model.predict(mini_batch).tolist()
-    score = model.score_samples(mini_batch).tolist()
-    return(list(zip(index_list, y_pred, score)))
--- a/operation/parallel_train.py
+++ b/operation/parallel_train.py
@ -1,56 +0,0 @@
-# -*- coding: utf-8 -*-
-from azureml.core import Run
-import argparse
-import numpy as np
-import iJungle
-
-run = Run.get_context()
-print("iJungle version:", iJungle.__version__)
-run.log('iJungle_version', iJungle.__version__)
-
-parser = argparse.ArgumentParser()
-
-# Input Data
-parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset')
-parser.add_argument("--id-feature", type=str, dest='id_feature', help='ID Freature')
-parser.add_argument("--max-subsample-size", type=int, dest='max_sss', help='Max subsample size')
-parser.add_argument("--train-size", type=float, dest='train_size', help='Train size')
-
-# Hyper parameters
-parser.add_argument('--trees', type=int, dest='trees', default=100, help='Number of trees')
-parser.add_argument('--subsample-size', type=int, dest='subsample_size', default=8192, help='Subsample size')
-
-# Add arguments to args collection
-args = parser.parse_args()
-id_feat = str(args.id_feature)
-print('id feature',  id_feat)
-
-# Log Hyperparameter values
-trees = np.int(args.trees)
-subsample_size = np.int(args.subsample_size)
-print('trees',  trees)
-print('subsample_size',  subsample_size)
-run.log('trees',  trees)
-run.log('subsample_size',  subsample_size)
-
-# Other parameters
-max_sss = np.int(args.max_sss)
-train_size = np.float(args.train_size)
-print("Max subsample size", max_sss)
-print("Train size", train_size)
-run.log('max_sss',  max_sss)
-run.log('train_size',  train_size)
-
-# Load training data
-print("Loading Data...")
-df = run.input_datasets['training_data'].to_pandas_dataframe() # Get the training data from the estimator input
-df.set_index(id_feat, inplace=True)
-
-print("Starting training ...")
-model_filename = iJungle.model_train_fun(df, trees, subsample_size, train_size, max_sss)
-print(model_filename)
-
-# Log dummy metric
-run.log('Dummy', np.float(0))
-
-run.complete()
--- a/scripts/best_iforest.py
+++ b/scripts/best_iforest.py
@ -0,0 +1,71 @
+"""Select the best iForest among the trained iJungle candidates and register it.
+
+For every (subsample size, trees) pair the script loads the registered
+``iJungle_light_<trees>_<subsample_size>_results`` artifact, lets
+``iJungle.best_iforest_params`` choose the winner, and registers the selected
+forest as ``best_iforest.pkl``.
+"""
+import ast
+from azureml.core import Run, Model
+import argparse
+import pandas as pd
+import os
+import iJungle
+import joblib
+
+run = Run.get_context()
+parser = argparse.ArgumentParser()
+
+# Input Data
+parser.add_argument('--overhead-input', type=str, dest='overhead_input', help='Overhead input')
+parser.add_argument('--subsample-list', type=str, dest='subsample_list')
+parser.add_argument('--trees-list', type=str, dest='trees_list')
+
+# Add arguments to args collection
+args = parser.parse_args()
+overhead_input = args.overhead_input
+print("Overhead input", overhead_input)
+# Both lists arrive as Python literals (e.g. "[512, 1024]"). literal_eval parses
+# them without executing arbitrary code, which eval() would do.
+subsample_list = ast.literal_eval(args.subsample_list)
+print("subsample_list", subsample_list)
+trees_list = ast.literal_eval(args.trees_list)
+print("trees_list", trees_list)
+
+# Load the results of every candidate into a nested dict keyed by
+# subsample size and then by number of trees.
+print("Loading Models...")
+results_dic = {}
+for subsample_size in subsample_list:
+    results_dic[str(subsample_size)] = {}
+    for trees in trees_list:
+        model_name = 'iJungle_light_' + str(trees) + '_' + str(subsample_size) + '_results'
+        print(model_name)
+        model_path = Model.get_model_path(model_name)
+        print(model_path)
+        results_dic[str(subsample_size)][str(trees)] = joblib.load(model_path)
+
+results = pd.DataFrame(results_dic)
+
+# Calculating best iForest
+print("Calculating best iForest ...")
+best_subsample_size, best_trees, best_iF_k = iJungle.best_iforest_params(results)
+
+# Build the name from the winning parameters; the loop variables `trees` and
+# `subsample_size` only hold the last combination that was iterated.
+model_name = 'iJungle_light_' + str(best_trees) + '_' + str(best_subsample_size)
+print("Best iForest model name:", model_name)
+model_path = Model.get_model_path(model_name)
+print("Loading best iFor_list from ", model_path)
+iFor_list = joblib.load(model_path)
+model = iFor_list[best_iF_k]
+print("Model selected!")
+print("Registering model...")
+best_model_name = 'best_iforest.pkl'
+best_model_path = os.path.join(iJungle._MODEL_DIR, best_model_name)
+# Make sure the local model folder exists before writing the pickle.
+os.makedirs(iJungle._MODEL_DIR, exist_ok=True)
+joblib.dump(model, best_model_path)
+Model.register(workspace=run.experiment.workspace ,model_path=best_model_path, model_name=best_model_name)
+
+run.complete()
--- a/scripts/feat_eng.py
+++ b/scripts/feat_eng.py
@ -0,0 +1,79 @
+"""Feature engineering step: standardise every feature column of the input dataset.
+
+In training mode a ``StandardScaler`` is fitted and registered per feature; in
+inference mode the registered scalers are loaded and applied. The scaled data is
+written to ``prepped.parquet`` in the folder given by ``--prepped-data``.
+"""
+import os
+import argparse
+import pandas as pd
+from azureml.core import Run, Model
+from sklearn import preprocessing
+import joblib
+import numpy as np
+
+LOCAL_MODEL_PATH = 'outputs'
+# Single prefix shared by registration and lookup so that the scalers saved
+# during training are the ones found at inference time.
+SCALER_MODEL_PREFIX = 'ijungle_scaler_model_'
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input-data", type=str, dest='train_dataset_id')
+parser.add_argument('--prepped-data', type=str, dest='prepped_data')
+parser.add_argument('--index-feature', type=str, dest='index_feature')
+parser.add_argument('--training', type=str, dest='training')
+
+print('Loading parameters ...')
+args = parser.parse_args()
+save_folder = args.prepped_data
+index_feature = args.index_feature
+# The flag is passed as text; bool('False') is True, so compare the text itself.
+training = str(args.training).strip().lower() in ('true', '1', 'yes')
+
+print('save_folder', save_folder)
+print('index_feature', index_feature)
+print('training', training)
+
+run = Run.get_context()
+
+print("Loading data ...")
+df = run.input_datasets['input'].to_pandas_dataframe()
+print(df)
+print("Shape of data:",df.shape)
+
+print("Setting index ...")
+df.set_index(index_feature, inplace=True)
+
+if training:
+    # joblib.dump does not create missing folders.
+    os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)
+
+for feat in df.columns:
+    model_name = SCALER_MODEL_PREFIX + feat
+    if training:
+        print("Training and registering scaler for feature:", feat)
+        scaler = preprocessing.StandardScaler()
+        scaler.fit(df[[feat]])
+        file_name = os.path.join(LOCAL_MODEL_PATH, model_name + '.pkl')
+        joblib.dump(value=scaler, filename=file_name)
+        Model.register(
+            workspace=run.experiment.workspace,
+            model_path = file_name,
+            model_name = model_name
+        )
+    else:
+        print("Applying scaler for feature:", feat)
+        model_path = Model.get_model_path(model_name)
+        scaler = joblib.load(model_path)
+    # transform() returns an (n_rows, 1) array; flatten it back into the column.
+    df[feat] = scaler.transform(df[[feat]]).reshape(df.shape[0])
+
+print("Reseting index ...")
+df.reset_index(inplace=True)
+
+print("Saving Data...")
+os.makedirs(save_folder, exist_ok=True)
+save_path = os.path.join(save_folder,'prepped.parquet')
+df.to_parquet(save_path, index=False)
+
+run.complete()
--- a/scripts/inference.py
+++ b/scripts/inference.py
@ -0,0 +1,61 @
+"""Inference step: score the prepared data with the registered best iForest.
+
+Writes one row per record (id, prediction, score) to ``results.parquet`` in
+the folder given by ``--output``.
+"""
+import os
+import argparse
+import pandas as pd
+from azureml.core import Run, Model
+from sklearn import preprocessing
+import joblib
+import numpy as np
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", type=str, dest='input_')
+parser.add_argument("--output", type=str, dest='output_')
+parser.add_argument("--feat-id", type=str, dest='feat_id')
+
+run = Run.get_context()
+
+print("Reading parameters ...")
+args = parser.parse_args()
+output_dir = args.output_
+feat_id = args.feat_id
+print("Output dir:", output_dir)
+print("feat_id:", feat_id)
+
+
+print("Loading data ...")
+# The data is read from the named input dataset, not from the --input path.
+df = run.input_datasets['inference_data'].to_pandas_dataframe()
+df.set_index(feat_id, inplace=True)
+print(df)
+
+print("Loading model ...")
+model_name = 'best_iforest.pkl'
+model_path = Model.get_model_path(model_name)
+model = joblib.load(model_path)
+print("model", model)
+
+# Diagnostic only: shows which columns contain missing values.
+print(df.isnull().any())
+
+print("Making predictions ...")
+y_pred = model.predict(df)
+scores = model.score_samples(df)
+# Rows predicted as -1 are the ones counted as anomalies.
+print("Number of anomalies: ", len(y_pred[y_pred==-1]))
+
+print("Generating outputs ...")
+df_out = pd.DataFrame()
+df_out[feat_id] = df.index.values
+df_out['pred'] = y_pred
+df_out['score'] = scores
+
+# Create the output folder first, as feat_eng.py does; to_parquet will not.
+os.makedirs(output_dir, exist_ok=True)
+save_path = os.path.join(output_dir,'results.parquet')
+df_out.to_parquet(save_path, index=False)
+
+run.complete()
--- a/scripts/interpret.py
+++ b/scripts/interpret.py
@ -0,0 +1,124 @
+"""Explainability step: explain the records flagged as anomalies.
+
+Records whose score is at or below ``--anomaly-score`` are explained with a
+``TabularExplainer``; the last three entries of each ranked local feature list
+are appended to the record and the result is saved as a timestamped CSV.
+"""
+import os
+import argparse
+import pandas as pd
+from azureml.core import Run, Model
+from sklearn import preprocessing
+import joblib
+import numpy as np
+from interpret.ext.blackbox import TabularExplainer
+import time
+
+# Columns added by the explanation; also used to shape an empty result.
+EXPLANATION_COLUMNS = [
+    'Interpretation_Feature_1',
+    'Interpretation_Score_1',
+    'Interpretation_Feature_2',
+    'Interpretation_Score_2',
+    'Interpretation_Feature_3',
+    'Interpretation_Score_3',
+]
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", type=str, dest='input', help='inference dataset')
+parser.add_argument("--dataprep", type=str, dest='dataprep', help='dataset')
+parser.add_argument('--output', type=str, dest='output_dir', help='Folder for results')
+parser.add_argument('--index-id', type=str, dest='index_id')
+parser.add_argument('--anomaly-score', type=str, dest='anomaly_score')
+
+run = Run.get_context()
+
+print("Loading arguments ...")
+args = parser.parse_args()
+output_dir = args.output_dir
+index_id = args.index_id
+anomaly_score = float(args.anomaly_score)
+print("outpur_dir", output_dir)
+print("index_id", index_id)
+print("anomaly_score", anomaly_score)
+
+print("Loading results ...")
+df = run.input_datasets["interpret_input"].to_pandas_dataframe()
+print(df)
+print("Selecting anomalies ...")
+df = df[df['score']<=anomaly_score]
+print(df)
+
+if df.shape[0] > 0:
+    df_100 = df.sort_values("score", ascending=True).copy()
+    print(df_100)
+
+    print("Loading inference data ...")
+    df_inf = run.input_datasets["inference_data"].to_pandas_dataframe()
+    print(df_inf)
+
+    print("Creating input to interpret ...")
+    result_ids = df_100[index_id].values
+    df_inf = df_inf.loc[df_inf[index_id].isin(result_ids),:].copy()
+    X_explain = df_inf.set_index(index_id)
+    print(X_explain)
+
+    print("Loading model ...")
+    # Same registered name that best_iforest.py writes and inference.py reads.
+    model_name = 'best_iforest.pkl'
+    model_path = Model.get_model_path(model_name)
+    model = joblib.load(model_path)
+    print("model", model)
+
+    print("Creating explanations ...")
+    tab_explainer = TabularExplainer(model, X_explain)
+    print(tab_explainer)
+
+    # Get local explanations
+    local_tab_explanation = tab_explainer.explain_local(X_explain)
+
+    # Ranked feature names and importance values for the explained rows
+    local_tab_features = local_tab_explanation.get_ranked_local_names()
+    local_tab_importance = local_tab_explanation.get_ranked_local_values()
+
+    ls_explanations = []
+    for i, detail_id in enumerate(X_explain.index):
+        # NOTE(review): assumes the ranked lists are indexed [row][feature]
+        # (one entry per explained row) -- confirm for this model type. Each
+        # row must read its own entry; a fixed index 0 repeats the first row.
+        feat3, feat2, feat1 = tuple(local_tab_features[i][-3:])
+        score3, score2, score1 = tuple(local_tab_importance[i][-3:])
+        ls_explanations.append({
+            index_id:detail_id,
+            'Interpretation_Feature_1':feat1,
+            'Interpretation_Score_1':score1,
+            'Interpretation_Feature_2':feat2,
+            'Interpretation_Score_2':score2,
+            'Interpretation_Feature_3':feat3,
+            'Interpretation_Score_3':score3,
+        })
+
+    df_explanations = pd.DataFrame(ls_explanations)
+    print(df_explanations)
+    df_anomalies = df_100.merge(df_explanations, on=index_id, how='left')
+    print("Explanations:")
+    print(df_anomalies)
+else:
+    # No anomalies: keep the result columns so the CSV still has a header
+    # (pd.concat on an empty list would raise ValueError).
+    df_anomalies = df.reindex(columns=list(df.columns) + EXPLANATION_COLUMNS)
+
+print("Consolidating all anomalies ...")
+df_anomalies.sort_values(['score','Interpretation_Score_1'], ascending=True, inplace=True)
+print(df_anomalies)
+
+# NOTE(review): a merge adding extra columns from `df_sp` belongs here, but
+# `df_sp` is never defined in this script and using it raises NameError.
+# Reinstate the merge once the source dataset for those columns is available.
+
+print("Saving anomalies ...")
+os.makedirs(output_dir, exist_ok=True)
+timestr = time.strftime("%Y%m%d%H%M%S")
+df_anomalies.to_csv(os.path.join(output_dir,'anomalies_'+timestr+'.csv'), index=False)
+
+run.complete()
--- a/operation/parallel_overhead.py
+++ b/operation/parallel_overhead.py
@ -1,29 +1,34 @@
-# -*- coding: utf-8 -*-
-from azureml.core import Model, Run
+from azureml.core import Run, Model
 import argparse
+import pandas as pd
 import numpy as np
-import iJungle
 import joblib
+import os
+import iJungle
+import shutil

 run = Run.get_context()
-
-print("iJungle version:", iJungle.__version__)
-run.log('iJungle_version', iJungle.__version__)
-
 parser = argparse.ArgumentParser()

 # Input Data
-parser.add_argument("--input-data", type=str, dest='input_data', help='Overhead dataset')
-parser.add_argument("--id-feature", type=str, dest='id_feature', help='ID Freature')
+parser.add_argument("--overhead-folder", type=str, dest='overhead_folder', help='overhead data folder')
+parser.add_argument("--model-input", type=str, dest='model_input', help='model input folder')
+parser.add_argument("--overhead-output", type=str, dest='overhead_output', help='overhead output folder')
+parser.add_argument("--id-feat", type=str, dest='id_feat')

 # Hyper parameters
 parser.add_argument('--trees', type=int, dest='trees', default=100, help='Number of trees')
 parser.add_argument('--subsample-size', type=int, dest='subsample_size', default=8192, help='Subsample size')

+
 # Add arguments to args collection
 args = parser.parse_args()
-id_feat = str(args.id_feature)
-print('id feature',  id_feat)
+overhead_folder = args.overhead_folder
+print("Overhead folder", overhead_folder)
+model_input = args.model_input
+print("Model input", model_input)
+overhead_output = args.overhead_output
+print("Overhead output", overhead_output)

 # Log Hyperparameter values
 trees = np.int(args.trees)
@ -33,10 +38,18 @@ print('subsample_size',  subsample_size)
 run.log('trees',  trees)
 run.log('subsample_size',  subsample_size)

+# Other parameters
+id_feat = args.id_feat
+print("id_feat", id_feat)
+run.log('id_feat',  id_feat)
+
 # Load training data
 print("Loading Data...")
-W = run.input_datasets['overhead_data'].to_pandas_dataframe() # Get the training data from the estimator input
+load_path = os.path.join(overhead_folder,'W.parquet')
+W = pd.read_parquet(load_path)
 W.set_index(id_feat, inplace=True)
+print("Overhead Data loaded. Shape:", W.shape)
+

 # Load iFor_list pickle
 print("Loading pickle...")
@ -54,7 +67,21 @@ results_filename = os.path.join(iJungle._MODEL_DIR, model_name + '_results.pkl')
 print("Writing results:", results_filename)
 joblib.dump(value=results, filename=results_filename)

+model_name = 'iJungle_light_' + str(trees) + '_' + str(subsample_size) + '_results'
+print(model_name)
+
+print('Registering model...')
+Model.register(
+    workspace=run.experiment.workspace,
+    model_path = results_filename,
+    model_name = model_name,
+    properties={
+        'trees':trees,
+        'subsample_size':subsample_size})
+
 # Log dummy metric
 run.log('Dummy', np.float(0))

+shutil.copy(results_filename, os.path.join(overhead_output, model_name + '.pkl'))
+
 run.complete()
--- a/scripts/overhead_ds.py
+++ b/scripts/overhead_ds.py
@ -0,0 +1,55 @
+"""Overhead sampling step: build the overhead dataset W from the prepared data.
+
+Reads ``prepped.parquet`` from ``--input-data``, passes the ratio
+``min(1, overhead_expected_m / n_records)`` to ``iJungle.select_overhead_data``
+and writes the result to ``W.parquet`` in ``--overhead-data``.
+"""
+from azureml.core import Run
+import argparse
+import pandas as pd
+import os
+import iJungle
+
+run = Run.get_context()
+parser = argparse.ArgumentParser()
+
+# Input Data
+parser.add_argument("--input-data", type=str, dest='prepped_data', help='Prepped data')
+parser.add_argument('--overhead-data', type=str, dest='overhead_data', help='Overhead data')
+parser.add_argument('--overhead-expected-m', type=str, dest='overhead_expected_m')
+
+# Add arguments to args collection
+args = parser.parse_args()
+prepped_data = args.prepped_data
+print("Prepped folder", prepped_data)
+overhead_data = args.overhead_data
+print("Overhead folder", overhead_data)
+overhead_expected_m = int(args.overhead_expected_m)
+print("overhead_expected_m", overhead_expected_m)
+
+# Load training data
+print("Loading Data...")
+load_path = os.path.join(prepped_data,'prepped.parquet')
+df = pd.read_parquet(load_path)
+print("Data loaded. Shape:", df.shape)
+
+# Overhead sample size calculation
+n_records = df.shape[0]
+if n_records == 0:
+    # Fail with a clear message instead of a ZeroDivisionError below.
+    raise ValueError("Prepared dataset is empty: " + load_path)
+# Ratio of requested records to available records, capped at 1.
+overhead_size = min(1,overhead_expected_m/n_records)
+print("Overhead size", overhead_size)
+run.log('overhead_size',  overhead_size)
+
+W = iJungle.select_overhead_data(df, overhead_size=overhead_size)
+print("Overhead shape", W.shape)
+
+print("Saving Data...")
+# to_parquet does not create missing folders.
+os.makedirs(overhead_data, exist_ok=True)
+save_path = os.path.join(overhead_data,'W.parquet')
+W.to_parquet(save_path, index=False)
+
+run.complete()
--- a/scripts/training.py
+++ b/scripts/training.py
@ -0,0 +1,96 @
+"""Training step: train one iJungle model for a (trees, subsample size) pair.
+
+The trained model file is registered as ``iJungle_light_<trees>_<subsample_size>``
+and copied to the folder given by ``--model-output``.
+"""
+from azureml.core import Run, Model
+import argparse
+import pandas as pd
+import numpy as np
+import joblib
+import os
+import iJungle
+import shutil
+
+run = Run.get_context()
+parser = argparse.ArgumentParser()
+
+# Input Data
+parser.add_argument("--training-folder", type=str, dest='training_folder', help='training data folder')
+parser.add_argument("--max-subsample-size", type=int, dest='max_sss', help='Max subsample size')
+parser.add_argument("--model-output", type=str, dest='model_output', help='model output folder')
+parser.add_argument("--id-feat", type=str, dest='id_feat')
+parser.add_argument("--train-expected-m", type=str, dest='train_expected_m')
+
+# Hyper parameters
+parser.add_argument('--trees', type=int, dest='trees', default=100, help='Number of trees')
+parser.add_argument('--subsample-size', type=int, dest='subsample_size', default=8192, help='Subsample size')
+
+
+# Add arguments to args collection
+args = parser.parse_args()
+training_folder = args.training_folder
+print("Training folder", training_folder)
+model_output = args.model_output
+print("Model output", model_output)
+id_feat = args.id_feat
+print("id_feat", id_feat)
+train_expected_m = int(args.train_expected_m)
+print("train_expected_m", train_expected_m)
+
+# Log Hyperparameter values
+# Built-in int/float are used because the np.int/np.float aliases were
+# deprecated in NumPy 1.20 and removed in 1.24.
+trees = int(args.trees)
+subsample_size = int(args.subsample_size)
+print('trees',  trees)
+print('subsample_size',  subsample_size)
+run.log('trees',  trees)
+run.log('subsample_size',  subsample_size)
+
+# Other parameters
+max_sss = int(args.max_sss)
+print("Max subsample size", max_sss)
+run.log('max_sss',  max_sss)
+
+# Load training data
+print("Loading Data...")
+load_path = os.path.join(training_folder,'prepped.parquet')
+df = pd.read_parquet(load_path)
+df.set_index(id_feat, inplace=True)
+
+# Train sample size calculation
+n_records = df.shape[0]
+if n_records == 0:
+    # Fail with a clear message instead of a ZeroDivisionError below.
+    raise ValueError("Training dataset is empty: " + load_path)
+# Ratio of requested records to available records, capped at 1.
+train_size = min(1,train_expected_m/n_records)
+print("Train size", train_size)
+run.log('train_size',  train_size)
+
+print("Starting training ...")
+model_filename = iJungle.model_train_fun(df, trees, subsample_size, train_size, max_sss)
+print(model_filename)
+model_name = 'iJungle_light_' + str(trees) + '_' + str(subsample_size)
+print(model_name)
+
+model_path = os.path.join(iJungle._MODEL_DIR, model_filename)
+
+print('Registering model...')
+Model.register(
+    workspace=run.experiment.workspace,
+    model_path = model_path,
+    model_name = model_name,
+    properties={
+        'trees':trees,
+        'subsample_size':subsample_size})
+
+# Log dummy metric
+run.log('Dummy', float(0))
+
+# shutil.copy needs the destination folder to exist.
+os.makedirs(model_output, exist_ok=True)
+shutil.copy(model_path, os.path.join(model_output,model_name + '.pkl'))
+
+run.complete()
--- a/src/iJungle/config.py
+++ b/src/iJungle/config.py
@ -1,2 +1,2 @@
-__version__ = '0.1.73'
+__version__ = '0.2.0'
 _MODEL_DIR = 'outputs'