pipeline creation
This commit is contained in:
Родитель
74946cffa2
Коммит
ced8867fb6
|
@ -0,0 +1,651 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# iJungle Tutorial Training Pipeline Example\r\n",
|
||||
"\r\n",
|
||||
"*TODO: Summary of the iJungle technique* \r\n"
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import iJungle\n",
|
||||
"from azureml.core import Workspace, Datastore, Dataset, Experiment, Environment, ScriptRunConfig\n",
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"from azureml.core.conda_dependencies import CondaDependencies\n",
|
||||
"from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice\n",
|
||||
"from azureml.core.runconfig import RunConfiguration\n",
|
||||
"from azureml.pipeline.core import Pipeline\n",
|
||||
"from azureml.pipeline.steps import PythonScriptStep, HyperDriveStep, HyperDriveStepRun\n",
|
||||
"from azureml.data import OutputFileDatasetConfig\n",
|
||||
"\n",
|
||||
"print(\"iJungle version:\", iJungle.__version__)"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "iJungle version: 0.1.73\n"
|
||||
}
|
||||
],
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1655239329501
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# 1. Parameters definition"
|
||||
],
|
||||
"metadata": {
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"cluster_name = \"cluster4\"\r\n",
|
||||
"environment_name = \"ijungle-training-env\"\r\n",
|
||||
"working_datastore_name = \"workspaceblobstore\"\r\n",
|
||||
"training_dataset_name = \"ijungle-trainining-dataset\"\r\n",
|
||||
"test_dataset_name = \"ijungle-test-dataset\"\r\n",
|
||||
"y_test_dataset_name = \"ijungle-y-test-dataset\"\r\n",
|
||||
"index_feature = 'index'\r\n",
|
||||
"pipeline_name = \"ijungle-training-pipeline\"\r\n",
|
||||
"subsample_list = [4096, 2048, 1024, 512]\r\n",
|
||||
"trees_list = [500, 100, 20, 10]\r\n",
|
||||
"train_expected_m = 50000\r\n",
|
||||
"overhead_expected_m = 50000\r\n",
|
||||
"\r\n"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655239329609
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# 2. Preparation of cluster, environment and run configuration"
|
||||
],
|
||||
"metadata": {
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"\r\n",
|
||||
"ws = Workspace.from_config()\r\n",
|
||||
"\r\n",
|
||||
"# Reuse the cluster when it already exists; otherwise provision a new one\r\n",
|
||||
"try:\r\n",
|
||||
" pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)\r\n",
|
||||
" print('Found existing cluster, use it.')\r\n",
|
||||
"except ComputeTargetException:\r\n",
|
||||
" # Only a missing compute target triggers provisioning; any other failure propagates\r\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)\r\n",
|
||||
" pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)\r\n",
|
||||
"\r\n",
|
||||
"# Creation of environment\r\n",
|
||||
"new_env = Environment(environment_name)\r\n",
|
||||
"packages = CondaDependencies.create(\r\n",
|
||||
" conda_packages=['pip'],\r\n",
|
||||
" pip_packages=['azureml-defaults','scikit-learn','pandas','pyarrow'])\r\n",
|
||||
"\r\n",
|
||||
"# Add iJungle library\r\n",
|
||||
"\r\n",
|
||||
"whl_filename = \"../dist/iJungle-\"+iJungle.__version__+\"-py3-none-any.whl\"\r\n",
|
||||
"\r\n",
|
||||
"whl_url = Environment.add_private_pip_wheel(workspace=ws,file_path = whl_filename, exist_ok=True)\r\n",
|
||||
"packages.add_pip_package(whl_url)\r\n",
|
||||
"\r\n",
|
||||
"\r\n",
|
||||
"# Add the dependencies to the environment\r\n",
|
||||
"new_env.python.conda_dependencies = packages\r\n",
|
||||
"\r\n",
|
||||
"# Register the environment \r\n",
|
||||
"new_env.register(workspace=ws)\r\n",
|
||||
"registered_env = Environment.get(ws, environment_name)\r\n",
|
||||
"\r\n",
|
||||
"# Create a new runconfig object for the pipeline\r\n",
|
||||
"pipeline_run_config = RunConfiguration()\r\n",
|
||||
"\r\n",
|
||||
"# Use the compute you created above. \r\n",
|
||||
"pipeline_run_config.target = pipeline_cluster\r\n",
|
||||
"\r\n",
|
||||
"# Assign the environment to the run configuration\r\n",
|
||||
"pipeline_run_config.environment = registered_env\r\n",
|
||||
"\r\n",
|
||||
"print (\"Run configuration created.\")\r\n"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Found existing cluster, use it.\nRun configuration created.\n"
|
||||
}
|
||||
],
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655239330859
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"\n",
|
||||
"# 3. Data preparation and dataset registration\n",
|
||||
"\n",
|
||||
"*TODO: description of the data*\n",
|
||||
"\n",
|
||||
"1. Place the following KDD files in the `../data` directory (relative to this notebook) *TODO: KDD url to download the files*\n",
|
||||
" - kddcup.names\n",
|
||||
" - kddcup.data.gz\n",
|
||||
" - corrected.gz"
|
||||
],
|
||||
"metadata": {
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"## Move to data directory\n",
|
||||
"os.chdir(os.path.dirname(os.path.abspath('__file__'))+'/../data')\n",
|
||||
"\n",
|
||||
"## Generate DataFrame with kdd data(csv format)\n",
|
||||
"names = list(pd.read_csv('kddcup.names',sep=':', header=None)[0])\n",
|
||||
"df = pd.read_csv('kddcup.data.gz', header=None, names=names)\n",
|
||||
"df_test = pd.read_csv('corrected.gz', header=None, names=names)\n",
|
||||
"\n",
|
||||
"print(\"Shape of raw data:\", df.shape)\n",
|
||||
"print(\"Shape of test data:\", df_test.shape)\n",
|
||||
"\n",
|
||||
"# Keep only the entries whose service is HTTP\n",
|
||||
"df = df[df.service == 'http']\n",
|
||||
"df_test = df_test[df_test.service == 'http']\n",
|
||||
"print(\"Shape of filtered train data:\", df.shape)\n",
|
||||
"print(\"Shape of filtered test data:\", df_test.shape)\n",
|
||||
"\n",
|
||||
"# Preparation of labels\n",
|
||||
"y_train = df.pop('label')\n",
|
||||
"y_test = df_test.pop('label')\n",
|
||||
"y_train = pd.Series([1 if val == 'normal.' else -1 for val in y_train], name=\"y\")\n",
|
||||
"y_test = pd.Series([1 if val == 'normal.' else -1 for val in y_test], name=\"y\")\n",
|
||||
"print(\"Shape of train labels:\", y_train.shape)\n",
|
||||
"print(\"Shape of test labels:\", y_test.shape)\n",
|
||||
"\n",
|
||||
"# Final preparation of training and testing data\n",
|
||||
"df.drop(['service'], axis=1, inplace=True)\n",
|
||||
"df_test.drop(['service'], axis=1, inplace=True)\n",
|
||||
"\n",
|
||||
"cat_columns = ['protocol_type', 'flag']\n",
|
||||
"\n",
|
||||
"for col in cat_columns:\n",
|
||||
" df_test[col] = df_test[col].astype('category')\n",
|
||||
" df[col] = df[col].astype('category')\n",
|
||||
"\n",
|
||||
"cat_columns = df.select_dtypes(['category']).columns\n",
|
||||
"df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)\n",
|
||||
"\n",
|
||||
"cat_columns = df_test.select_dtypes(['category']).columns\n",
|
||||
"df_test[cat_columns] = df_test[cat_columns].apply(lambda x: x.cat.codes)\n",
|
||||
"\n",
|
||||
"df.reset_index(inplace=True)\n",
|
||||
"df_test.reset_index(inplace=True)\n",
|
||||
"df_y_test = y_test.reset_index()\n",
|
||||
"\n",
|
||||
"print(\"Shape of train data:\", df.shape)\n",
|
||||
"print(\"Shape of test data:\", df_test.shape)\n",
|
||||
"print(\"Shape of y-test data:\", df_y_test.shape)\n",
|
||||
"\n",
|
||||
"datastore = Datastore.get(ws, working_datastore_name)\n",
|
||||
"\n",
|
||||
"print(\"Registering training dataset ...\")\n",
|
||||
"train_dataset = Dataset.Tabular.register_pandas_dataframe(df, datastore, training_dataset_name)\n",
|
||||
"\n",
|
||||
"print(\"Registering testing dataset ...\")\n",
|
||||
"test_dataset = Dataset.Tabular.register_pandas_dataframe(df_test, datastore, test_dataset_name)\n",
|
||||
"\n",
|
||||
"print(\"Registering y-testing dataset ...\")\n",
|
||||
"y_test_dataset = Dataset.Tabular.register_pandas_dataframe(df_y_test, datastore, y_test_dataset_name)\n"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Shape of raw data: (4898431, 42)\nShape of test data: (311029, 42)\nShape of filtered train data: (623091, 42)\nShape of filtered test data: (41237, 42)\nShape of train labels: (623091,)\nShape of test labels: (41237,)\nShape of train data: (623091, 41)\nShape of test data: (41237, 41)\nShape of y-test data: (41237, 2)\nRegistering training dataset ...\nValidating arguments.\nArguments validated.\nSuccessfully obtained datastore reference and path.\nUploading file to managed-dataset/bd6448b8-8c72-4b54-be45-cd4e6ab5e212/\nSuccessfully uploaded file to datastore.\nCreating and registering a new dataset.\nSuccessfully created and registered a new dataset.\nRegistering testing dataset ...\nValidating arguments.\nArguments validated.\nSuccessfully obtained datastore reference and path.\nUploading file to managed-dataset/8d4f60e5-0127-4d71-8720-b0290a26ebce/\nSuccessfully uploaded file to datastore.\nCreating and registering a new dataset.\nSuccessfully created and registered a new dataset.\nRegistering y-testing dataset ...\nValidating arguments.\nArguments validated.\nSuccessfully obtained datastore reference and path.\nUploading file to managed-dataset/7fd41a9d-8dd1-4ebd-9a69-26cccf32c490/\nSuccessfully uploaded file to datastore.\nCreating and registering a new dataset.\nSuccessfully created and registered a new dataset.\n"
|
||||
}
|
||||
],
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655239359512
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# 4. Creation of training pipeline"
|
||||
],
|
||||
"metadata": {
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Get the training dataset\r\n",
|
||||
"train_ds = ws.datasets.get(training_dataset_name)\r\n",
|
||||
"\r\n",
|
||||
"# Intermediate data\r\n",
|
||||
"dataprep_output = OutputFileDatasetConfig(\r\n",
|
||||
" name=\"processed_data\", \r\n",
|
||||
" destination=(\r\n",
|
||||
" ws.datastores.get(working_datastore_name), \r\n",
|
||||
" \"invoices/{run-id}/{output-name}\")\r\n",
|
||||
").as_upload()\r\n",
|
||||
"\r\n",
|
||||
"# Step 1, Run the data prep script\r\n",
|
||||
"prep_step = PythonScriptStep(\r\n",
|
||||
" name = \"Feature engineering Step\",\r\n",
|
||||
" source_directory = \"../scripts\",\r\n",
|
||||
" script_name = \"feat_eng.py\",\r\n",
|
||||
" arguments = [\r\n",
|
||||
" '--input-data', train_ds.as_named_input('input'),\r\n",
|
||||
" '--prepped-data', dataprep_output,\r\n",
|
||||
" '--index-feature', index_feature, \r\n",
|
||||
" '--training', 'True' \r\n",
|
||||
" ],\r\n",
|
||||
" outputs=[dataprep_output],\r\n",
|
||||
" compute_target = pipeline_cluster,\r\n",
|
||||
" runconfig = pipeline_run_config,\r\n",
|
||||
" allow_reuse = False\r\n",
|
||||
")\r\n",
|
||||
"\r\n",
|
||||
"# Initial definition of the pipeline steps\r\n",
|
||||
"pipeline_steps = [prep_step]\r\n"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1655239360524
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Next Step, run the training script\r\n",
|
||||
"\r\n",
|
||||
"dataprep_input = dataprep_output.as_input()\r\n",
|
||||
"node_count = int(pipeline_cluster.serialize()['properties']['properties']['scaleSettings']['maxNodeCount'])\r\n",
|
||||
"\r\n",
|
||||
"model_output_dir = OutputFileDatasetConfig(\r\n",
|
||||
" name=\"model_output\", \r\n",
|
||||
" destination=(\r\n",
|
||||
" ws.datastores.get(working_datastore_name), \r\n",
|
||||
" \"invoices/{run-id}/{output-name}\")\r\n",
|
||||
").as_upload()\r\n",
|
||||
"\r\n",
|
||||
"script_config = ScriptRunConfig(\r\n",
|
||||
" source_directory=\"../scripts\",\r\n",
|
||||
" script=\"training.py\",\r\n",
|
||||
" arguments = [\r\n",
|
||||
" '--training-folder', dataprep_input,\r\n",
|
||||
" '--max-subsample-size', max(subsample_list),\r\n",
|
||||
" '--model-output', model_output_dir,\r\n",
|
||||
" '--id-feat', index_feature,\r\n",
|
||||
" '--train-expected-m', train_expected_m\r\n",
|
||||
" ],\r\n",
|
||||
" run_config = pipeline_run_config\r\n",
|
||||
")\r\n",
|
||||
"\r\n",
|
||||
"params = GridParameterSampling(\r\n",
|
||||
" {\r\n",
|
||||
" '--trees': choice(trees_list),\r\n",
|
||||
" '--subsample-size' : choice(subsample_list)\r\n",
|
||||
" }\r\n",
|
||||
")\r\n",
|
||||
"\r\n",
|
||||
"hyperdrive_config = HyperDriveConfig(\r\n",
|
||||
" run_config = script_config, \r\n",
|
||||
" hyperparameter_sampling = params, \r\n",
|
||||
" policy = None, \r\n",
|
||||
" primary_metric_name = 'Dummy', \r\n",
|
||||
" primary_metric_goal = PrimaryMetricGoal.MAXIMIZE, \r\n",
|
||||
" max_total_runs = len(trees_list)*len(subsample_list), \r\n",
|
||||
" max_concurrent_runs = node_count\r\n",
|
||||
") \r\n",
|
||||
"\r\n",
|
||||
"train_step = HyperDriveStep(\r\n",
|
||||
" name = \"iJungle Trainining Step\", \r\n",
|
||||
" hyperdrive_config = hyperdrive_config, \r\n",
|
||||
" inputs=[dataprep_input],\r\n",
|
||||
" outputs=[model_output_dir],\r\n",
|
||||
" allow_reuse=False\r\n",
|
||||
")\r\n",
|
||||
"\r\n",
|
||||
"pipeline_steps.append(train_step)"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655239360652
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Next step, overhead dataset calculation\r\n",
|
||||
"\r\n",
|
||||
"overhead_ds_output = OutputFileDatasetConfig(\r\n",
|
||||
" name=\"overhead_ds_output\", \r\n",
|
||||
" destination=(\r\n",
|
||||
" ws.datastores.get(working_datastore_name), \r\n",
|
||||
" \"invoices/{run-id}/{output-name}\")\r\n",
|
||||
").as_upload()\r\n",
|
||||
"\r\n",
|
||||
"overhead_ds_step = PythonScriptStep(\r\n",
|
||||
" name = \"Overhead Dataset Step\",\r\n",
|
||||
" source_directory = \"../scripts\",\r\n",
|
||||
" script_name = \"overhead_ds.py\",\r\n",
|
||||
" arguments = [\r\n",
|
||||
" '--input-data', dataprep_input,\r\n",
|
||||
" '--overhead-data', overhead_ds_output,\r\n",
|
||||
" '--overhead-expected-m', overhead_expected_m\r\n",
|
||||
" ],\r\n",
|
||||
" inputs=[dataprep_input],\r\n",
|
||||
" outputs=[overhead_ds_output],\r\n",
|
||||
" compute_target = pipeline_cluster,\r\n",
|
||||
" runconfig = pipeline_run_config,\r\n",
|
||||
" allow_reuse = False\r\n",
|
||||
")\r\n",
|
||||
"pipeline_steps.append(overhead_ds_step)\r\n"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655239360833
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Next step, run the overhead script\r\n",
|
||||
"\r\n",
|
||||
"model_input_dir = model_output_dir.as_input()\r\n",
|
||||
"overhead_ds_input = overhead_ds_output.as_input()\r\n",
|
||||
"\r\n",
|
||||
"overhead_output = OutputFileDatasetConfig(\r\n",
|
||||
" name=\"overhead_output\", \r\n",
|
||||
" destination=(\r\n",
|
||||
" ws.datastores.get(working_datastore_name), \r\n",
|
||||
" \"invoices/{run-id}/{output-name}\")\r\n",
|
||||
").as_upload()\r\n",
|
||||
"\r\n",
|
||||
"script_config = ScriptRunConfig(\r\n",
|
||||
" source_directory=\"../scripts\",\r\n",
|
||||
" script=\"overhead.py\",\r\n",
|
||||
" arguments = [\r\n",
|
||||
" '--overhead-folder', overhead_ds_input,\r\n",
|
||||
" '--model-input', model_input_dir,\r\n",
|
||||
" '--overhead-output', overhead_output,\r\n",
|
||||
" '--id-feat', index_feature\r\n",
|
||||
" ],\r\n",
|
||||
" run_config = pipeline_run_config\r\n",
|
||||
")\r\n",
|
||||
"\r\n",
|
||||
"params = GridParameterSampling(\r\n",
|
||||
" {\r\n",
|
||||
" '--trees': choice(trees_list),\r\n",
|
||||
" '--subsample-size' : choice(subsample_list)\r\n",
|
||||
" }\r\n",
|
||||
")\r\n",
|
||||
"\r\n",
|
||||
"hyperdrive_config = HyperDriveConfig(\r\n",
|
||||
" run_config = script_config, \r\n",
|
||||
" hyperparameter_sampling = params, \r\n",
|
||||
" policy = None, \r\n",
|
||||
" primary_metric_name = 'Dummy', \r\n",
|
||||
" primary_metric_goal = PrimaryMetricGoal.MAXIMIZE, \r\n",
|
||||
" max_total_runs = len(trees_list)*len(subsample_list), \r\n",
|
||||
" max_concurrent_runs = node_count\r\n",
|
||||
") \r\n",
|
||||
"\r\n",
|
||||
"overhead_step = HyperDriveStep(\r\n",
|
||||
" name = \"iJungle Overhead Step\", \r\n",
|
||||
" hyperdrive_config = hyperdrive_config, \r\n",
|
||||
" inputs=[overhead_ds_input, model_input_dir],\r\n",
|
||||
" outputs=[overhead_output],\r\n",
|
||||
" allow_reuse=False\r\n",
|
||||
")\r\n",
|
||||
"\r\n",
|
||||
"pipeline_steps.append(overhead_step)"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655239360929
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Next step, find the representative iForest\r\n",
|
||||
"\r\n",
|
||||
"overhead_input = overhead_output.as_input()\r\n",
|
||||
"\r\n",
|
||||
"best_iforest_step = PythonScriptStep(\r\n",
|
||||
" name = \"Best iForest Step\",\r\n",
|
||||
" source_directory = \"../scripts\",\r\n",
|
||||
" script_name = \"best_iforest.py\",\r\n",
|
||||
" arguments = [\r\n",
|
||||
" '--overhead-input', overhead_input,\r\n",
|
||||
" '--subsample-list', str(subsample_list),\r\n",
|
||||
" '--trees-list', str(trees_list)\r\n",
|
||||
" ],\r\n",
|
||||
" inputs=[overhead_input],\r\n",
|
||||
" compute_target = pipeline_cluster,\r\n",
|
||||
" runconfig = pipeline_run_config,\r\n",
|
||||
" allow_reuse = False\r\n",
|
||||
")\r\n",
|
||||
"pipeline_steps.append(best_iforest_step)"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655239361026
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Construct the pipeline\r\n",
|
||||
"pipeline = Pipeline(workspace=ws, steps=pipeline_steps)\r\n",
|
||||
"print(\"Pipeline is built.\")\r\n",
|
||||
"\r\n",
|
||||
"# Create an experiment and run the pipeline\r\n",
|
||||
"experiment = Experiment(workspace=ws, name = pipeline_name)\r\n",
|
||||
"pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)\r\n",
|
||||
"print(\"Pipeline submitted for execution.\")\r\n"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Pipeline is built.\nCreated step Feature engineering Step [1dab3f3a][da82670e-3f7e-4be0-88f6-507debe692f6], (This step will run and generate new outputs)\nCreated step iJungle Trainining Step [6b78089a][3c5b501f-987f-4dea-adae-e9613855b04c], (This step will run and generate new outputs)Created step Overhead Dataset Step [03970c0c][d8d83657-f3f1-4806-97c6-f40170b8f0b0], (This step will run and generate new outputs)\n\nCreated step iJungle Overhead Step [5e5520f5][074c153a-7a82-4e83-96da-6e62b1d5cedb], (This step will run and generate new outputs)\nCreated step Best iForest Step [26f37c61][5c436f27-c6b8-42b9-a0bd-5b91a3ea06eb], (This step will run and generate new outputs)\nSubmitted PipelineRun d571febb-ba82-4190-874a-4823dd9e978d\nLink to Azure Machine Learning Portal: https://ml.azure.com/runs/d571febb-ba82-4190-874a-4823dd9e978d?wsid=/subscriptions/d412dac0-d902-4cfb-b2f9-19dea115f7ff/resourcegroups/rg-dv-aidnaanomaly-corp-eus2/workspaces/wsmldvanomaly&tid=973ba820-4a58-4246-84bf-170e50b3152a\nPipeline submitted for execution.\n"
|
||||
}
|
||||
],
|
||||
"execution_count": 10,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655239374520
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernel_info": {
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python38-azureml",
|
||||
"language": "python",
|
||||
"display_name": "Python 3.8 - AzureML"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.8.5",
|
||||
"mimetype": "text/x-python",
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"pygments_lexer": "ipython3",
|
||||
"nbconvert_exporter": "python",
|
||||
"file_extension": ".py"
|
||||
},
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
},
|
||||
"microsoft": {
|
||||
"host": {
|
||||
"AzureML": {
|
||||
"notebookHasBeenCompleted": true
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
|
@ -0,0 +1,402 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# DO NOT START THIS NOTEBOOK UNTIL THE IJUNGLE TRAINING PIPELINE CREATED IN THE PREVIOUS NOTEBOOK HAS REACHED \"COMPLETED\" STATUS.\r\n",
|
||||
"\r\n",
|
||||
"# iJungle Inference pipeline"
|
||||
],
|
||||
"metadata": {
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"from azureml.core import Workspace, Environment, Experiment, ScriptRunConfig\r\n",
|
||||
"from azureml.core.compute import ComputeTarget\r\n",
|
||||
"from azureml.core.conda_dependencies import CondaDependencies\r\n",
|
||||
"from azureml.core.runconfig import RunConfiguration\r\n",
|
||||
"from azureml.pipeline.core import Pipeline\r\n",
|
||||
"from azureml.pipeline.steps import PythonScriptStep, ParallelRunConfig, ParallelRunStep\r\n",
|
||||
"from azureml.data import OutputFileDatasetConfig"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655246932882
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"cluster_name = \"cluster4\"\r\n",
|
||||
"environment_name = \"ijungle-inference-env\"\r\n",
|
||||
"input_dataset_name=\"ijungle-test-dataset\"\r\n",
|
||||
"working_datastore_name=\"workspaceblobstore\"\r\n",
|
||||
"output_datastore_name=\"workspaceblobstore\"\r\n",
|
||||
"output_path=\"iJungle/results/\"\r\n",
|
||||
"pipeline_name=\"ijungle-inference-pipeline\"\r\n",
|
||||
"\r\n",
|
||||
"index_feature = 'index'\r\n",
|
||||
"anomaly_score = -.8"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655246933174
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"ws = Workspace.from_config()\r\n",
|
||||
"pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)\r\n",
|
||||
"print('Cluster configured to execute the pipeline:',cluster_name)"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Cluster configured to execute the pipeline: cluster4\n"
|
||||
}
|
||||
],
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655246934825
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"new_env = Environment(environment_name)\r\n",
|
||||
"packages = CondaDependencies.create(\r\n",
|
||||
" conda_packages=['pip'],\r\n",
|
||||
" pip_packages=['azureml-defaults','azureml-interpret','scikit-learn','pandas','pyarrow'])\r\n",
|
||||
"\r\n",
|
||||
"# Add the dependencies to the environment\r\n",
|
||||
"new_env.python.conda_dependencies = packages\r\n",
|
||||
"\r\n",
|
||||
"# Register the environment \r\n",
|
||||
"new_env.register(workspace=ws)\r\n",
|
||||
"registered_env = Environment.get(ws, environment_name)\r\n",
|
||||
"\r\n",
|
||||
"# Create a new runconfig object for the pipeline\r\n",
|
||||
"pipeline_run_config = RunConfiguration()\r\n",
|
||||
"\r\n",
|
||||
"# Use the compute target retrieved above.\r\n",
|
||||
"pipeline_run_config.target = pipeline_cluster\r\n",
|
||||
"\r\n",
|
||||
"# Assign the environment to the run configuration\r\n",
|
||||
"pipeline_run_config.environment = registered_env\r\n",
|
||||
"\r\n",
|
||||
"print (\"Run configuration created.\")\r\n"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Run configuration created.\n"
|
||||
}
|
||||
],
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655246935455
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Get the inference dataset\r\n",
|
||||
"inference_ds = ws.datasets.get(input_dataset_name)\r\n",
|
||||
"\r\n",
|
||||
"# Intermediate data\r\n",
|
||||
"dataprep_output = OutputFileDatasetConfig(\r\n",
|
||||
" name=\"processed_data\", \r\n",
|
||||
" destination=(\r\n",
|
||||
" ws.datastores.get(working_datastore_name), \r\n",
|
||||
" \"invoices/{run-id}/{output-name}\")\r\n",
|
||||
").as_upload()\r\n",
|
||||
"\r\n",
|
||||
"# Step 1, Run the data prep script\r\n",
|
||||
"prep_step = PythonScriptStep(\r\n",
|
||||
" name = \"Inference data preparation Step\",\r\n",
|
||||
" source_directory = \"../scripts\",\r\n",
|
||||
" script_name = \"feat_eng.py\",\r\n",
|
||||
" arguments = [\r\n",
|
||||
" '--input-data', inference_ds.as_named_input('input'),\r\n",
|
||||
" '--prepped-data', dataprep_output,\r\n",
|
||||
" '--index-feature', index_feature,\r\n",
|
||||
" '--training', 'False', \r\n",
|
||||
" ],\r\n",
|
||||
" outputs=[dataprep_output],\r\n",
|
||||
" compute_target = pipeline_cluster,\r\n",
|
||||
" runconfig = pipeline_run_config,\r\n",
|
||||
" allow_reuse = False\r\n",
|
||||
")\r\n",
|
||||
"\r\n",
|
||||
"# Initial definition of the pipeline steps\r\n",
|
||||
"pipeline_steps = [prep_step]\r\n"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655246936585
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Next Step, run the inferencing script\r\n",
|
||||
"\r\n",
|
||||
"node_count = int(pipeline_cluster.serialize()['properties']['properties']['scaleSettings']['maxNodeCount'])\r\n",
|
||||
"\r\n",
|
||||
"dataprep_input = dataprep_output.read_parquet_files().as_input(\"inference_data\")\r\n",
|
||||
"\r\n",
|
||||
"inference_output_dir = OutputFileDatasetConfig(\r\n",
|
||||
" name=\"inference_output\", \r\n",
|
||||
" destination=(\r\n",
|
||||
" ws.datastores.get(working_datastore_name), \r\n",
|
||||
" \"invoices/{run-id}/{output-name}\")\r\n",
|
||||
").as_upload()\r\n",
|
||||
"\r\n",
|
||||
"inference_step = PythonScriptStep(\r\n",
|
||||
" name = \"Inference Step\",\r\n",
|
||||
" source_directory = \"../scripts\",\r\n",
|
||||
" script_name = \"inference.py\",\r\n",
|
||||
" arguments = [\r\n",
|
||||
" '--input', dataprep_input,\r\n",
|
||||
" '--output', inference_output_dir,\r\n",
|
||||
" '--feat-id', index_feature\r\n",
|
||||
" ],\r\n",
|
||||
" inputs=[dataprep_input],\r\n",
|
||||
" outputs=[inference_output_dir],\r\n",
|
||||
" compute_target = pipeline_cluster,\r\n",
|
||||
" runconfig = pipeline_run_config,\r\n",
|
||||
" allow_reuse = False\r\n",
|
||||
")\r\n",
|
||||
"\r\n",
|
||||
"pipeline_steps.append(inference_step)\r\n"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655246938497
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Next step, explainability\r\n",
|
||||
"\r\n",
|
||||
"interpret_input = inference_output_dir.read_parquet_files().as_input(\"interpret_input\")\r\n",
|
||||
"\r\n",
|
||||
"interpret_output_dir = OutputFileDatasetConfig(\r\n",
|
||||
" name=\"interpret_output\", \r\n",
|
||||
" destination=(\r\n",
|
||||
" ws.datastores.get(output_datastore_name), \r\n",
|
||||
" output_path)\r\n",
|
||||
").as_upload()\r\n",
|
||||
"\r\n",
|
||||
"\r\n",
|
||||
"interpret_step = PythonScriptStep(\r\n",
|
||||
" name = \"Explainability Step\",\r\n",
|
||||
" source_directory = \"../scripts\",\r\n",
|
||||
" script_name = \"interpret.py\",\r\n",
|
||||
" arguments = [\r\n",
|
||||
" '--input', interpret_input,\r\n",
|
||||
" '--dataprep', dataprep_input,\r\n",
|
||||
" '--output', interpret_output_dir,\r\n",
|
||||
" '--index-id', index_feature,\r\n",
|
||||
" '--anomaly-score', anomaly_score\r\n",
|
||||
" ],\r\n",
|
||||
" inputs=[ interpret_input, dataprep_input],\r\n",
|
||||
" outputs=[interpret_output_dir],\r\n",
|
||||
" compute_target = pipeline_cluster,\r\n",
|
||||
" runconfig = pipeline_run_config,\r\n",
|
||||
" allow_reuse = False\r\n",
|
||||
")\r\n",
|
||||
"pipeline_steps.append(interpret_step)"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "error",
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'dataprep_output_outliers' is not defined",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-7-d0664c56f92d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0minterpret_input\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minference_output_dir\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_parquet_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"interpret_input\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mdataprep_input_outliers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdataprep_output_outliers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_parquet_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"dataprep_input_outliers\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m interpret_output_dir = OutputFileDatasetConfig(\n",
|
||||
"\u001b[0;31mNameError\u001b[0m: name 'dataprep_output_outliers' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655246938617
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Construct the pipeline\r\n",
|
||||
"pipeline = Pipeline(workspace=ws, steps=pipeline_steps)\r\n",
|
||||
"print(\"Pipeline is built.\")\r\n",
|
||||
"\r\n",
|
||||
"# Create an experiment and run the pipeline\r\n",
|
||||
"experiment = Experiment(workspace=ws, name = pipeline_name)\r\n",
|
||||
"pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)\r\n",
|
||||
"print(\"Pipeline submitted for execution.\")\r\n"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
},
|
||||
"gather": {
|
||||
"logged": 1655246938658
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"name": "python38-azureml",
|
||||
"language": "python",
|
||||
"display_name": "Python 3.8 - AzureML"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.8.5",
|
||||
"mimetype": "text/x-python",
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"pygments_lexer": "ipython3",
|
||||
"nbconvert_exporter": "python",
|
||||
"file_extension": ".py"
|
||||
},
|
||||
"kernel_info": {
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"microsoft": {
|
||||
"host": {
|
||||
"AzureML": {
|
||||
"notebookHasBeenCompleted": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -1,4 +0,0 @@
|
|||
jupyter
|
||||
pandas
|
||||
scikit-learn
|
||||
matplotlib
|
|
@ -1,27 +0,0 @@
|
|||
import os
|
||||
import numpy as np
|
||||
from azureml.core import Model
|
||||
import joblib
|
||||
#import argparse
|
||||
|
||||
#parser = argparse.ArgumentParser()
|
||||
#parser.add_argument("--id-feature", type=str, dest='id_feature', help='ID Freature')
|
||||
#args = parser.parse_args()
|
||||
#id_feat = str(args.id_feature)
|
||||
#print('id feature', id_feat)
|
||||
|
||||
|
||||
def init():
|
||||
# Runs when the pipeline step is initialized
|
||||
global model
|
||||
|
||||
# load the model
|
||||
model_path = Model.get_model_path('best_iforest.pkl')
|
||||
model = joblib.load(model_path)
|
||||
|
||||
def run(mini_batch):
|
||||
mini_batch.set_index('Van_Stock_Proposal_Detail_Id', inplace=True)
|
||||
index_list = list(mini_batch.index)
|
||||
y_pred = model.predict(mini_batch).tolist()
|
||||
score = model.score_samples(mini_batch).tolist()
|
||||
return(list(zip(index_list, y_pred, score)))
|
|
@ -1,56 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from azureml.core import Run
|
||||
import argparse
|
||||
import numpy as np
|
||||
import iJungle
|
||||
|
||||
run = Run.get_context()
|
||||
print("iJungle version:", iJungle.__version__)
|
||||
run.log('iJungle_version', iJungle.__version__)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
# Input Data
|
||||
parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset')
|
||||
parser.add_argument("--id-feature", type=str, dest='id_feature', help='ID Freature')
|
||||
parser.add_argument("--max-subsample-size", type=int, dest='max_sss', help='Max subsample size')
|
||||
parser.add_argument("--train-size", type=float, dest='train_size', help='Train size')
|
||||
|
||||
# Hyper parameters
|
||||
parser.add_argument('--trees', type=int, dest='trees', default=100, help='Number of trees')
|
||||
parser.add_argument('--subsample-size', type=int, dest='subsample_size', default=8192, help='Subsample size')
|
||||
|
||||
# Add arguments to args collection
|
||||
args = parser.parse_args()
|
||||
id_feat = str(args.id_feature)
|
||||
print('id feature', id_feat)
|
||||
|
||||
# Log Hyperparameter values
|
||||
trees = np.int(args.trees)
|
||||
subsample_size = np.int(args.subsample_size)
|
||||
print('trees', trees)
|
||||
print('subsample_size', subsample_size)
|
||||
run.log('trees', trees)
|
||||
run.log('subsample_size', subsample_size)
|
||||
|
||||
# Other parameters
|
||||
max_sss = np.int(args.max_sss)
|
||||
train_size = np.float(args.train_size)
|
||||
print("Max subsample size", max_sss)
|
||||
print("Train size", train_size)
|
||||
run.log('max_sss', max_sss)
|
||||
run.log('train_size', train_size)
|
||||
|
||||
# Load training data
|
||||
print("Loading Data...")
|
||||
df = run.input_datasets['training_data'].to_pandas_dataframe() # Get the training data from the estimator input
|
||||
df.set_index(id_feat, inplace=True)
|
||||
|
||||
print("Starting training ...")
|
||||
model_filename = iJungle.model_train_fun(df, trees, subsample_size, train_size, max_sss)
|
||||
print(model_filename)
|
||||
|
||||
# Log dummy metric
|
||||
run.log('Dummy', np.float(0))
|
||||
|
||||
run.complete()
|
|
@ -0,0 +1,56 @@
|
|||
from azureml.core import Run, Model
|
||||
import argparse
|
||||
import pandas as pd
|
||||
import os
|
||||
import iJungle
|
||||
import joblib
|
||||
|
||||
# Pipeline step: pick the best iForest among the trained candidates and
# register it in the workspace under the name 'best_iforest.pkl'.
run = Run.get_context()
parser = argparse.ArgumentParser()

# Input Data
parser.add_argument('--overhead-input', type=str, dest='overhead_input', help='Overhead input')
parser.add_argument('--subsample-list', type=str, dest='subsample_list')
parser.add_argument('--trees-list', type=str, dest='trees_list')

# Add arguments to args collection
args = parser.parse_args()
overhead_input = args.overhead_input
print("Overhead input", overhead_input)
# NOTE(review): eval() executes whatever text is passed on the command line.
# The values are expected to be Python list literals built by the pipeline
# definition; consider ast.literal_eval if the arguments can ever come from
# an untrusted source.
subsample_list = eval(args.subsample_list)
print("subsample_list", subsample_list)
trees_list = eval(args.trees_list)
print("trees_list", trees_list)

# Load the evaluation results of every (subsample_size, trees) candidate.
# results_dic[str(subsample_size)][str(trees)] -> object stored by the
# overhead-evaluation step under '<model>_results'.
print("Loading Models...")
results_dic = {}
for subsample_size in subsample_list:
    results_dic[str(subsample_size)] = {}
    for trees in trees_list:
        model_name = 'iJungle_light_' + str(trees) + '_' + str(subsample_size) + '_results'
        print(model_name)
        model_path = Model.get_model_path(model_name)
        print(model_path)
        results_dic[str(subsample_size)][str(trees)] = joblib.load(model_path)

# Columns are subsample sizes, index entries are tree counts.
results = pd.DataFrame(results_dic)

# Calculating best iForest
print("Calculating best iForest ...")
best_subsample_size, best_trees, best_iF_k = iJungle.best_iforest_params(results)

# Build the name from the *selected* hyper-parameters. The loop variables
# `trees` / `subsample_size` only hold the last combination that was
# iterated, which is not necessarily the best one.
model_name = 'iJungle_light_' + str(best_trees) + '_' + str(best_subsample_size)
print("Best iForest model name:", model_name)
model_path = Model.get_model_path(model_name)
print("Loading best iFor_list from ", model_path)
iFor_list = joblib.load(model_path)
# presumably best_iF_k is the position of the winning forest inside the
# stored list — verify against iJungle.best_iforest_params
model = iFor_list[best_iF_k]
print("Model selected!")

print("Registering model...")
best_model_name = 'best_iforest.pkl'
# Make sure the local model directory exists before dumping into it.
os.makedirs(iJungle._MODEL_DIR, exist_ok=True)
best_model_path = os.path.join(iJungle._MODEL_DIR, best_model_name)
joblib.dump(model, best_model_path)
Model.register(workspace=run.experiment.workspace, model_path=best_model_path, model_name=best_model_name)

run.complete()
|
|
@ -0,0 +1,65 @@
|
|||
import os
|
||||
import argparse
|
||||
import pandas as pd
|
||||
from azureml.core import Run, Model
|
||||
from sklearn import preprocessing
|
||||
import joblib
|
||||
import numpy as np
|
||||
|
||||
# Pipeline step: standard-scale every feature column. In training mode a
# scaler per feature is fitted and registered; otherwise the registered
# scalers are loaded. The scaled data is written to '<prepped-data>/prepped.parquet'.
LOCAL_MODEL_PATH = 'outputs'
# Single source of truth for the registered scaler names, so the name used
# when registering (training) and when loading (inference) cannot diverge.
SCALER_MODEL_PREFIX = 'ijungle_scaler_model_'

parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='train_dataset_id')
parser.add_argument('--prepped-data', type=str, dest='prepped_data')
parser.add_argument('--index-feature', type=str, dest='index_feature')
parser.add_argument('--training', type=str, dest='training')

print('Loading parameters ...')
args = parser.parse_args()
save_folder = args.prepped_data
index_feature = args.index_feature
# The flag arrives as text. bool("False") is True, so interpret the string
# explicitly; a missing or empty value means "not training".
training = str(args.training).strip().lower() in ('true', '1', 'yes', 'y')

print('save_folder', save_folder)
print('index_feature', index_feature)
print('training', training)

run = Run.get_context()

print("Loading data ...")
df = run.input_datasets['input'].to_pandas_dataframe()
print(df)
print("Shape of data:", df.shape)

print("Setting index ...")
df.set_index(index_feature, inplace=True)

if training:
    # Scalers are dumped locally before being registered.
    os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)

for feat in df.columns:
    model_name = SCALER_MODEL_PREFIX + feat
    if training:
        print("Training and registering scaler for feature:", feat)
        scaler = preprocessing.StandardScaler()
        scaler.fit(df[[feat]])
        file_name = os.path.join(LOCAL_MODEL_PATH, model_name + '.pkl')
        joblib.dump(value=scaler, filename=file_name)
        Model.register(
            workspace=run.experiment.workspace,
            model_path=file_name,
            model_name=model_name
        )
    else:
        print("Applying scaler for feature:", feat)
        model_path = Model.get_model_path(model_name)
        scaler = joblib.load(model_path)
    # Apply the scaler in both modes so the data used for training and the
    # data used for inference are on the same scale.
    df[feat] = scaler.transform(df[[feat]]).reshape(df.shape[0])

print("Reseting index ...")
df.reset_index(inplace=True)

print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder, 'prepped.parquet')
df.to_parquet(save_path, index=False)

run.complete()
|
|
@ -0,0 +1,51 @@
|
|||
import os
|
||||
import argparse
|
||||
import pandas as pd
|
||||
from azureml.core import Run, Model
|
||||
from sklearn import preprocessing
|
||||
import joblib
|
||||
import numpy as np
|
||||
|
||||
# Pipeline step: score the prepared data with the registered best iForest
# and write '<output>/results.parquet' with one row per record:
# <feat_id>, pred (model.predict) and score (model.score_samples).
parser = argparse.ArgumentParser()
parser.add_argument("--input", type=str, dest='input_')
parser.add_argument("--output", type=str, dest='output_')
parser.add_argument("--feat-id", type=str, dest='feat_id')

run = Run.get_context()

print("Reading parameters ...")
args = parser.parse_args()
output_dir = args.output_
feat_id = args.feat_id
print("Output dir:", output_dir)
print("feat_id:", feat_id)

print("Loading data ...")
# The data is read through the named run input, not through --input.
df = run.input_datasets['inference_data'].to_pandas_dataframe()
df.set_index(feat_id, inplace=True)
print(df)

print("Loading model ...")
model_name = 'best_iforest.pkl'
model_path = Model.get_model_path(model_name)
model = joblib.load(model_path)
print("model", model)

# Diagnostic: show which columns contain missing values before predicting.
print(df.isnull().any())

print("Making predictions ...")
y_pred = model.predict(df)
scores = model.score_samples(df)
# predictions equal to -1 are counted as anomalies
print("Number of anomalies: ", len(y_pred[y_pred==-1]))

print("Generating outputs ...")
df_out = pd.DataFrame({
    feat_id: df.index.values,
    'pred': y_pred,
    'score': scores,
})

# Create the target folder if needed, as the data preparation step does.
os.makedirs(output_dir, exist_ok=True)
save_path = os.path.join(output_dir, 'results.parquet')
df_out.to_parquet(save_path, index=False)

run.complete()
|
|
@ -0,0 +1,108 @@
|
|||
import os
|
||||
import argparse
|
||||
import pandas as pd
|
||||
from azureml.core import Run, Model
|
||||
from sklearn import preprocessing
|
||||
import joblib
|
||||
import numpy as np
|
||||
from interpret.ext.blackbox import TabularExplainer
|
||||
import time
|
||||
|
||||
# Pipeline step: select the records whose score is at or below the anomaly
# threshold, explain each of them with a TabularExplainer and write
# '<output>/anomalies_<timestamp>.csv'.
parser = argparse.ArgumentParser()
parser.add_argument("--input", type=str, dest='input', help='inference dataset')
parser.add_argument("--dataprep", type=str, dest='dataprep', help='dataset')
parser.add_argument('--output', type=str, dest='output_dir', help='Folder for results')
parser.add_argument('--index-id', type=str, dest='index_id')
parser.add_argument('--anomaly-score', type=str, dest='anomaly_score')

run = Run.get_context()

print("Loading arguments ...")
args = parser.parse_args()
output_dir = args.output_dir
index_id = args.index_id
anomaly_score = float(args.anomaly_score)
print("outpur_dir", output_dir)
print("index_id", index_id)
print("anomaly_score", anomaly_score)

print("Loading results ...")
df = run.input_datasets["interpret_input"].to_pandas_dataframe()
print(df)
print("Selecting anomalies ...")
df = df[df['score']<=anomaly_score]
print(df)

if df.shape[0] > 0:
    df_100 = df.sort_values("score", ascending=True).copy()
    print(df_100)

    print("Loading inference data ...")
    df_inf = run.input_datasets["inference_data"].to_pandas_dataframe()
    print(df_inf)

    print("Creating input to interpret ...")
    result_ids = df_100[index_id].values
    df_inf = df_inf.loc[df_inf[index_id].isin(result_ids),:].copy()
    X_explain = df_inf.set_index(index_id)
    print(X_explain)

    print("Loading model ...")
    # Same registered name that the model-selection step writes and the
    # inference step reads.
    model_name = 'best_iforest.pkl'
    model_path = Model.get_model_path(model_name)
    model = joblib.load(model_path)
    print("model", model)

    print("Creating explanations ...")
    tab_explainer = TabularExplainer(model, X_explain)
    print(tab_explainer)

    # Get local explanations
    local_tab_explanation = tab_explainer.explain_local(X_explain)

    # Ranked feature names and importance values
    local_tab_features = local_tab_explanation.get_ranked_local_names()
    local_tab_importance = local_tab_explanation.get_ranked_local_values()

    ls_explanations = []
    for i in range(len(X_explain.index)):
        detail_id = X_explain.index[i]
        # Read the explanation of row i (not always row 0).
        # NOTE(review): assumes the ranked lists are indexed [row][feature]
        # (regression-style layout, model without predict_proba) — confirm
        # against the interpret-community documentation.
        feat3, feat2, feat1 = tuple(local_tab_features[i][-3:])
        score3, score2, score1 = tuple(local_tab_importance[i][-3:])
        ls_explanations.append({
            index_id:detail_id,
            'Interpretation_Feature_1':feat1,
            'Interpretation_Score_1':score1,
            'Interpretation_Feature_2':feat2,
            'Interpretation_Score_2':score2,
            'Interpretation_Feature_3':feat3,
            'Interpretation_Score_3':score3,
        })

    df_explanations = pd.DataFrame(ls_explanations)
    print(df_explanations)
    df_anomalies = df_100.merge(df_explanations, on=index_id, how='left')
    print("Explanations:")
    print(df_anomalies)

    print("Consolidating all anomalies ...")
    df_anomalies = df_anomalies.sort_values(['score','Interpretation_Score_1'], ascending=True)
else:
    # Nothing scored at or below the threshold: emit the empty selection so
    # the step still produces a (header-only) file instead of failing.
    print("No anomalies found for the given threshold.")
    df_anomalies = df.copy()
print(df_anomalies)

# NOTE(review): enriching the anomalies with extra descriptive columns needs
# a dataset (`df_sp`) that this script never loads; wire it in as a pipeline
# input before adding that merge.

print("Saving anomalies ...")
os.makedirs(output_dir, exist_ok=True)
timestr = time.strftime("%Y%m%d%H%M%S")
df_anomalies.to_csv(os.path.join(output_dir,'anomalies_'+timestr+'.csv'), index=False)

run.complete()
|
|
@ -1,29 +1,34 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from azureml.core import Model, Run
|
||||
from azureml.core import Run, Model
|
||||
import argparse
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import iJungle
|
||||
import joblib
|
||||
import os
|
||||
import iJungle
|
||||
import shutil
|
||||
|
||||
run = Run.get_context()
|
||||
|
||||
print("iJungle version:", iJungle.__version__)
|
||||
run.log('iJungle_version', iJungle.__version__)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
# Input Data
|
||||
parser.add_argument("--input-data", type=str, dest='input_data', help='Overhead dataset')
|
||||
parser.add_argument("--id-feature", type=str, dest='id_feature', help='ID Freature')
|
||||
parser.add_argument("--overhead-folder", type=str, dest='overhead_folder', help='overhead data folder')
|
||||
parser.add_argument("--model-input", type=str, dest='model_input', help='model input folder')
|
||||
parser.add_argument("--overhead-output", type=str, dest='overhead_output', help='overhead output folder')
|
||||
parser.add_argument("--id-feat", type=str, dest='id_feat')
|
||||
|
||||
# Hyper parameters
|
||||
parser.add_argument('--trees', type=int, dest='trees', default=100, help='Number of trees')
|
||||
parser.add_argument('--subsample-size', type=int, dest='subsample_size', default=8192, help='Subsample size')
|
||||
|
||||
|
||||
# Add arguments to args collection
|
||||
args = parser.parse_args()
|
||||
id_feat = str(args.id_feature)
|
||||
print('id feature', id_feat)
|
||||
overhead_folder = args.overhead_folder
|
||||
print("Overhead folder", overhead_folder)
|
||||
model_input = args.model_input
|
||||
print("Model input", model_input)
|
||||
overhead_output = args.overhead_output
|
||||
print("Overhead output", overhead_output)
|
||||
|
||||
# Log Hyperparameter values
|
||||
trees = np.int(args.trees)
|
||||
|
@ -33,10 +38,18 @@ print('subsample_size', subsample_size)
|
|||
run.log('trees', trees)
|
||||
run.log('subsample_size', subsample_size)
|
||||
|
||||
# Other parameters
|
||||
id_feat = args.id_feat
|
||||
print("id_feat", id_feat)
|
||||
run.log('id_feat', id_feat)
|
||||
|
||||
# Load training data
|
||||
print("Loading Data...")
|
||||
W = run.input_datasets['overhead_data'].to_pandas_dataframe() # Get the training data from the estimator input
|
||||
load_path = os.path.join(overhead_folder,'W.parquet')
|
||||
W = pd.read_parquet(load_path)
|
||||
W.set_index(id_feat, inplace=True)
|
||||
print("Overhead Data loaded. Shape:", W.shape)
|
||||
|
||||
|
||||
# Load iFor_list pickle
|
||||
print("Loading pickle...")
|
||||
|
@ -54,7 +67,21 @@ results_filename = os.path.join(iJungle._MODEL_DIR, model_name + '_results.pkl')
|
|||
print("Writing results:", results_filename)
|
||||
joblib.dump(value=results, filename=results_filename)
|
||||
|
||||
model_name = 'iJungle_light_' + str(trees) + '_' + str(subsample_size) + '_results'
|
||||
print(model_name)
|
||||
|
||||
print('Registering model...')
|
||||
Model.register(
|
||||
workspace=run.experiment.workspace,
|
||||
model_path = results_filename,
|
||||
model_name = model_name,
|
||||
properties={
|
||||
'trees':trees,
|
||||
'subsample_size':subsample_size})
|
||||
|
||||
# Log dummy metric
|
||||
run.log('Dummy', np.float(0))
|
||||
|
||||
shutil.copy(results_filename, os.path.join(overhead_output, model_name + '.pkl'))
|
||||
|
||||
run.complete()
|
|
@ -0,0 +1,43 @@
|
|||
from azureml.core import Run
|
||||
import argparse
|
||||
import pandas as pd
|
||||
import os
|
||||
import iJungle
|
||||
|
||||
# Pipeline step: draw the overhead sample W from the prepared data and store
# it as '<overhead-data>/W.parquet'.
run = Run.get_context()
parser = argparse.ArgumentParser()

# Input Data
parser.add_argument("--input-data", type=str, dest='prepped_data', help='Prepped data')
parser.add_argument('--overhead-data', type=str, dest='overhead_data', help='Overhead data')
parser.add_argument('--overhead-expected-m', type=str, dest='overhead_expected_m')

# Add arguments to args collection
args = parser.parse_args()
prepped_data = args.prepped_data
print("Prepped folder", prepped_data)
overhead_data = args.overhead_data
print("Model input", overhead_data)
overhead_expected_m = int(args.overhead_expected_m)
print("overhead_expected_m", overhead_expected_m)

# Load training data
print("Loading Data...")
load_path = os.path.join(prepped_data, 'prepped.parquet')
df = pd.read_parquet(load_path)
print("Data loaded. Shape:", df.shape)

# Overhead sample size calculation: the fraction of records needed to reach
# the expected number of overhead rows, capped at the whole dataset.
n_records = df.shape[0]
if n_records == 0:
    # Fail with a clear message instead of a bare ZeroDivisionError.
    raise ValueError("Prepared dataset is empty: " + load_path)
overhead_size = min(1, overhead_expected_m/n_records)
print("Overhead size", overhead_size)
run.log('overhead_size', overhead_size)

W = iJungle.select_overhead_data(df, overhead_size=overhead_size)
print("Overhead shape", W.shape)

print("Saving Data...")
# Create the target folder if needed, as the data preparation step does.
os.makedirs(overhead_data, exist_ok=True)
save_path = os.path.join(overhead_data, 'W.parquet')
W.to_parquet(save_path, index=False)

run.complete()
|
|
@ -0,0 +1,83 @@
|
|||
from azureml.core import Run, Model
|
||||
import argparse
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import joblib
|
||||
import os
|
||||
import iJungle
|
||||
import shutil
|
||||
|
||||
# Pipeline step: train the iJungle forests for one (trees, subsample_size)
# combination, register the result as 'iJungle_light_<trees>_<subsample_size>'
# and copy the pickle into the model output folder.
run = Run.get_context()
parser = argparse.ArgumentParser()

# Input Data
parser.add_argument("--training-folder", type=str, dest='training_folder', help='training data folder')
parser.add_argument("--max-subsample-size", type=int, dest='max_sss', help='Max subsample size')
parser.add_argument("--model-output", type=str, dest='model_output', help='model output folder')
parser.add_argument("--id-feat", type=str, dest='id_feat')
parser.add_argument("--train-expected-m", type=str, dest='train_expected_m')

# Hyper parameters
parser.add_argument('--trees', type=int, dest='trees', default=100, help='Number of trees')
parser.add_argument('--subsample-size', type=int, dest='subsample_size', default=8192, help='Subsample size')

# Add arguments to args collection
args = parser.parse_args()
training_folder = args.training_folder
print("Training folder", training_folder)
model_output = args.model_output
print("Model output", model_output)
id_feat = args.id_feat
print("id_feat", id_feat)
train_expected_m = int(args.train_expected_m)
print("train_expected_m", train_expected_m)

# Log Hyperparameter values
# Built-in int/float are used: the np.int / np.float aliases are deprecated
# and no longer exist in recent NumPy releases.
trees = int(args.trees)
subsample_size = int(args.subsample_size)
print('trees', trees)
print('subsample_size', subsample_size)
run.log('trees', trees)
run.log('subsample_size', subsample_size)

# Other parameters
max_sss = int(args.max_sss)
print("Max subsample size", max_sss)
run.log('max_sss', max_sss)

# Load training data
print("Loading Data...")
load_path = os.path.join(training_folder, 'prepped.parquet')
df = pd.read_parquet(load_path)
df.set_index(id_feat, inplace=True)

# Train sample size calculation: the fraction of records needed to reach the
# expected number of training rows, capped at the whole dataset.
n_records = df.shape[0]
if n_records == 0:
    # Fail with a clear message instead of a bare ZeroDivisionError.
    raise ValueError("Training dataset is empty: " + load_path)
train_size = min(1, train_expected_m/n_records)
print("Train size", train_size)
run.log('train_size', train_size)

print("Starting training ...")
model_filename = iJungle.model_train_fun(df, trees, subsample_size, train_size, max_sss)
print(model_filename)
model_name = 'iJungle_light_' + str(trees) + '_' + str(subsample_size)
print(model_name)

# model_train_fun returns a file name that is resolved against iJungle._MODEL_DIR here.
model_path = os.path.join(iJungle._MODEL_DIR, model_filename)

print('Registering model...')
Model.register(
    workspace=run.experiment.workspace,
    model_path=model_path,
    model_name=model_name,
    properties={
        'trees': trees,
        'subsample_size': subsample_size})

# Log dummy metric
run.log('Dummy', float(0))

# Make the pickle available to the downstream step through the output folder.
os.makedirs(model_output, exist_ok=True)
shutil.copy(model_path, os.path.join(model_output, model_name + '.pkl'))

run.complete()
|
|
@ -1,2 +1,2 @@
|
|||
__version__ = '0.1.73'
|
||||
__version__ = '0.2.0'
|
||||
_MODEL_DIR = 'outputs'
|
||||
|
|
Загрузка…
Ссылка в новой задаче