Merge pull request #1968 from Azure/release_update_stablev2/Release-240
Update samples from Release-240 as part of the 1.57.0 SDK stable release
Commit 3c341f6e9a
@@ -103,7 +103,7 @@
 "source": [
 "import azureml.core\n",
 "\n",
-"print(\"This notebook was created using version 1.56.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.57.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
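Every sample notebook in this release repeats this version stamp, so the 1.56.0 → 1.57.0 bump recurs throughout the diff. A minimal sketch of the check these cells perform; the upgrade command is an assumption about how you manage the SDK, not part of the samples:

```python
# Compare the installed SDK against the release this notebook targets.
import azureml.core

expected = "1.57.0"
print(f"This notebook was created using version {expected} of the Azure ML SDK")
if azureml.core.VERSION != expected:
    # Hypothetical remediation; adjust to your own environment management.
    print('Versions differ; consider: pip install --upgrade "azureml-sdk~=1.57.0"')
```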
@@ -194,7 +194,7 @@
 "categorical_transformer = Pipeline(\n",
 " [\n",
 " (\"impute\", SimpleImputer(strategy=\"most_frequent\")),\n",
-" (\"ohe\", OneHotEncoder(handle_unknown=\"ignore\", sparse=False)),\n",
+" (\"ohe\", OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False)),\n",
 " ]\n",
 ")\n",
 "\n",
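The `sparse` → `sparse_output` rename is a scikit-learn change, not an Azure ML one: the `sparse` keyword of `OneHotEncoder` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4, so the scikit-learn 1.5.1 pin elsewhere in this release only accepts the new spelling. A minimal sketch:

```python
# On scikit-learn >= 1.4 only sparse_output is accepted; sparse= raises TypeError.
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
print(ohe.fit_transform([["red"], ["blue"], ["red"]]))  # dense 0/1 array
```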
@@ -6,7 +6,7 @@ dependencies:
 - fairlearn>=0.6.2,<=0.7.0
 - joblib
 - liac-arff
-- raiwidgets~=0.33.0
+- raiwidgets~=0.36.0
 - itsdangerous==2.0.1
 - markupsafe<2.1.0
 - protobuf==3.20.0
@@ -209,7 +209,7 @@
 "categorical_transformer = Pipeline(\n",
 " [\n",
 " (\"impute\", SimpleImputer(strategy=\"most_frequent\")),\n",
-" (\"ohe\", OneHotEncoder(handle_unknown=\"ignore\", sparse=False)),\n",
+" (\"ohe\", OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False)),\n",
 " ]\n",
 ")\n",
 "\n",
@@ -6,7 +6,7 @@ dependencies:
 - fairlearn>=0.6.2,<=0.7.0
 - joblib
 - liac-arff
-- raiwidgets~=0.33.0
+- raiwidgets~=0.36.0
 - itsdangerous==2.0.1
 - markupsafe<2.1.0
 - protobuf==3.20.0
@@ -14,14 +14,14 @@ dependencies:
 
 - pip:
   # Required packages for AzureML execution, history, and data preparation.
-  - azureml-widgets~=1.56.0
-  - azureml-defaults~=1.56.0
-  - -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.56.0/validated_win32_requirements.txt [--no-deps]
+  - azureml-widgets~=1.57.0
+  - azureml-defaults~=1.57.0
+  - -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.57.0/validated_win32_requirements.txt [--no-deps]
   - matplotlib==3.7.1
   - xgboost==1.5.2
   - prophet==1.1.4
   - pandas==1.3.5
   - cmdstanpy==1.1.0
   - setuptools-git==1.2
-  - spacy==3.4.4
-  - https://aka.ms/automl-resources/packages/en_core_web_sm-3.4.1.tar.gz
+  - spacy==3.7.4
+  - https://aka.ms/automl-resources/packages/en_core_web_sm-3.7.1.tar.gz
@@ -12,7 +12,7 @@ dependencies:
 - numpy>=1.21.6,<=1.23.5
 - urllib3==1.26.7
 - scipy==1.10.1
-- scikit-learn==1.1.3
+- scikit-learn==1.5.1
 - holidays==0.29
 - pytorch::pytorch=1.11.0
 - cudatoolkit=10.1.243
@@ -20,11 +20,11 @@ dependencies:
 
 - pip:
   # Required packages for AzureML execution, history, and data preparation.
-  - azureml-widgets~=1.56.0
-  - azureml-defaults~=1.56.0
+  - azureml-widgets~=1.57.0
+  - azureml-defaults~=1.57.0
   - pytorch-transformers==1.0.0
-  - spacy==3.4.4
+  - spacy==3.7.4
   - xgboost==1.5.2
   - prophet==1.1.4
-  - https://aka.ms/automl-resources/packages/en_core_web_sm-3.4.1.tar.gz
-  - -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.56.0/validated_linux_requirements.txt [--no-deps]
+  - https://aka.ms/automl-resources/packages/en_core_web_sm-3.7.1.tar.gz
+  - -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.57.0/validated_linux_requirements.txt [--no-deps]
@@ -10,17 +10,17 @@ dependencies:
 - python>=3.10,<3.11
 - numpy>=1.21.6,<=1.23.5
 - scipy==1.10.1
-- scikit-learn==1.1.3
+- scikit-learn==1.5.1
 - holidays==0.29
 
 - pip:
   # Required packages for AzureML execution, history, and data preparation.
-  - azureml-widgets~=1.56.0
-  - azureml-defaults~=1.56.0
+  - azureml-widgets~=1.57.0
+  - azureml-defaults~=1.57.0
   - pytorch-transformers==1.0.0
   - prophet==1.1.4
   - xgboost==1.5.2
-  - spacy==3.4.4
+  - spacy==3.7.4
   - matplotlib==3.7.1
-  - https://aka.ms/automl-resources/packages/en_core_web_sm-3.4.1.tar.gz
-  - -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.56.0/validated_darwin_requirements.txt [--no-deps]
+  - https://aka.ms/automl-resources/packages/en_core_web_sm-3.7.1.tar.gz
+  - -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.57.0/validated_darwin_requirements.txt [--no-deps]
@@ -97,7 +97,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.56.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.57.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -97,7 +97,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.56.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.57.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -1,420 +0,0 @@
-{
-"cells": [
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"Copyright (c) Microsoft Corporation. All rights reserved.\n",
-"\n",
-"Licensed under the MIT License."
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/experimental/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.png)"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"# Automated Machine Learning\n",
-"_**Classification of credit card fraudulent transactions on local managed compute **_\n",
-"\n",
-"## Contents\n",
-"1. [Introduction](#Introduction)\n",
-"1. [Setup](#Setup)\n",
-"1. [Train](#Train)\n",
-"1. [Results](#Results)\n",
-"1. [Test](#Test)\n",
-"1. [Acknowledgements](#Acknowledgements)"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## Introduction\n",
-"\n",
-"In this example we use the associated credit card dataset to showcase how you can use AutoML for a simple classification problem. The goal is to predict if a credit card transaction is considered a fraudulent charge.\n",
-"\n",
-"This notebook is using local managed compute to train the model.\n",
-"\n",
-"If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n",
-"\n",
-"In this notebook you will learn how to:\n",
-"1. Create an experiment using an existing workspace.\n",
-"2. Configure AutoML using `AutoMLConfig`.\n",
-"3. Train the model using local managed compute.\n",
-"4. Explore the results.\n",
-"5. Test the fitted model."
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## Setup\n",
-"\n",
-"As part of the setup you have already created an Azure ML `Workspace` object. For Automated ML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"import logging\n",
-"\n",
-"import pandas as pd\n",
-"\n",
-"import azureml.core\n",
-"from azureml.core.compute_target import LocalTarget\n",
-"from azureml.core.experiment import Experiment\n",
-"from azureml.core.workspace import Workspace\n",
-"from azureml.core.dataset import Dataset\n",
-"from azureml.train.automl import AutoMLConfig"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"print(\"This notebook was created using version 1.56.0 of the Azure ML SDK\")\n",
-"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"ws = Workspace.from_config()\n",
-"\n",
-"# choose a name for experiment\n",
-"experiment_name = 'automl-local-managed'\n",
-"\n",
-"experiment=Experiment(ws, experiment_name)\n",
-"\n",
-"output = {}\n",
-"output['Subscription ID'] = ws.subscription_id\n",
-"output['Workspace'] = ws.name\n",
-"output['Resource Group'] = ws.resource_group\n",
-"output['Location'] = ws.location\n",
-"output['Experiment Name'] = experiment.name\n",
-"pd.set_option('display.max_colwidth', None)\n",
-"outputDf = pd.DataFrame(data = output, index = [''])\n",
-"outputDf.T"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"### Determine if local docker is configured for Linux images\n",
-"\n",
-"Local managed runs will leverage a Linux docker container to submit the run to. Due to this, the docker needs to be configured to use Linux containers."
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"# Check if Docker is installed and Linux containers are enabled\n",
-"import subprocess\n",
-"from subprocess import CalledProcessError\n",
-"try:\n",
-" assert subprocess.run(\"docker -v\", shell=True).returncode == 0, 'Local Managed runs require docker to be installed.'\n",
-" out = subprocess.check_output(\"docker system info\", shell=True).decode('ascii')\n",
-" assert \"OSType: linux\" in out, 'Docker engine needs to be configured to use Linux containers.' \\\n",
-" 'https://docs.docker.com/docker-for-windows/#switch-between-windows-and-linux-containers'\n",
-"except CalledProcessError as ex:\n",
-" raise Exception('Local Managed runs require docker to be installed.') from ex"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"# Data"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"### Load Data\n",
-"\n",
-"Load the credit card dataset from a csv file containing both training features and labels. The features are inputs to the model, while the training labels represent the expected output of the model. Next, we'll split the data using random_split and extract the training data for the model."
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\"\n",
-"dataset = Dataset.Tabular.from_delimited_files(data)\n",
-"training_data, validation_data = dataset.random_split(percentage=0.8, seed=223)\n",
-"label_column_name = 'Class'"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## Train\n",
-"\n",
-"Instantiate an AutoMLConfig object. This defines the settings and data used to run the experiment.\n",
-"\n",
-"|Property|Description|\n",
-"|-|-|\n",
-"|**task**|classification or regression|\n",
-"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
-"|**enable_early_stopping**|Stop the run if the metric score is not showing improvement.|\n",
-"|**n_cross_validations**|Number of cross validation splits.|\n",
-"|**training_data**|Input dataset, containing both features and label column.|\n",
-"|**label_column_name**|The name of the label column.|\n",
-"|**enable_local_managed**|Enable the experimental local-managed scenario.|\n",
-"\n",
-"**_You can find more information about primary metrics_** [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#primary-metric)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"automl_settings = {\n",
-" \"n_cross_validations\": 3,\n",
-" \"primary_metric\": 'average_precision_score_weighted',\n",
-" \"enable_early_stopping\": True,\n",
-" \"experiment_timeout_hours\": 0.3, #for real scenarios we recommend a timeout of at least one hour \n",
-" \"verbosity\": logging.INFO,\n",
-"}\n",
-"\n",
-"automl_config = AutoMLConfig(task = 'classification',\n",
-" debug_log = 'automl_errors.log',\n",
-" compute_target = LocalTarget(),\n",
-" enable_local_managed = True,\n",
-" training_data = training_data,\n",
-" label_column_name = label_column_name,\n",
-" **automl_settings\n",
-" )"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"Call the `submit` method on the experiment object and pass the run configuration. Depending on the data and the number of iterations this can run for a while. Validation errors and current status will be shown when setting `show_output=True` and the execution will be synchronous."
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"parent_run = experiment.submit(automl_config, show_output = True)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"# If you need to retrieve a run that already started, use the following code\n",
-"#from azureml.train.automl.run import AutoMLRun\n",
-"#parent_run = AutoMLRun(experiment = experiment, run_id = '<replace with your run id>')"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"parent_run"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## Results"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"#### Explain model\n",
-"\n",
-"Automated ML models can be explained and visualized using the SDK Explainability library. "
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## Analyze results\n",
-"\n",
-"### Retrieve the Best Child Run\n",
-"\n",
-"Below we select the best pipeline from our iterations. The `get_best_child` method returns the best run. Overloads on `get_best_child` allow you to retrieve the best run for *any* logged metric."
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"best_run = parent_run.get_best_child()\n"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## Test the fitted model\n",
-"\n",
-"Now that the model is trained, split the data in the same way the data was split for training (The difference here is the data is being split locally) and then run the test data through the trained model to get the predicted values."
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"X_test_df = validation_data.drop_columns(columns=[label_column_name])\n",
-"y_test_df = validation_data.keep_columns(columns=[label_column_name], validate=True)"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"#### Creating ModelProxy for submitting prediction runs to the training environment.\n",
-"We will create a ModelProxy for the best child run, which will allow us to submit a run that does the prediction in the training environment. Unlike the local client, which can have different versions of some libraries, the training environment will have all the compatible libraries for the model already."
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"from azureml.train.automl.model_proxy import ModelProxy\n",
-"best_model_proxy = ModelProxy(best_run)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"# call the predict functions on the model proxy\n",
-"y_pred = best_model_proxy.predict(X_test_df).to_pandas_dataframe()\n",
-"y_pred"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## Acknowledgements"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"This Credit Card fraud Detection dataset is made available under the Open Database License: http://opendatacommons.org/licenses/odbl/1.0/. Any rights in individual contents of the database are licensed under the Database Contents License: http://opendatacommons.org/licenses/dbcl/1.0/ and is available at: https://www.kaggle.com/mlg-ulb/creditcardfraud\n",
-"\n",
-"\n",
-"The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Université Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on https://www.researchgate.net and the page of the DefeatFraud project\n",
-"Please cite the following works: \n",
-"•\tAndrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015\n",
-"•\tDal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon\n",
-"•\tDal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE\n",
-"•\tDal Pozzolo, Andrea Adaptive Machine learning for credit card fraud detection ULB MLG PhD thesis (supervised by G. Bontempi)\n",
-"•\tCarcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-Aël; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier\n",
-"•\tCarcillo, Fabrizio; Le Borgne, Yann-Aël; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing"
-]
-}
-],
-"metadata": {
-"authors": [
-{
-"name": "sekrupa"
-}
-],
-"category": "tutorial",
-"compute": [
-"AML Compute"
-],
-"datasets": [
-"Creditcard"
-],
-"deployment": [
-"None"
-],
-"exclude_from_index": false,
-"file_extension": ".py",
-"framework": [
-"None"
-],
-"friendly_name": "Classification of credit card fraudulent transactions using Automated ML",
-"index_order": 5,
-"kernelspec": {
-"display_name": "Python 3.8 - AzureML",
-"language": "python",
-"name": "python38-azureml"
-},
-"language_info": {
-"codemirror_mode": {
-"name": "ipython",
-"version": 3
-},
-"file_extension": ".py",
-"mimetype": "text/x-python",
-"name": "python",
-"nbconvert_exporter": "python",
-"pygments_lexer": "ipython3",
-"version": "3.6.7"
-},
-"mimetype": "text/x-python",
-"name": "python",
-"nbconvert_exporter": "python",
-"pygments_lexer": "ipython3",
-"tags": [
-"AutomatedML"
-],
-"task": "Classification",
-"version": "3.6.7"
-},
-"nbformat": 4,
-"nbformat_minor": 2
-}
@@ -91,7 +91,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.56.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.57.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -366,7 +366,7 @@
 "USE_CURATED_ENV = True\n",
 "if USE_CURATED_ENV:\n",
 " curated_environment = Environment.get(\n",
-" workspace=ws, name=\"AzureML-sklearn-0.24-ubuntu18.04-py37-cpu\"\n",
+" workspace=ws, name=\"AzureML-sklearn-1.5\"\n",
 " )\n",
 " aml_run_config.environment = curated_environment\n",
 "else:\n",
@@ -53,7 +53,7 @@
 "\n",
 "We will showcase one of the tabular data explainers: TabularExplainer (SHAP).\n",
 "\n",
-"Problem: Boston Housing Price Prediction with scikit-learn (train a model and run an explainer remotely via AMLCompute, and download and visualize the remotely-calculated explanations.)\n",
+"Problem: Housing Price Prediction with scikit-learn (train a model and run an explainer remotely via AMLCompute, and download and visualize the remotely-calculated explanations.)\n",
 "\n",
 "| ![explanations-run-history](./img/explanations-run-history.png) |\n",
 "|:--:|\n"
@@ -429,8 +429,8 @@
 "outputs": [],
 "source": [
 "# Retrieve x_test for visualization\n",
-"x_test_path = './x_test_boston_housing.pkl'\n",
-"run.download_file('x_test_boston_housing.pkl', output_file_path=x_test_path)"
+"x_test_path = './x_test_california_housing.pkl'\n",
+"run.download_file('x_test_california_housing.pkl', output_file_path=x_test_path)"
 ]
 },
 {
@@ -439,7 +439,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"x_test = joblib.load('x_test_boston_housing.pkl')"
+"x_test = joblib.load('x_test_california_housing.pkl')"
 ]
 },
 {
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft. All rights reserved.
 # Licensed under the MIT license.
 
-from sklearn import datasets
+from sklearn.datasets import fetch_california_housing
 from sklearn.linear_model import Ridge
 from interpret.ext.blackbox import TabularExplainer
 from azureml.interpret import ExplanationClient
@@ -14,20 +14,20 @@ import numpy as np
 OUTPUT_DIR = './outputs/'
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 
-boston_data = datasets.load_boston()
+california_data = fetch_california_housing()
 
 run = Run.get_context()
 client = ExplanationClient.from_run(run)
 
-X_train, X_test, y_train, y_test = train_test_split(boston_data.data,
-                                                    boston_data.target,
+X_train, X_test, y_train, y_test = train_test_split(california_data.data,
+                                                    california_data.target,
                                                     test_size=0.2,
                                                     random_state=0)
 # write x_test out as a pickle file for later visualization
 x_test_pkl = 'x_test.pkl'
 with open(x_test_pkl, 'wb') as file:
     joblib.dump(value=X_test, filename=os.path.join(OUTPUT_DIR, x_test_pkl))
-run.upload_file('x_test_boston_housing.pkl', os.path.join(OUTPUT_DIR, x_test_pkl))
+run.upload_file('x_test_california_housing.pkl', os.path.join(OUTPUT_DIR, x_test_pkl))
 
 
 alpha = 0.5
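The Boston → California swap is likewise forced by scikit-learn: `load_boston` was deprecated in 1.0 and removed in 1.2, with `fetch_california_housing` as the suggested replacement, so the 1.5.x pin in this release leaves no choice. A minimal sketch of the pattern the script now uses (the fetch downloads the data on first call and caches it):

```python
# fetch_california_housing returns a Bunch with .data, .target and
# .feature_names, so it slots into the old load_boston call sites.
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(
    housing.data, housing.target, test_size=0.2, random_state=0)
print(housing.feature_names)
```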
@@ -50,7 +50,7 @@ original_model = run.register_model(model_name='model_explain_model_on_amlcomp',
                                     model_path='original_model.pkl')
 
 # Explain predictions on your local machine
-tabular_explainer = TabularExplainer(model, X_train, features=boston_data.feature_names)
+tabular_explainer = TabularExplainer(model, X_train, features=california_data.feature_names)
 
 # Explain overall model predictions (global explanation)
 # Passing in test dataset for evaluation examples - note it must be a representative sample of the original data
@@ -60,5 +60,5 @@ global_explanation = tabular_explainer.explain_global(X_test)
 
 # Uploading model explanation data for storage or visualization in webUX
 # The explanation can then be downloaded on any compute
-comment = 'Global explanation on regression model trained on boston dataset'
+comment = 'Global explanation on regression model trained on california dataset'
 client.upload_model_explanation(global_explanation, comment=comment, model_id=original_model.id)
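As the comment above notes, the uploaded explanation can be retrieved later on any compute. A hedged sketch using the same `ExplanationClient` API (the `run` handle here is assumed to be the completed remote run, not part of the sample):

```python
# Pull back the global explanation that train_explain.py uploaded.
from azureml.interpret import ExplanationClient

client = ExplanationClient.from_run(run)  # `run` is the completed remote run
explanation = client.download_model_explanation()
print(explanation.get_feature_importance_dict())  # feature -> importance
```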
@@ -125,29 +125,29 @@
 },
 "outputs": [],
 "source": [
-"from azureml.exceptions import UserErrorException\n",
-"\n",
-"blob_datastore_name='MyBlobDatastore'\n",
-"account_name=os.getenv(\"BLOB_ACCOUNTNAME_62\", \"<my-account-name>\") # Storage account name\n",
-"container_name=os.getenv(\"BLOB_CONTAINER_62\", \"<my-container-name>\") # Name of Azure blob container\n",
-"account_key=os.getenv(\"BLOB_ACCOUNT_KEY_62\", \"<my-account-key>\") # Storage account key\n",
-"\n",
-"try:\n",
-" blob_datastore = Datastore.get(ws, blob_datastore_name)\n",
-" print(\"Found Blob Datastore with name: %s\" % blob_datastore_name)\n",
-"except UserErrorException:\n",
-" blob_datastore = Datastore.register_azure_blob_container(\n",
-" workspace=ws,\n",
-" datastore_name=blob_datastore_name,\n",
-" account_name=account_name, # Storage account name\n",
-" container_name=container_name, # Name of Azure blob container\n",
-" account_key=account_key) # Storage account key\n",
-" print(\"Registered blob datastore with name: %s\" % blob_datastore_name)\n",
-"\n",
-"blob_data_ref = DataReference(\n",
-" datastore=blob_datastore,\n",
-" data_reference_name=\"blob_test_data\",\n",
-" path_on_datastore=\"testdata\")"
+"# from azureml.exceptions import UserErrorException\n",
+"#\n",
+"# blob_datastore_name='MyBlobDatastore'\n",
+"# account_name=os.getenv(\"BLOB_ACCOUNTNAME_62\", \"<my-account-name>\") # Storage account name\n",
+"# container_name=os.getenv(\"BLOB_CONTAINER_62\", \"<my-container-name>\") # Name of Azure blob container\n",
+"# account_key=os.getenv(\"BLOB_ACCOUNT_KEY_62\", \"<my-account-key>\") # Storage account key\n",
+"#\n",
+"# try:\n",
+"# blob_datastore = Datastore.get(ws, blob_datastore_name)\n",
+"# print(\"Found Blob Datastore with name: %s\" % blob_datastore_name)\n",
+"# except UserErrorException:\n",
+"# blob_datastore = Datastore.register_azure_blob_container(\n",
+"# workspace=ws,\n",
+"# datastore_name=blob_datastore_name,\n",
+"# account_name=account_name, # Storage account name\n",
+"# container_name=container_name, # Name of Azure blob container\n",
+"# account_key=account_key) # Storage account key\n",
+"# print(\"Registered blob datastore with name: %s\" % blob_datastore_name)\n",
+"#\n",
+"# blob_data_ref = DataReference(\n",
+"# datastore=blob_datastore,\n",
+"# data_reference_name=\"blob_test_data\",\n",
+"# path_on_datastore=\"testdata\")"
 ]
 },
 {
@@ -341,24 +341,24 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"data_factory_name = 'adftest'\n",
-"\n",
-"def get_or_create_data_factory(workspace, factory_name):\n",
-" try:\n",
-" return DataFactoryCompute(workspace, factory_name)\n",
-" except ComputeTargetException as e:\n",
-" if 'ComputeTargetNotFound' in e.message:\n",
-" print('Data factory not found, creating...')\n",
-" provisioning_config = DataFactoryCompute.provisioning_configuration()\n",
-" data_factory = ComputeTarget.create(workspace, factory_name, provisioning_config)\n",
-" data_factory.wait_for_completion()\n",
-" return data_factory\n",
-" else:\n",
-" raise e\n",
-" \n",
-"data_factory_compute = get_or_create_data_factory(ws, data_factory_name)\n",
-"\n",
-"print(\"Setup Azure Data Factory account complete\")"
+"# data_factory_name = 'adftest'\n",
+"#\n",
+"# def get_or_create_data_factory(workspace, factory_name):\n",
+"# try:\n",
+"# return DataFactoryCompute(workspace, factory_name)\n",
+"# except ComputeTargetException as e:\n",
+"# if 'ComputeTargetNotFound' in e.message:\n",
+"# print('Data factory not found, creating...')\n",
+"# provisioning_config = DataFactoryCompute.provisioning_configuration()\n",
+"# data_factory = ComputeTarget.create(workspace, factory_name, provisioning_config)\n",
+"# data_factory.wait_for_completion()\n",
+"# return data_factory\n",
+"# else:\n",
+"# raise e\n",
+"#\n",
+"# data_factory_compute = get_or_create_data_factory(ws, data_factory_name)\n",
+"#\n",
+"# print(\"Setup Azure Data Factory account complete\")"
 ]
 },
 {
@@ -392,19 +392,21 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# TODO: 3012801 - Use ADLS Gen2 datastore.\n",
-"blob_data_ref2 = DataReference(\n",
-" datastore=blob_datastore,\n",
-" data_reference_name=\"blob_test_data2\",\n",
-" path_on_datastore=\"testdata2\")\n",
-"\n",
-"transfer_adls_to_blob = DataTransferStep(\n",
-" name=\"transfer_adls_to_blob\",\n",
-" source_data_reference=blob_data_ref,\n",
-" destination_data_reference=blob_data_ref2,\n",
-" compute_target=data_factory_compute)\n",
-"\n",
-"print(\"Data transfer step created\")"
+"# # TODO: 3012801 - Use ADLS Gen2 datastore.\n",
+"# blob_data_ref2 = DataReference(\n",
+"# datastore=blob_datastore,\n",
+"# data_reference_name=\"blob_test_data2\",\n",
+"# path_on_datastore=\"testdata2\")\n",
+"#\n",
+"# transfer_adls_to_blob = DataTransferStep(\n",
+"# name=\"transfer_adls_to_blob\",\n",
+"# source_data_reference=blob_data_ref,\n",
+"# destination_data_reference=blob_data_ref2,\n",
+"# compute_target=data_factory_compute,\n",
+"# source_reference_type='file',\n",
+"# destination_reference_type=\"file\")\n",
+"#\n",
+"# print(\"Data transfer step created\")"
 ]
 },
 {
@@ -455,13 +457,13 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"pipeline_01 = Pipeline(\n",
-" description=\"data_transfer_01\",\n",
-" workspace=ws,\n",
-" steps=[transfer_adls_to_blob])\n",
-"\n",
-"pipeline_run_01 = Experiment(ws, \"Data_Transfer_example_01\").submit(pipeline_01)\n",
-"pipeline_run_01.wait_for_completion()"
+"# pipeline_01 = Pipeline(\n",
+"# description=\"data_transfer_01\",\n",
+"# workspace=ws,\n",
+"# steps=[transfer_adls_to_blob])\n",
+"#\n",
+"# pipeline_run_01 = Experiment(ws, \"Data_Transfer_example_01\").submit(pipeline_01)\n",
+"# pipeline_run_01.wait_for_completion()"
 ]
 },
 {
@@ -492,8 +494,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from azureml.widgets import RunDetails\n",
-"RunDetails(pipeline_run_01).show()"
+"# from azureml.widgets import RunDetails\n",
+"# RunDetails(pipeline_run_01).show()"
 ]
 },
 {
@@ -292,7 +292,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"tf_env = Environment.get(ws, name='AzureML-tensorflow-2.12-cuda11')"
+"tf_env = Environment.get(ws, name='AzureML-tensorflow-2.16-cuda11')"
 ]
 },
 {
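Curated environment names churn between releases (AzureML-tensorflow-2.12-cuda11 becomes 2.16 here), so it can help to enumerate what the workspace actually offers before pinning a name. A minimal sketch:

```python
# Environment.list returns a name -> Environment mapping for the workspace;
# curated environments are the ones whose names start with "AzureML-".
from azureml.core import Environment

curated = sorted(n for n in Environment.list(workspace=ws) if n.startswith("AzureML-"))
print(curated[:10])
```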
@@ -178,7 +178,7 @@ os.makedirs('./outputs/model', exist_ok=True)
 
 # files saved in the "./outputs" folder are automatically uploaded into run history
 # this is workaround for https://github.com/tensorflow/tensorflow/issues/33913 and will be fixed once we move to >tf2.1
-neural_net._set_inputs(X_train)
+# neural_net._set_inputs(X_train)
 tf.saved_model.save(neural_net, './outputs/model/')
 
 stop_time = time.perf_counter()
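`Model._set_inputs` is a private Keras API that later TensorFlow 2.x releases dropped, which is presumably why the workaround is now commented out; once the model has a known input shape, `tf.saved_model.save` can trace it without that call. A hedged sketch under that assumption (the layer sizes are illustrative):

```python
# Give the model an input signature up front instead of the removed
# private _set_inputs call, then export it as a SavedModel.
import tensorflow as tf

neural_net = tf.keras.Sequential([tf.keras.layers.Dense(1)])
neural_net.build(input_shape=(None, 10))  # fixes the input spec before saving
tf.saved_model.save(neural_net, './outputs/model/')
```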
@@ -322,7 +322,7 @@
 "source": [
 "from azureml.core import Environment\n",
 "\n",
-"sklearn_env = Environment.get(ws, name='azureml-sklearn-1.0')"
+"sklearn_env = Environment.get(ws, name='azureml-sklearn-1.5')"
 ]
 },
 {
@@ -33,8 +33,6 @@ Using these samples, you will learn how to do the following.
 
 | File/folder | Description |
 |-------------------|--------------------------------------------|
-| [cartpole_ci.ipynb](cartpole-on-compute-instance/cartpole_ci.ipynb) | Notebook to train a Cartpole playing agent on an Azure Machine Learning Compute Instance |
 | [cartpole_sc.ipynb](cartpole-on-single-compute/cartpole_sc.ipynb) | Notebook to train a Cartpole playing agent on an Azure Machine Learning Compute Cluster (single node) |
-| [pong_rllib.ipynb](atari-on-distributed-compute/pong_rllib.ipynb) | Notebook for distributed training of Pong agent using RLlib on multiple compute targets |
 
 ## Prerequisites
@ -1,768 +0,0 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.png)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Reinforcement Learning in Azure Machine Learning - Cartpole Problem on Compute Instance\n",
|
||||
"\n",
|
||||
"Reinforcement Learning in Azure Machine Learning is a managed service for running reinforcement learning training and simulation. With Reinforcement Learning in Azure Machine Learning, data scientists can start developing reinforcement learning systems on one machine, and scale to compute targets with 100s of nodes if needed.\n",
|
||||
"\n",
|
||||
"This example shows how to use Reinforcement Learning in Azure Machine Learning to train a Cartpole playing agent on a compute instance."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Cartpole problem\n",
|
||||
"\n",
|
||||
"Cartpole, also known as [Inverted Pendulum](https://en.wikipedia.org/wiki/Inverted_pendulum), is a pendulum with a center of mass above its pivot point. This formation is essentially unstable and will easily fall over but can be kept balanced by applying appropriate horizontal forces to the pivot point.\n",
|
||||
"\n",
|
||||
"<table style=\"width:50%\">\n",
|
||||
" <tr>\n",
|
||||
" <th>\n",
|
||||
" <img src=\"./images/cartpole.png\" alt=\"Cartpole image\" /> \n",
|
||||
" </th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th><p>Fig 1. Cartpole problem schematic description (from <a href=\"https://towardsdatascience.com/cartpole-introduction-to-reinforcement-learning-ed0eb5b58288\">towardsdatascience.com</a>).</p></th>\n",
|
||||
" </tr>\n",
|
||||
"</table>\n",
|
||||
"\n",
|
||||
"The goal here is to train an agent to keep the cartpole balanced by applying appropriate forces to the pivot point.\n",
|
||||
"\n",
|
||||
"See [this video](https://www.youtube.com/watch?v=XiigTGKZfks) for a real-world demonstration of cartpole problem."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Prerequisite\n",
|
||||
"The user should have completed the Azure Machine Learning Tutorial: [Get started creating your first ML experiment with the Python SDK](https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-1st-experiment-sdk-setup). You will need to make sure that you have a valid subscription ID, a resource group, and an Azure Machine Learning workspace. All datastores and datasets you use should be associated with your workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Set up Development Environment\n",
|
||||
"The following subsections show typical steps to setup your development environment. Setup includes:\n",
|
||||
"\n",
|
||||
"* Connecting to a workspace to enable communication between your local machine and remote resources\n",
|
||||
"* Creating an experiment to track all your runs\n",
|
||||
"* Using a Compute Instance as compute target"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Azure Machine Learning SDK \n",
|
||||
"Display the Azure Machine Learning SDK version."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062935076
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"print(\"Azure Machine Learning SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Get Azure Machine Learning workspace\n",
|
||||
"Get a reference to an existing Azure Machine Learning workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062936280
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print(ws.name, ws.location, ws.resource_group, sep = ' | ')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Use Compute Instance as compute target\n",
|
||||
"\n",
|
||||
"A compute target is a designated compute resource where you run your training and simulation scripts. This location may be your local machine or a cloud-based compute resource. For more information see [What are compute targets in Azure Machine Learning?](https://docs.microsoft.com/en-us/azure/machine-learning/concept-compute-target)\n",
|
||||
"\n",
|
||||
"The code below shows how to use current compute instance as a compute target. First some helper functions:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062936485
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os.path\n",
|
||||
"\n",
|
||||
"# Get information about the currently running compute instance (notebook VM), like its name and prefix.\n",
|
||||
"def load_nbvm():\n",
|
||||
" if not os.path.isfile(\"/mnt/azmnt/.nbvm\"):\n",
|
||||
" return None\n",
|
||||
" with open(\"/mnt/azmnt/.nbvm\", 'r') as nbvm_file:\n",
|
||||
" return { key:value for (key, value) in [ line.strip().split('=') for line in nbvm_file if '=' in line ] }\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Then we use these helper functions to get a handle to current compute instance."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062937126
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeInstance\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"\n",
|
||||
"import random\n",
|
||||
"import string\n",
|
||||
"\n",
|
||||
"# Load current compute instance info\n",
|
||||
"current_compute_instance = load_nbvm()\n",
|
||||
"\n",
|
||||
"# For this demo, let's use the current compute instance as the compute target, if available\n",
|
||||
"if current_compute_instance:\n",
|
||||
" print(\"Current compute instance:\", current_compute_instance)\n",
|
||||
" instance_name = current_compute_instance['instance']\n",
|
||||
"else:\n",
|
||||
" # Compute instance name needs to be unique across all existing compute instances within an Azure region\n",
|
||||
" instance_name = \"cartpole-ci-\" + \"\".join(random.choice(string.ascii_lowercase) for _ in range(5))\n",
|
||||
" try:\n",
|
||||
" instance = ComputeInstance(workspace=ws, name=instance_name)\n",
|
||||
" print('Found existing instance, use it.')\n",
|
||||
" except ComputeTargetException:\n",
|
||||
" print(\"Creating new compute instance...\")\n",
|
||||
" compute_config = ComputeInstance.provisioning_configuration(\n",
|
||||
" vm_size='STANDARD_D2_V2'\n",
|
||||
" )\n",
|
||||
" instance = ComputeInstance.create(ws, instance_name, compute_config)\n",
|
||||
" instance.wait_for_completion(show_output=True)\n",
|
||||
" print(\"Instance name:\", instance_name)\n",
|
||||
"\n",
|
||||
"compute_target = ws.compute_targets[instance_name]\n",
|
||||
"\n",
|
||||
"print(\"Compute target status:\")\n",
|
||||
"print(compute_target.get_status().serialize())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create Azure Machine Learning experiment\n",
|
||||
"Create an experiment to track the runs in your workspace. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062937499
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.experiment import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = 'CartPole-v1-CI'\n",
|
||||
"experiment = Experiment(workspace=ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064044718
|
||||
},
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
"source_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"import os\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"ray_environment_name = 'cartpole-ray-ci'\n",
|
||||
"ray_environment_dockerfile_path = os.path.join(os.getcwd(), 'files', 'docker', 'Dockerfile')\n",
|
||||
"\n",
|
||||
"# Build environment image\n",
|
||||
"ray_environment = Environment. \\\n",
|
||||
" from_dockerfile(name=ray_environment_name, dockerfile=ray_environment_dockerfile_path). \\\n",
|
||||
" register(workspace=ws)\n",
|
||||
"ray_env_build_details = ray_environment.build(workspace=ws)\n",
|
||||
"\n",
|
||||
"ray_env_build_details.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train Cartpole Agent\n",
|
||||
"In this section, we show how to use Azure Machine Learning jobs and Ray/RLlib framework to train a cartpole playing agent. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create reinforcement learning training run\n",
|
||||
"\n",
|
||||
"The code below submits the training run using a `ScriptRunConfig`. By providing the\n",
|
||||
"command to run the training, and a `RunConfig` object configured with your\n",
|
||||
"compute target, number of nodes, and environment image to use."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064046594
|
||||
},
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
"source_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"from azureml.core import RunConfiguration, ScriptRunConfig, Experiment\n",
|
||||
"from azureml.core.runconfig import DockerConfiguration, RunConfiguration\n",
|
||||
"\n",
|
||||
"config_name = 'cartpole-ppo.yaml'\n",
|
||||
"script_name = 'cartpole_training.py'\n",
|
||||
"script_arguments = [\n",
|
||||
" '--config', config_name\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"aml_run_config_ml = RunConfiguration(communicator='OpenMpi')\n",
|
||||
"aml_run_config_ml.target = compute_target\n",
|
||||
"aml_run_config_ml.node_count = 1\n",
|
||||
"aml_run_config_ml.environment = ray_environment\n",
|
||||
"\n",
|
||||
"training_config = ScriptRunConfig(source_directory='./files',\n",
|
||||
" script=script_name,\n",
|
||||
" arguments=script_arguments,\n",
|
||||
" run_config = aml_run_config_ml\n",
|
||||
" )\n",
|
||||
"training_run = experiment.submit(training_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Training configuration\n",
|
||||
"\n",
|
||||
"This is the training configuration (in yaml) that we use to train an agent to solve the CartPole problem using\n",
|
||||
"the PPO algorithm.\n",
|
||||
"\n",
|
||||
"```yaml\n",
|
||||
"cartpole-ppo:\n",
|
||||
" env: CartPole-v1\n",
|
||||
" run: PPO\n",
|
||||
" stop:\n",
|
||||
" episode_reward_mean: 475\n",
|
||||
" time_total_s: 300\n",
|
||||
" checkpoint_config:\n",
|
||||
" checkpoint_frequency: 2\n",
|
||||
" checkpoint_at_end: true\n",
|
||||
" config:\n",
|
||||
" # Works for both torch and tf.\n",
|
||||
" framework: torch\n",
|
||||
" gamma: 0.99\n",
|
||||
" lr: 0.0003\n",
|
||||
" num_workers: 1\n",
|
||||
" observation_filter: MeanStdFilter\n",
|
||||
" num_sgd_iter: 6\n",
|
||||
" vf_loss_coeff: 0.01\n",
|
||||
" model:\n",
|
||||
" fcnet_hiddens: [32]\n",
|
||||
" fcnet_activation: linear\n",
|
||||
" vf_share_layers: true\n",
|
||||
" enable_connectors: true\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Monitor experiment\n",
|
||||
"Azure Machine Learning provides a Jupyter widget to show the status of an experiment run. You could use this widget to monitor the status of the runs.\n",
|
||||
"\n",
|
||||
"You can click on the link under **Status** to see the details of a child run. It will also show the metrics being logged."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064049813
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"RunDetails(training_run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Stop the run\n",
|
||||
"\n",
|
||||
"To stop the run, call `training_run.cancel()`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064050024
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Uncomment line below to cancel the run\n",
|
||||
"# training_run.cancel()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Wait for completion\n",
|
||||
"Wait for the run to complete before proceeding.\n",
|
||||
"\n",
|
||||
"**Note: The run may take a few minutes to complete.**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064304728
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"training_run.wait_for_completion()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluate Trained Agent and See Results\n",
|
||||
"\n",
|
||||
"We can evaluate a previously trained policy using the `cartpole_rollout.py` helper script provided by RLlib (see [Evaluating Trained Policies](https://ray.readthedocs.io/en/latest/rllib-training.html#evaluating-trained-policies) for more details). Here we use an adaptation of this script to reconstruct a policy from a checkpoint taken and saved during training. We took these checkpoints by setting `checkpoint-freq` and `checkpoint-at-end` parameters above.\n",
|
||||
"\n",
|
||||
"In this section we show how to get access to these checkpoints data, and then how to use them to evaluate the trained policy."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create a dataset of training artifacts\n",
|
||||
"To evaluate a trained policy (a checkpoint) we need to make the checkpoint accessible to the rollout script.\n",
|
||||
"We can use the Run API to download policy training artifacts (saved model and checkpoints) to local compute."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064305251
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from os import path\n",
|
||||
"from distutils import dir_util\n",
|
||||
"\n",
|
||||
"training_artifacts_path = path.join(\"logs\", \"cartpole-ppo\")\n",
|
||||
"print(\"Training artifacts path:\", training_artifacts_path)\n",
|
||||
"\n",
|
||||
"if path.exists(training_artifacts_path):\n",
|
||||
" dir_util.remove_tree(training_artifacts_path)\n",
|
||||
"\n",
|
||||
"# Download run artifacts to local compute\n",
|
||||
"training_run.download_files(training_artifacts_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now let's find the checkpoints and the last checkpoint number."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064305283
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A helper function to find all of the checkpoint directories located within a larger directory tree\n",
|
||||
"def find_checkpoints(file_path):\n",
|
||||
" print(\"Looking in path:\", file_path)\n",
|
||||
" checkpoints = []\n",
|
||||
" for root, dirs, files in os.walk(file_path):\n",
|
||||
" trimmed_root = root[len(file_path)+1:]\n",
|
||||
" for name in dirs:\n",
|
||||
" if name.startswith('checkpoint_'):\n",
|
||||
" checkpoints.append(path.join(trimmed_root, name))\n",
|
||||
" return checkpoints"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064305305
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Find checkpoints and last checkpoint number\n",
|
||||
"checkpoint_files = find_checkpoints(training_artifacts_path)\n",
|
||||
"\n",
|
||||
"last_checkpoint_path = None\n",
|
||||
"last_checkpoint_number = -1\n",
|
||||
"for checkpoint_file in checkpoint_files:\n",
|
||||
" checkpoint_number = int(os.path.basename(checkpoint_file).split('_')[1])\n",
|
||||
" if checkpoint_number > last_checkpoint_number:\n",
|
||||
" last_checkpoint_path = checkpoint_file\n",
|
||||
" last_checkpoint_number = checkpoint_number\n",
|
||||
"\n",
|
||||
"print(\"Last checkpoint number:\", last_checkpoint_number)\n",
|
||||
"print(\"Last checkpoint path:\", last_checkpoint_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we upload checkpoints to default datastore and create a file dataset. This dataset will be used to pass in the checkpoints to the rollout script."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064305331
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Upload the checkpoint files and create a DataSet\n",
|
||||
"from azureml.data.dataset_factory import FileDatasetFactory\n",
|
||||
"\n",
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"checkpoint_ds = FileDatasetFactory.upload_directory(training_artifacts_path, (datastore, 'cartpole_checkpoints_' + training_run.id), overwrite=False, show_progress=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To verify, we can print out the number (and paths) of all the files in the dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064305353
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"artifacts_paths = checkpoint_ds.to_path()\n",
|
||||
"print(\"Number of files in dataset:\", len(artifacts_paths))\n",
|
||||
"\n",
|
||||
"# Uncomment line below to print all file paths\n",
|
||||
"#print(\"Artifacts dataset file paths: \", artifacts_paths)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluate Trained Agent and See Results\n",
|
||||
"\n",
|
||||
"We can evaluate a previously trained policy using the `cartpole_rollout.py` helper script provided by RLlib (see [Evaluating Trained Policies](https://ray.readthedocs.io/en/latest/rllib-training.html#evaluating-trained-policies) for more details). Here we use an adaptation of this script to reconstruct a policy from a checkpoint taken and saved during training. We took these checkpoints by setting `checkpoint-freq` and `checkpoint-at-end` parameters above.\n",
|
||||
"In this section we show how to use these checkpoints to evaluate the trained policy."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064305371
|
||||
},
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
"source_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ray_environment_name = 'cartpole-ray-ci'\n",
|
||||
"\n",
|
||||
"experiment_name = 'CartPole-v1-CI'\n",
|
||||
"\n",
|
||||
"experiment = Experiment(workspace=ws, name=experiment_name)\n",
|
||||
"ray_environment = Environment.get(workspace=ws, name=ray_environment_name)\n",
|
||||
"\n",
|
||||
"script_name = 'cartpole_rollout.py'\n",
|
||||
"script_arguments = [\n",
|
||||
" '--steps', '2000',\n",
|
||||
" '--checkpoint', last_checkpoint_path,\n",
|
||||
" '--algo', 'PPO',\n",
|
||||
" '--render', 'false',\n",
|
||||
" '--dataset_path', checkpoint_ds.as_named_input('dataset_path').as_mount()\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"aml_run_config_ml = RunConfiguration(communicator='OpenMpi')\n",
|
||||
"aml_run_config_ml.target = compute_target\n",
|
||||
"aml_run_config_ml.node_count = 1\n",
|
||||
"aml_run_config_ml.environment = ray_environment\n",
|
||||
"aml_run_config_ml.data\n",
|
||||
"\n",
|
||||
"rollout_config = ScriptRunConfig(\n",
|
||||
" source_directory='./files',\n",
|
||||
" script=script_name,\n",
|
||||
" arguments=script_arguments,\n",
|
||||
" run_config = aml_run_config_ml\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
"rollout_run = experiment.submit(rollout_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"And then, similar to the training section, we can monitor the real-time progress of the rollout run and its chid as follows. If you browse logs of the child run you can see the evaluation results recorded in std_log_process_0.txt file. Note that you may need to wait several minutes before these results become available."
|
||||
]
|
||||
},
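To fetch those evaluation logs programmatically once the rollout run has child runs, a minimal sketch using only the azureml-core `Run` API (the exact log file name may vary by setup):

```python
import os

# A sketch: list the rollout run's child runs and download their evaluation logs.
# Assumes `rollout_run` from the submission cell above.
os.makedirs(os.path.join('logs', 'rollout_eval'), exist_ok=True)
for child_run in rollout_run.get_children():
    for file_name in child_run.get_file_names():
        if file_name.endswith('std_log_process_0.txt'):
            local_path = os.path.join('logs', 'rollout_eval', os.path.basename(file_name))
            child_run.download_file(file_name, output_file_path=local_path)
            print('Downloaded', file_name, 'to', local_path)
```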
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064305399
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"RunDetails(rollout_run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wait for completion of the rollout run, or you may cancel the run."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064305419
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Uncomment line below to cancel the run\n",
|
||||
"#rollout_run.cancel()\n",
|
||||
"rollout_run.wait_for_completion()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Cleaning up\n",
|
||||
"For your convenience, below you can find code snippets to clean up any resources created as part of this tutorial that you don't wish to retain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064305437
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# To archive the created experiment:\n",
|
||||
"#exp.archive()\n",
|
||||
"\n",
|
||||
"# To delete created compute instance\n",
|
||||
"if not current_compute_instance:\n",
|
||||
" compute_target.delete()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Next\n",
|
||||
"This example was about running Reinforcement Learning in Azure Machine Learning (Ray/RLlib Framework) on a compute instance. Please see [Cartpole Problem on Single Compute](../cartpole-on-single-compute/cartpole_sc.ipynb)\n",
|
||||
"example which uses Ray RLlib to train a Cartpole playing agent on a single node remote compute.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "adrosa"
|
||||
},
|
||||
{
|
||||
"name": "hoazari"
|
||||
}
|
||||
],
|
||||
"categories": [
|
||||
"how-to-use-azureml",
|
||||
"reinforcement-learning"
|
||||
],
|
||||
"kernel_info": {
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
},
|
||||
"microsoft": {
|
||||
"host": {
|
||||
"AzureML": {
|
||||
"notebookHasBeenCompleted": true
|
||||
}
|
||||
},
|
||||
"ms_spell_check": {
|
||||
"ms_spell_check_language": "en"
|
||||
}
|
||||
},
|
||||
"notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License.",
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "00c28698cbad9eaca051e9759b1181630e646922505b47b4c6352eb5aa72ddfc"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
|
@ -1,23 +0,0 @@
|
|||
cartpole-ppo:
  env: CartPole-v1
  run: PPO
  stop:
    episode_reward_mean: 475
    time_total_s: 300
  checkpoint_config:
    checkpoint_frequency: 2
    checkpoint_at_end: true
  config:
    # Works for both torch and tf.
    framework: torch
    gamma: 0.99
    lr: 0.0003
    num_workers: 1
    observation_filter: MeanStdFilter
    num_sgd_iter: 6
    vf_loss_coeff: 0.01
    model:
      fcnet_hiddens: [32]
      fcnet_activation: linear
      vf_share_layers: true
    enable_connectors: true
|
|
@ -1,108 +0,0 @@
|
|||
import os
import sys
import argparse

from ray.rllib.evaluate import RolloutSaver, rollout
from ray_on_aml.core import Ray_On_AML
import ray.cloudpickle as cloudpickle
from ray.tune.utils import merge_dicts
from ray.tune.registry import get_trainable_cls, _global_registry, ENV_CREATOR

from azureml.core import Run
from utils import callbacks

import collections
import copy
import gymnasium as gym
import json
from pathlib import Path


def run_rollout(checkpoint, algo, render, steps, episodes):
    config_dir = os.path.dirname(checkpoint)
    config_path = os.path.join(config_dir, "params.pkl")
    config = None

    # Try parent directory.
    if not os.path.exists(config_path):
        config_path = os.path.join(config_dir, "../params.pkl")

    # Load the config from pickled.
    if os.path.exists(config_path):
        with open(config_path, "rb") as f:
            config = cloudpickle.load(f)
    # If no pkl file found, require command line `--config`.
    else:
        raise ValueError("Could not find params.pkl in either the checkpoint dir or its parent directory")

    # Make sure worker 0 has an Env.
    config["create_env_on_driver"] = True

    # Merge with `evaluation_config` (first try from command line, then from
    # pkl file).
    evaluation_config = copy.deepcopy(config.get("evaluation_config", {}))
    config = merge_dicts(config, evaluation_config)
    env = config.get("env")

    # Make sure we have evaluation workers.
    if not config.get("evaluation_num_workers"):
        config["evaluation_num_workers"] = config.get("num_workers", 0)
    if not config.get("evaluation_duration"):
        config["evaluation_duration"] = 1

    # Hard-override this as it raises a warning by Algorithm otherwise.
    # Makes no sense anyways, to have it set to None as we don't call
    # `Algorithm.train()` here.
    config["evaluation_interval"] = 1

    # Rendering settings.
    config["render_env"] = render

    # Create the Algorithm from config.
    cls = get_trainable_cls(algo)
    algorithm = cls(env=env, config=config)

    # Load state from checkpoint, if provided.
    if checkpoint:
        algorithm.restore(checkpoint)

    # Do the actual rollout.
    with RolloutSaver(
        outfile=None,
        use_shelve=False,
        write_update_file=False,
        target_steps=steps,
        target_episodes=episodes,
        save_info=False,
    ) as saver:
        rollout(algorithm, env, steps, episodes, saver, not render)
    algorithm.stop()


if __name__ == "__main__":
    # Start ray head (single node)
    ray_on_aml = Ray_On_AML()
    ray = ray_on_aml.getRay()
    if ray:
        parser = argparse.ArgumentParser()
        parser.add_argument('--dataset_path', required=True, help='Path to artifacts dataset')
        parser.add_argument('--checkpoint', required=True, help='Name of checkpoint file directory')
        parser.add_argument('--algo', required=True, help='Name of RL algorithm')
        # Parse --render as a boolean, so passing the string 'false' disables rendering
        parser.add_argument('--render', default=False, required=False, type=lambda s: str(s).lower() == 'true', help='True to render')
        parser.add_argument('--steps', required=False, type=int, help='Number of steps to run')
        parser.add_argument('--episodes', required=False, type=int, help='Number of episodes to run')
        args = parser.parse_args()

        # Get a handle to run
        run = Run.get_context()

        # Get handles to the training artifacts dataset and mount path
        dataset_path = run.input_datasets['dataset_path']

        # Find checkpoint file to be evaluated
        checkpoint = os.path.join(dataset_path, args.checkpoint)
        print('Checkpoint:', checkpoint)

        # Start rollout
        ray.init(address='auto')
        run_rollout(checkpoint, args.algo, args.render, args.steps, args.episodes)
|
|
@ -1,34 +0,0 @@
|
|||
from ray_on_aml.core import Ray_On_AML
import yaml
from ray.tune.tune import run_experiments
from utils import callbacks
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', help='Path to yaml configuration file')
    args = parser.parse_args()

    ray_on_aml = Ray_On_AML()
    ray = ray_on_aml.getRay()
    if ray:  # in the headnode
        ray.init(address="auto")
        print("Configuring run from file: ", args.config)
        experiment_config = None
        with open(args.config, "r") as file:
            experiment_config = yaml.safe_load(file)

        # Set storage_path in each experiment configuration to ensure generated logs get picked up
        # Also set monitor to ensure videos are captured
        for experiment_name, experiment in experiment_config.items():
            experiment["storage_path"] = "./logs"
            experiment['config']['monitor'] = True
        print(f'Config: {experiment_config}')

        trials = run_experiments(
            experiment_config,
            callbacks=[callbacks.TrialCallback()],
            verbose=2
        )
    else:
        print("in worker node")
|
|
@ -1,27 +0,0 @@
|
|||
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
|
||||
|
||||
RUN pip install ray-on-aml==0.2.4 \
|
||||
ray==2.4.0 \
|
||||
ray[rllib]==2.4.0 \
|
||||
mlflow==2.3.1 \
|
||||
azureml-defaults==1.50.0 \
|
||||
azureml-dataset-runtime[fuse,pandas]==1.50.0 \
|
||||
azureml-contrib-reinforcementlearning==1.50.0 \
|
||||
gputil==1.4.0 \
|
||||
scipy==1.9.1 \
|
||||
pyglet==2.0.6 \
|
||||
cloudpickle==2.2.1 \
|
||||
tensorflow==2.11.0 \
|
||||
tensorflow-probability==0.19.0 \
|
||||
torch \
|
||||
tabulate==0.9.0 \
|
||||
dm_tree==0.1.8 \
|
||||
lz4==4.3.2 \
|
||||
psutil==5.9.4 \
|
||||
setproctitle==1.3.2 \
|
||||
pygame==2.1.0 \
|
||||
gymnasium[classic_control]==0.26.3 \
|
||||
gym[classic_control]==0.26.2
|
||||
|
||||
# Display the exact versions we have installed
|
||||
RUN pip freeze
|
|
@ -1,22 +0,0 @@
|
|||
'''RLlib callbacks module:
Common callback methods to be passed to RLlib trainer.
'''

from azureml.core import Run
from ray import tune
from ray.tune import Callback
from ray.air import session


class TrialCallback(Callback):

    def on_trial_result(self, iteration, trials, trial, result, **info):
        '''Callback on train result to record metrics returned by trainer.
        '''
        run = Run.get_context()
        run.log(
            name='episode_reward_mean',
            value=result["episode_reward_mean"])
        run.log(
            name='episodes_total',
            value=result["episodes_total"])
|
|
@ -1,13 +0,0 @@
|
|||
'''Misc module:
Miscellaneous helper functions and utilities.
'''

import os
import glob


# Helper function to find a file or folder path
def find_path(name, path_prefix):
    for root, _, _ in os.walk(path_prefix):
        if glob.glob(os.path.join(root, name)):
            return root
|
Binary file not shown.
Before Width: | Height: | Size: 1.3 KiB |
|
@ -1,917 +0,0 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/reinforcement-learning/cartpole_on_single_compute/cartpole_sc.png)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Reinforcement Learning in Azure Machine Learning - Cartpole Problem on Single Compute\n",
|
||||
"\n",
|
||||
"Reinforcement Learning in Azure Machine Learning is a managed service for running reinforcement learning training and simulation. With Reinforcement Learning in Azure Machine Learning, data scientists can start developing reinforcement learning systems on one machine, and scale to compute targets with 100s of nodes if needed.\n",
|
||||
"\n",
|
||||
"This example shows how to use Reinforcement Learning in Azure Machine Learning to train a Cartpole playing agent on a single compute. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Cartpole problem\n",
|
||||
"\n",
|
||||
"Cartpole, also known as [Inverted Pendulum](https://en.wikipedia.org/wiki/Inverted_pendulum), is a pendulum with a center of mass above its pivot point. This formation is essentially unstable and will easily fall over but can be kept balanced by applying appropriate horizontal forces to the pivot point.\n",
|
||||
"\n",
|
||||
"<table style=\"width:50%\">\n",
|
||||
" <tr>\n",
|
||||
" <th>\n",
|
||||
" <img src=\"./images/cartpole.png\" alt=\"Cartpole image\" /> \n",
|
||||
" </th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th><p>Fig 1. Cartpole problem schematic description (from <a href=\"https://towardsdatascience.com/cartpole-introduction-to-reinforcement-learning-ed0eb5b58288\">towardsdatascience.com</a>).</p></th>\n",
|
||||
" </tr>\n",
|
||||
"</table>\n",
|
||||
"\n",
|
||||
"The goal here is to train an agent to keep the cartpole balanced by applying appropriate forces to the pivot point.\n",
|
||||
"\n",
|
||||
"See [this video](https://www.youtube.com/watch?v=XiigTGKZfks) for a real-world demonstration of cartpole problem."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Prerequisite\n",
|
||||
"The user should have completed the Azure Machine Learning Tutorial: [Get started creating your first ML experiment with the Python SDK](https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-1st-experiment-sdk-setup). You will need to make sure that you have a valid subscription ID, a resource group, and an Azure Machine Learning workspace. All datastores and datasets you use should be associated with your workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Set up Development Environment\n",
|
||||
"The following subsections show typical steps to setup your development environment. Setup includes:\n",
|
||||
"\n",
|
||||
"* Connecting to a workspace to enable communication between your local machine and remote resources\n",
|
||||
"* Creating an experiment to track all your runs\n",
|
||||
"* Creating a remote compute target to use for training"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Azure Machine Learning SDK \n",
|
||||
"Display the Azure Machine Learning SDK version."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683056824182
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"Azure Machine Learning SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Get Azure Machine Learning workspace\n",
|
||||
"Get a reference to an existing Azure Machine Learning workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683056825821
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print(ws.name, ws.location, ws.resource_group, sep = ' | ')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create a new compute resource or attach an existing one\n",
|
||||
"\n",
|
||||
"A compute target is a designated compute resource where you run your training and simulation scripts. This location may be your local machine or a cloud-based compute resource. The code below shows how to create a cloud-based compute target. For more information see [What are compute targets in Azure Machine Learning?](https://docs.microsoft.com/en-us/azure/machine-learning/concept-compute-target)\n",
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
|
||||
"\n",
|
||||
"**Note: Creation of a compute resource can take several minutes**. Please make sure to change `STANDARD_D2_V2` to a [size available in your region](https://azure.microsoft.com/en-us/global-infrastructure/services/?products=virtual-machines)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683056826903
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# Choose a name and maximum size for your cluster\n",
|
||||
"compute_name = \"cpu-cluster-d2\"\n",
|
||||
"compute_min_nodes = 0\n",
|
||||
"compute_max_nodes = 4\n",
|
||||
"vm_size = \"STANDARD_D2_V2\"\n",
|
||||
"\n",
|
||||
"if compute_name in ws.compute_targets:\n",
|
||||
" print(\"Found an existing compute target of name: \" + compute_name)\n",
|
||||
" compute_target = ws.compute_targets[compute_name]\n",
|
||||
" # Note: you may want to make sure compute_target is of type AmlCompute \n",
|
||||
"else:\n",
|
||||
" print(\"Creating new compute target...\")\n",
|
||||
" provisioning_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=vm_size,\n",
|
||||
" min_nodes=compute_min_nodes, \n",
|
||||
" max_nodes=compute_max_nodes)\n",
|
||||
" \n",
|
||||
" # Create the cluster\n",
|
||||
" compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)\n",
|
||||
" compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
|
||||
"\n",
|
||||
"print(compute_target.get_status().serialize())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create Azure Machine Learning experiment\n",
|
||||
"Create an experiment to track the runs in your workspace. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683056827252
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.experiment import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = 'CartPole-v1-SC'\n",
|
||||
"experiment = Experiment(workspace=ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646417962898
|
||||
},
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
"source_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"ray_environment_name = 'cartpole-ray-sc'\n",
|
||||
"ray_environment_dockerfile_path = os.path.join(os.getcwd(), 'files', 'docker', 'Dockerfile')\n",
|
||||
"\n",
|
||||
"# Build environment image\n",
|
||||
"ray_environment = Environment. \\\n",
|
||||
" from_dockerfile(name=ray_environment_name, dockerfile=ray_environment_dockerfile_path). \\\n",
|
||||
" register(workspace=ws)\n",
|
||||
"ray_env_build_details = ray_environment.build(workspace=ws)\n",
|
||||
"\n",
|
||||
"ray_env_build_details.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
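Since the notebook later retrieves this environment by name, note that if the image has already been built and registered in a previous session, the rebuild can be skipped; a minimal sketch using the same names as above:

```python
from azureml.core import Environment

# Reuse the previously registered Ray environment instead of rebuilding the image.
ray_environment = Environment.get(workspace=ws, name=ray_environment_name)
```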
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train Cartpole Agent\n",
|
||||
"In this section, we show how to use Azure Machine Learning jobs and Ray/RLlib framework to train a cartpole playing agent. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create reinforcement learning training run\n",
|
||||
"\n",
|
||||
"The code below submits the training run using a `ScriptRunConfig`. By providing the\n",
|
||||
"command to run the training, and a `RunConfig` object configured with your\n",
|
||||
"compute target, number of nodes, and environment image to use."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683059658819
|
||||
},
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
"source_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"from azureml.core import RunConfiguration, ScriptRunConfig, Experiment\n",
|
||||
"from azureml.core.runconfig import DockerConfiguration, RunConfiguration\n",
|
||||
"\n",
|
||||
"config_name = 'cartpole-ppo.yaml'\n",
|
||||
"script_name = 'cartpole_training.py'\n",
|
||||
"video_capture = True\n",
|
||||
"script_arguments = [\n",
|
||||
" '--config', config_name\n",
|
||||
"]\n",
|
||||
"command=[\"python\", script_name, *script_arguments]\n",
|
||||
"\n",
|
||||
"aml_run_config_ml = RunConfiguration(communicator='OpenMpi')\n",
|
||||
"aml_run_config_ml.target = compute_target\n",
|
||||
"aml_run_config_ml.node_count = 1\n",
|
||||
"aml_run_config_ml.environment = ray_environment\n",
|
||||
"\n",
|
||||
"if video_capture:\n",
|
||||
" command = [\"xvfb-run -s '-screen 0 640x480x16 -ac +extension GLX +render' \"] + command\n",
|
||||
" aml_run_config_ml.environment_variables[\"SDL_VIDEODRIVER\"] = \"dummy\"\n",
|
||||
"\n",
|
||||
"training_config = ScriptRunConfig(source_directory='./files',\n",
|
||||
" command=command,\n",
|
||||
" run_config = aml_run_config_ml\n",
|
||||
" )\n",
|
||||
"training_run = experiment.submit(training_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Training configuration\n",
|
||||
"\n",
|
||||
"This is the training configuration (in yaml) that we use to train an agent to solve the CartPole problem using\n",
|
||||
"the PPO algorithm.\n",
|
||||
"\n",
|
||||
"```yaml\n",
|
||||
"cartpole-ppo:\n",
|
||||
" env: CartPole-v1\n",
|
||||
" run: PPO\n",
|
||||
" stop:\n",
|
||||
" episode_reward_mean: 475\n",
|
||||
" time_total_s: 300\n",
|
||||
" checkpoint_config:\n",
|
||||
" checkpoint_frequency: 2\n",
|
||||
" checkpoint_at_end: true\n",
|
||||
" config:\n",
|
||||
" # Works for both torch and tf.\n",
|
||||
" framework: torch\n",
|
||||
" gamma: 0.99\n",
|
||||
" lr: 0.0003\n",
|
||||
" num_workers: 1\n",
|
||||
" observation_filter: MeanStdFilter\n",
|
||||
" num_sgd_iter: 6\n",
|
||||
" vf_loss_coeff: 0.01\n",
|
||||
" model:\n",
|
||||
" fcnet_hiddens: [32]\n",
|
||||
" fcnet_activation: linear\n",
|
||||
" vf_share_layers: true\n",
|
||||
" enable_connectors: true\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Monitor experiment\n",
|
||||
"\n",
|
||||
"Azure Machine Learning provides a Jupyter widget to show the status of an experiment run. You could use this widget to monitor the status of the runs."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683060289002
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"RunDetails(training_run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Stop the run\n",
|
||||
"To stop the run, call `training_run.cancel()`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Uncomment line below to cancel the run\n",
|
||||
"# training_run.cancel()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Wait for completion\n",
|
||||
"Wait for the run to complete before proceeding.\n",
|
||||
"\n",
|
||||
"**Note: The length of the run depends on the provisioning time of the compute target and it may take several minutes to complete.**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683060297005
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"training_run.wait_for_completion()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Get access to training artifacts\n",
|
||||
"We can simply use run id to get a handle to an in-progress or a previously concluded run."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683060517858
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Run\n",
|
||||
"\n",
|
||||
"run_id = training_run.id # Or set to run id of a completed run (e.g. 'rl-cartpole-v0_1587572312_06e04ace_head')\n",
|
||||
"run = Run(experiment, run_id=run_id)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we can use the Run API to download policy training artifacts (saved model and checkpoints) to local compute."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683060521847
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from os import path\n",
|
||||
"from distutils import dir_util\n",
|
||||
"\n",
|
||||
"training_artifacts_path = path.join(\"logs\", \"cartpole-ppo\")\n",
|
||||
"print(\"Training artifacts path:\", training_artifacts_path)\n",
|
||||
"\n",
|
||||
"if path.exists(training_artifacts_path):\n",
|
||||
" dir_util.remove_tree(training_artifacts_path)\n",
|
||||
"\n",
|
||||
"# Download run artifacts to local compute\n",
|
||||
"training_run.download_files(training_artifacts_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Display movies of selected training episodes\n",
|
||||
"\n",
|
||||
"Ray creates video output of selected training episodes in mp4 format. Here we will display two of these, i.e. the first and the last recorded videos, so you could see the improvement of the agent after training.\n",
|
||||
"\n",
|
||||
"First we introduce a few helper functions: a function to download the movies from our dataset, another one to find mp4 movies in a local directory, and one more to display a downloaded movie."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683060867182
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import shutil\n",
|
||||
"\n",
|
||||
"# A helper function to find movies in a directory\n",
|
||||
"def find_movies(movie_path):\n",
|
||||
" print(\"Looking in path:\", movie_path)\n",
|
||||
" mp4_movies = []\n",
|
||||
" for root, _, files in os.walk(movie_path):\n",
|
||||
" for name in files:\n",
|
||||
" if name.endswith('.mp4'):\n",
|
||||
" mp4_movies.append(path.join(root, name))\n",
|
||||
" print('Found {} movies'.format(len(mp4_movies)))\n",
|
||||
"\n",
|
||||
" return mp4_movies\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# A helper function to display a movie\n",
|
||||
"from IPython.core.display import Video\n",
|
||||
"from IPython.display import display\n",
|
||||
"def display_movie(movie_file):\n",
|
||||
" display(Video(movie_file, embed=True, html_attributes='controls'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Look for the downloaded movies in the local directory and sort them."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683060871682
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mp4_files = find_movies(training_artifacts_path)\n",
|
||||
"mp4_files.sort()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Display a movie of the first training episode. This is how the agent performs with no training."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683060900828
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"first_movie = mp4_files[0] if len(mp4_files) > 0 else None\n",
|
||||
"print(\"First movie:\", first_movie)\n",
|
||||
"\n",
|
||||
"if first_movie:\n",
|
||||
" display_movie(first_movie)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Display a movie of the last training episode. This is how a fully-trained agent performs."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683060914790
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"last_movie = mp4_files[-1] if len(mp4_files) > 0 else None\n",
|
||||
"print(\"Last movie:\", last_movie)\n",
|
||||
"\n",
|
||||
"if last_movie:\n",
|
||||
" display_movie(last_movie)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluate Trained Agent and See Results\n",
|
||||
"\n",
|
||||
"We can evaluate a previously trained policy using the `rollout.py` helper script provided by RLlib (see [Evaluating Trained Policies](https://ray.readthedocs.io/en/latest/rllib-training.html#evaluating-trained-policies) for more details). Here we use an adaptation of this script to reconstruct a policy from a checkpoint taken and saved during training. We took these checkpoints by setting `checkpoint-freq` and `checkpoint-at-end` parameters above.\n",
|
||||
"In this section we show how to use these checkpoints to evaluate the trained policy."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Evaluate a trained policy\n",
|
||||
"In this section, we submit another job, to evalute a trained policy. The entrypoint for this job is\n",
|
||||
"`cartpole-rollout.py` script, and we we pass the checkpoints dataset to this script as a dataset refrence.\n",
|
||||
"\n",
|
||||
"We are using script parameters to pass in the same algorithm and the same environment used during training. We also specify the checkpoint number of the checkpoint we wish to evaluate, `checkpoint-number`, and number of the steps we shall run the rollout, `steps`.\n",
|
||||
"\n",
|
||||
"The training artifacts dataset will be accessible to the rollout script as a mounted folder. The mounted folder and the checkpoint number, passed in via `checkpoint-number`, will be used to create a path to the checkpoint we are going to evaluate. The created checkpoint path then will be passed into RLlib rollout script for evaluation.\n",
|
||||
"\n",
|
||||
"Let's find the checkpoints and the last checkpoint number first."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683061167899
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A helper function to find all of the checkpoint directories located within a larger directory tree\n",
|
||||
"def find_checkpoints(file_path):\n",
|
||||
" print(\"Looking in path:\", file_path)\n",
|
||||
" checkpoints = []\n",
|
||||
" for root, dirs, files in os.walk(file_path):\n",
|
||||
" trimmed_root = root[len(file_path)+1:]\n",
|
||||
" for name in dirs:\n",
|
||||
" if name.startswith('checkpoint_'):\n",
|
||||
" checkpoints.append(path.join(trimmed_root, name))\n",
|
||||
" return checkpoints"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683061170184
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Find checkpoints and last checkpoint number\n",
|
||||
"checkpoint_files = find_checkpoints(training_artifacts_path)\n",
|
||||
"\n",
|
||||
"last_checkpoint_path = None\n",
|
||||
"last_checkpoint_number = -1\n",
|
||||
"for checkpoint_file in checkpoint_files:\n",
|
||||
" checkpoint_number = int(os.path.basename(checkpoint_file).split('_')[1])\n",
|
||||
" if checkpoint_number > last_checkpoint_number:\n",
|
||||
" last_checkpoint_path = checkpoint_file\n",
|
||||
" last_checkpoint_number = checkpoint_number\n",
|
||||
"\n",
|
||||
"print(\"Last checkpoint number:\", last_checkpoint_number)\n",
|
||||
"print(\"Last checkpoint path:\", last_checkpoint_path)"
|
||||
]
|
||||
},
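For illustration, inside the rollout script the mounted dataset folder and the relative checkpoint path found above are combined with a plain path join; a hypothetical example (the mount point is assigned by AzureML at run time, so the value shown here is made up):

```python
import os

# Hypothetical mount point and a relative checkpoint path like the ones found above.
dataset_path = '/mnt/azureml/inputs/dataset_path'            # assigned at run time (example)
checkpoint = 'PPO/PPO_CartPole-v1_abc123/checkpoint_000010'  # relative path from find_checkpoints
full_checkpoint_path = os.path.join(dataset_path, checkpoint)
print(full_checkpoint_path)
```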
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683061176740
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Upload the checkpoint files and create a DataSet\n",
|
||||
"from azureml.data.dataset_factory import FileDatasetFactory\n",
|
||||
"\n",
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"checkpoint_ds = FileDatasetFactory.upload_directory(training_artifacts_path, (datastore, 'cartpole_checkpoints_' + training_run.id), overwrite=False, show_progress=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can submit the training run using a `ScriptRunConfig`. By providing the\n",
|
||||
"command to run the training, and a `RunConfig` object configured w"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062377151
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ray_environment_name = 'cartpole-ray-sc'\n",
|
||||
"\n",
|
||||
"experiment_name = 'CartPole-v1-SC'\n",
|
||||
"training_algorithm = 'PPO'\n",
|
||||
"rl_environment = 'CartPole-v1'\n",
|
||||
"\n",
|
||||
"experiment = Experiment(workspace=ws, name=experiment_name)\n",
|
||||
"ray_environment = Environment.get(workspace=ws, name=ray_environment_name)\n",
|
||||
"\n",
|
||||
"script_name = 'cartpole_rollout.py'\n",
|
||||
"script_arguments = [\n",
|
||||
" '--steps', '2000',\n",
|
||||
" '--checkpoint', last_checkpoint_path,\n",
|
||||
" '--algo', 'PPO',\n",
|
||||
" '--render', 'true',\n",
|
||||
" '--dataset_path', checkpoint_ds.as_named_input('dataset_path').as_mount()\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"aml_run_config_ml = RunConfiguration(communicator='OpenMpi')\n",
|
||||
"aml_run_config_ml.target = compute_target\n",
|
||||
"aml_run_config_ml.node_count = 1\n",
|
||||
"aml_run_config_ml.environment = ray_environment\n",
|
||||
"aml_run_config_ml.data\n",
|
||||
"\n",
|
||||
"rollout_config = ScriptRunConfig(\n",
|
||||
" source_directory='./files',\n",
|
||||
" script=script_name,\n",
|
||||
" arguments=script_arguments,\n",
|
||||
" run_config = aml_run_config_ml\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
"rollout_run = experiment.submit(rollout_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"And then, similar to the training section, we can monitor the real-time progress of the rollout run and its chid as follows. If you browse logs of the child run you can see the evaluation results recorded in driver_log.txt file. Note that you may need to wait several minutes before these results become available."
|
||||
]
|
||||
},
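As in the compute-instance notebook, those logs can also be fetched programmatically; a minimal sketch using only the azureml-core `Run` API (the exact log file name may vary by setup):

```python
# A sketch: print the tail of each child run's driver log once it is available.
# Assumes `rollout_run` from the submission cell above.
for child_run in rollout_run.get_children():
    for file_name in child_run.get_file_names():
        if file_name.endswith('driver_log.txt'):
            child_run.download_file(file_name, output_file_path='driver_log.txt')
            with open('driver_log.txt') as f:
                print(''.join(f.readlines()[-20:]))  # last 20 lines
```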
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062379999
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"RunDetails(rollout_run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wait for completion of the rollout run before moving to the next section, or you may cancel the run."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062451723
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Uncomment line below to cancel the run\n",
|
||||
"#rollout_run.cancel()\n",
|
||||
"rollout_run.wait_for_completion()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Display movies of selected rollout episodes\n",
|
||||
"\n",
|
||||
"To display recorded movies first we download recorded videos to local machine. Here again we create a dataset of rollout artifacts and use the helper functions introduced above to download and displays rollout videos."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062747822
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download rollout artifacts\n",
|
||||
"rollout_artifacts_path = path.join(\"logs\", \"rollout\")\n",
|
||||
"print(\"Rollout artifacts path:\", rollout_artifacts_path)\n",
|
||||
"\n",
|
||||
"if path.exists(rollout_artifacts_path):\n",
|
||||
" dir_util.remove_tree(rollout_artifacts_path)\n",
|
||||
"\n",
|
||||
"# Download videos to local compute\n",
|
||||
"rollout_run.download_files(\"logs/video\", output_directory = rollout_artifacts_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now, similar to the training section, we look for the last video."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062752847
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Look for the downloaded movie in local directory\n",
|
||||
"mp4_files = find_movies(rollout_artifacts_path)\n",
|
||||
"mp4_files.sort()\n",
|
||||
"last_movie = mp4_files[-1] if len(mp4_files) > 1 else None\n",
|
||||
"print(\"Last movie:\", last_movie)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Display last video recorded during the rollout."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062763275
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"last_movie = mp4_files[-1] if len(mp4_files) > 0 else None\n",
|
||||
"print(\"Last movie:\", last_movie)\n",
|
||||
"\n",
|
||||
"if last_movie:\n",
|
||||
" display_movie(last_movie)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Cleaning up\n",
|
||||
"For your convenience, below you can find code snippets to clean up any resources created as part of this tutorial that you don't wish to retain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# To archive the created experiment:\n",
|
||||
"#exp.archive()\n",
|
||||
"\n",
|
||||
"# To delete the compute target:\n",
|
||||
"#compute_target.delete()\n",
|
||||
"\n",
|
||||
"# To delete downloaded training artifacts\n",
|
||||
"#if os.path.exists(training_artifacts_path):\n",
|
||||
"# dir_util.remove_tree(training_artifacts_path)\n",
|
||||
"\n",
|
||||
"# To delete downloaded rollout videos\n",
|
||||
"#if path.exists(rollout_artifacts_path):\n",
|
||||
"# dir_util.remove_tree(rollout_artifacts_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Next\n",
|
||||
"This example was about running Reinforcement Learning in Azure Machine Learning (Ray/RLlib Framework) on a single compute. Please see [Pong Problem](../atari-on-distributed-compute/pong_rllib.ipynb)\n",
|
||||
"example which uses Ray RLlib to train a Pong playing agent on a multi-node cluster."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "hoazari"
|
||||
},
|
||||
{
|
||||
"name": "dasommer"
|
||||
}
|
||||
],
|
||||
"categories": [
|
||||
"how-to-use-azureml",
|
||||
"reinforcement-learning"
|
||||
],
|
||||
"kernel_info": {
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
},
|
||||
"microsoft": {
|
||||
"host": {
|
||||
"AzureML": {
|
||||
"notebookHasBeenCompleted": true
|
||||
}
|
||||
},
|
||||
"ms_spell_check": {
|
||||
"ms_spell_check_language": "en"
|
||||
}
|
||||
},
|
||||
"notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License.",
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "00c28698cbad9eaca051e9759b1181630e646922505b47b4c6352eb5aa72ddfc"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
|
@ -1,24 +0,0 @@
|
|||
cartpole-ppo:
  env: CartPole-v1
  run: PPO
  stop:
    episode_reward_mean: 475
    time_total_s: 300
  checkpoint_config:
    checkpoint_frequency: 2
    checkpoint_at_end: true
  config:
    # Works for both torch and tf.
    framework: torch
    gamma: 0.99
    lr: 0.0003
    num_workers: 1
    observation_filter: MeanStdFilter
    num_sgd_iter: 6
    vf_loss_coeff: 0.01
    model:
      fcnet_hiddens: [32]
      fcnet_activation: linear
      vf_share_layers: true
    enable_connectors: true
    render_env: true
|
|
@ -1,108 +0,0 @@
|
|||
import os
import sys
import argparse

from ray.rllib.evaluate import RolloutSaver, rollout
from ray_on_aml.core import Ray_On_AML
import ray.cloudpickle as cloudpickle
from ray.tune.utils import merge_dicts
from ray.tune.registry import get_trainable_cls, _global_registry, ENV_CREATOR

from azureml.core import Run
from utils import callbacks

import collections
import copy
import gymnasium as gym
import json
from pathlib import Path


def run_rollout(checkpoint, algo, render, steps, episodes):
    config_dir = os.path.dirname(checkpoint)
    config_path = os.path.join(config_dir, "params.pkl")
    config = None

    # Try parent directory.
    if not os.path.exists(config_path):
        config_path = os.path.join(config_dir, "../params.pkl")

    # Load the config from pickled.
    if os.path.exists(config_path):
        with open(config_path, "rb") as f:
            config = cloudpickle.load(f)
    # If no pkl file found, require command line `--config`.
    else:
        raise ValueError("Could not find params.pkl in either the checkpoint dir or its parent directory")

    # Make sure worker 0 has an Env.
    config["create_env_on_driver"] = True

    # Merge with `evaluation_config` (first try from command line, then from
    # pkl file).
    evaluation_config = copy.deepcopy(config.get("evaluation_config", {}))
    config = merge_dicts(config, evaluation_config)
    env = config.get("env")

    # Make sure we have evaluation workers.
    if not config.get("evaluation_num_workers"):
        config["evaluation_num_workers"] = config.get("num_workers", 0)
    if not config.get("evaluation_duration"):
        config["evaluation_duration"] = 1

    # Hard-override this as it raises a warning by Algorithm otherwise.
    # Makes no sense anyways, to have it set to None as we don't call
    # `Algorithm.train()` here.
    config["evaluation_interval"] = 1

    # Rendering settings.
    config["render_env"] = render

    # Create the Algorithm from config.
    cls = get_trainable_cls(algo)
    algorithm = cls(env=env, config=config)

    # Load state from checkpoint, if provided.
    if checkpoint:
        algorithm.restore(checkpoint)

    # Do the actual rollout.
    with RolloutSaver(
        outfile=None,
        use_shelve=False,
        write_update_file=False,
        target_steps=steps,
        target_episodes=episodes,
        save_info=False,
    ) as saver:
        rollout(algorithm, env, steps, episodes, saver, not render)
    algorithm.stop()


if __name__ == "__main__":
    # Start ray head (single node)
    ray_on_aml = Ray_On_AML()
    ray = ray_on_aml.getRay()
    if ray:
        parser = argparse.ArgumentParser()
        parser.add_argument('--dataset_path', required=True, help='Path to artifacts dataset')
        parser.add_argument('--checkpoint', required=True, help='Name of checkpoint file directory')
        parser.add_argument('--algo', required=True, help='Name of RL algorithm')
        # Parse --render as a boolean, so passing the string 'false' disables rendering
        parser.add_argument('--render', default=False, required=False, type=lambda s: str(s).lower() == 'true', help='True to render')
        parser.add_argument('--steps', required=False, type=int, help='Number of steps to run')
        parser.add_argument('--episodes', required=False, type=int, help='Number of episodes to run')
        args = parser.parse_args()

        # Get a handle to run
        run = Run.get_context()

        # Get handles to the training artifacts dataset and mount path
        dataset_path = run.input_datasets['dataset_path']

        # Find checkpoint file to be evaluated
        checkpoint = os.path.join(dataset_path, args.checkpoint)
        print('Checkpoint:', checkpoint)

        # Start rollout
        ray.init(address='auto')
        run_rollout(checkpoint, args.algo, args.render, args.steps, args.episodes)
|
|
@ -1,34 +0,0 @@
|
|||
from ray_on_aml.core import Ray_On_AML
import yaml
from ray.tune.tune import run_experiments
from utils import callbacks
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', help='Path to yaml configuration file')
    args = parser.parse_args()

    ray_on_aml = Ray_On_AML()
    ray = ray_on_aml.getRay()
    if ray:  # in the headnode
        ray.init(address="auto")
        print("Configuring run from file: ", args.config)
        experiment_config = None
        with open(args.config, "r") as file:
            experiment_config = yaml.safe_load(file)

        # Set storage_path in each experiment configuration to ensure generated logs get picked up
        # Also set monitor to ensure videos are captured
        for experiment_name, experiment in experiment_config.items():
            experiment["storage_path"] = "./logs"
            experiment['config']['monitor'] = True
        print(f'Config: {experiment_config}')

        trials = run_experiments(
            experiment_config,
            callbacks=[callbacks.TrialCallback()],
            verbose=2
        )
    else:
        print("in worker node")
|
|
@ -1,35 +0,0 @@
|
|||
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
python-opengl \
|
||||
rsync \
|
||||
xvfb && \
|
||||
apt-get clean -y && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
rm -rf /usr/share/man/*
|
||||
|
||||
RUN pip install ray-on-aml==0.2.4 \
|
||||
ray==2.4.0 \
|
||||
ray[rllib]==2.4.0 \
|
||||
mlflow==2.3.1 \
|
||||
azureml-defaults==1.50.0 \
|
||||
azureml-dataset-runtime[fuse,pandas]==1.50.0 \
|
||||
azureml-contrib-reinforcementlearning==1.50.0 \
|
||||
gputil==1.4.0 \
|
||||
scipy==1.9.1 \
|
||||
pyglet==2.0.6 \
|
||||
cloudpickle==2.2.1 \
|
||||
tensorflow==2.11.0 \
|
||||
tensorflow-probability==0.19.0 \
|
||||
torch \
|
||||
tabulate==0.9.0 \
|
||||
dm_tree==0.1.8 \
|
||||
lz4==4.3.2 \
|
||||
psutil==5.9.4 \
|
||||
setproctitle==1.3.2 \
|
||||
pygame==2.1.0 \
|
||||
gymnasium[classic_control]==0.26.3 \
|
||||
gym[classic_control]==0.26.2
|
||||
|
||||
# Display the exact versions we have installed
|
||||
RUN pip freeze
|
|
@ -1,22 +0,0 @@
|
|||
'''RLlib callbacks module:
Common callback methods to be passed to RLlib trainer.
'''

from azureml.core import Run
from ray import tune
from ray.tune import Callback
from ray.air import session


class TrialCallback(Callback):

    def on_trial_result(self, iteration, trials, trial, result, **info):
        '''Callback on train result to record metrics returned by trainer.
        '''
        run = Run.get_context()
        run.log(
            name='episode_reward_mean',
            value=result["episode_reward_mean"])
        run.log(
            name='episodes_total',
            value=result["episodes_total"])
|
|
@ -1,13 +0,0 @@
|
|||
'''Misc module:
Miscellaneous helper functions and utilities.
'''

import os
import glob


# Helper function to find a file or folder path
def find_path(name, path_prefix):
    for root, _, _ in os.walk(path_prefix):
        if glob.glob(os.path.join(root, name)):
            return root
|
Binary file not shown.
Before Width: | Height: | Size: 1.3 KiB |
|
@ -101,7 +101,7 @@
|
|||
"\n",
|
||||
"# Check core SDK version number\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using SDK version 1.56.0, you are currently running version\", azureml.core.VERSION)"
|
||||
"print(\"This notebook was created using SDK version 1.57.0, you are currently running version\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
@ -186,8 +186,7 @@
|
|||
"\n",
|
||||
"# Specify conda dependencies with scikit-learn and temporary pointers to mlflow extensions\n",
|
||||
"cd = CondaDependencies.create(\n",
|
||||
" conda_packages=[\"scikit-learn\", \"matplotlib\"],\n",
|
||||
" pip_packages=[\"azureml-mlflow\", \"pandas\", \"numpy\"]\n",
|
||||
" pip_packages=[\"azureml-mlflow\", \"scikit-learn\", \"matplotlib\", \"pandas\", \"numpy\"]\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"env.python.conda_dependencies = cd"
|
||||
|
|
index.md
|
@ -25,7 +25,6 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
|
|||
| [Forecasting away from training data](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-forecast-function/auto-ml-forecasting-function.ipynb) | Forecasting | None | Remote | None | Azure ML AutoML | Forecasting, Confidence Intervals |
|
||||
| [Automated ML run with basic edition features.](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb) | Classification | Bankmarketing | AML | ACI | None | featurization, explainability, remote_run, AutomatedML |
|
||||
| [Classification of credit card fraudulent transactions using Automated ML](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb) | Classification | Creditcard | AML Compute | None | None | remote_run, AutomatedML |
|
||||
| [Classification of credit card fraudulent transactions using Automated ML](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/experimental/classification-credit-card-fraud-local-managed/auto-ml-classification-credit-card-fraud-local-managed.ipynb) | Classification | Creditcard | AML Compute | None | None | AutomatedML |
|
||||
| [Automated ML run with featurization and model explainability.](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb) | Regression | MachineData | AML | ACI | None | featurization, explainability, remote_run, AutomatedML |
|
||||
| [auto-ml-forecasting-backtest-single-model](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-backtest-single-model/auto-ml-forecasting-backtest-single-model.ipynb) | | None | Remote | None | Azure ML AutoML | |
|
||||
| :star:[Azure Machine Learning Pipeline with DataTranferStep](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-data-transfer.ipynb) | Demonstrates the use of DataTranferStep | Custom | ADF | None | Azure ML | None |
|
||||
|
@ -119,8 +118,6 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
|
|||
| [nyc-taxi-data-regression-model-building](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/nyc-taxi-data-regression-model-building.ipynb) | | | | | | |
|
||||
| [authentication-in-azureml](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/manage-azureml-service/authentication-in-azureml/authentication-in-azureml.ipynb) | | | | | | |
|
||||
| [pong_rllib](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb) | | | | | | |
|
||||
| [cartpole_ci](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb) | | | | | | |
|
||||
| [cartpole_sc](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_sc.ipynb) | | | | | | |
|
||||
| [rai-loan-decision](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/responsible-ai/visualize-upload-loan-decision/rai-loan-decision.ipynb) | | | | | | |
|
||||
| [Logging APIs](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb) | Logging APIs and analyzing results | None | None | None | None | None |
|
||||
| [configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master//setup-environment/configuration.ipynb) | | | | | | |
|
||||
|
|
|
@ -102,7 +102,7 @@
|
|||
"source": [
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using version 1.56.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.57.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
|
|
|
@ -280,7 +280,7 @@
|
|||
"# get a curated environment\n",
|
||||
"env = Environment.get(\n",
|
||||
" workspace=ws, \n",
|
||||
" name=\"AzureML-sklearn-1.0\"\n",
|
||||
" name=\"AzureML-sklearn-1.5\"\n",
|
||||
")\n",
|
||||
"env.inferencing_stack_version='latest'\n",
|
||||
"\n",
|
||||
|
|
|
@ -151,7 +151,7 @@
|
|||
"# use a curated environment that has already been built for you\n",
|
||||
"\n",
|
||||
"env = Environment.get(workspace=ws, \n",
|
||||
" name=\"AzureML-sklearn-1.0-ubuntu20.04-py38-cpu\")"
|
||||
" name=\"AzureML-sklearn-1.5\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
Binary file not shown.
After Width: | Height: | Size: 104 KiB |
|
@ -32,7 +32,7 @@
|
|||
"\n",
|
||||
"See prerequisites in the [Azure Machine Learning documentation](https://docs.microsoft.com/azure/machine-learning/service/tutorial-train-models-with-aml#prerequisites).\n",
|
||||
"\n",
|
||||
"On the computer running this notebook, conda install matplotlib, numpy, scikit-learn=0.22.1"
|
||||
"On the computer running this notebook, conda install matplotlib, numpy, scikit-learn=1.5.1"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -431,7 +431,7 @@
|
|||
"\n",
|
||||
"# to install required packages\n",
|
||||
"env = Environment('tutorial-env')\n",
|
||||
"cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults'], conda_packages = ['scikit-learn==0.22.1', 'numpy==1.23'])\n",
|
||||
"cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults'], conda_packages = ['scikit-learn==1.5.1', 'numpy==1.23.5'])\n",
|
||||
"\n",
|
||||
"env.python.conda_dependencies = cd\n",
|
||||
"\n",
|
||||
|
|
|
@ -82,7 +82,7 @@
|
|||
"\n",
|
||||
"# to install required packages\n",
|
||||
"env = Environment('tutorial-encryption-env')\n",
|
||||
"cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults', 'azure-storage-blob', 'encrypted-inference==0.9'], conda_packages = ['scikit-learn==0.22.1', 'numpy==1.23'])\n",
|
||||
"cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults', 'azure-storage-blob', 'encrypted-inference==0.9'], conda_packages = ['scikit-learn==1.5.1', 'numpy==1.23.5'])\n",
|
||||
"\n",
|
||||
"env.python.conda_dependencies = cd\n",
|
||||
"\n",
|
||||
|
|