init

2021-11-05 08:20:58 +00:00 · 2021-11-05 08:20:58 +00:00 · b43da4c145
--- a/examples/train/dask-lightgbm/README.md
+++ b/examples/train/dask-lightgbm/README.md
@ -0,0 +1,19 @@
 # LightGBM Distributed Training with DASK
 This example shows how to use DASK to train LightGBM models in distributed mode on Azure Machine Learning.
 ## Prerequisites
 - Azure Machine Learning Workspace
    - Compute Clusters for DASK
    - Compute Instance with Azure ML CLI 2.0 installed
 ## LightGBM DASK Distributed Training
 LightGBM supports distributed training with DASK. DASK is a distributed computing framework for Python. See the following documents in reference section for more details.
 ## Reference
 - [DASK](https://dask.org/)
 - [LightGBM DASK](https://lightgbm.readthedocs.io/en/latest/Parallel-Learning-Guide.html#dask)
--- a/examples/train/dask-lightgbm/conda.yml
+++ b/examples/train/dask-lightgbm/conda.yml
@ -0,0 +1,19 @@
 name: dask
 channels:
  - defaults
  - conda-forge
 dependencies:
  - python=3.8
  - pip:
      - lightgbm
      - dask
      - dask-ml
      - bokeh
      - pandas
      - notebook
      - matplotlib
      - ipykernel
      - numpy
      - scikit-learn
      - azureml-sdk
      - azureml-mlflow
--- a/examples/train/dask-lightgbm/job.yml
+++ b/examples/train/dask-lightgbm/job.yml
@ -0,0 +1,37 @@
 $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
 code: 
  local_path: src
 # This is the command that will start up the dask cluster and run the script `prep-nyctaxi.py` with the following parameters.
 # For an interactive session, just remove the --script. That will just start the cluster and mount the dataset.
 command: >-
  python startDask.py
  --script train-lgb-dask.py 
  --dataset_path {inputs.nyc_taxi_dataset}
 inputs:
  nyc_taxi_dataset:
    data: 
      path: https://azuremlexamples.blob.core.windows.net/datasets/nyctaxi/
    mode: mount
 environment: 
  conda_file: file:conda.yml
  docker: 
    image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04
 compute:
  # use a sku with lots of disk space and memory
  target: azureml:daskclusters
  instance_count: 5
 distribution:
  # The job below is currently launched with `type: pytorch` since that 
  # gives the full flexibility of assigning the work to the
  # no pytorch is actually used in this job
  type: pytorch
 experiment_name: dask-nyctaxi-lgb-train
 description: DASK LightGBM Job (Multiple Instances)
--- a/examples/train/dask-lightgbm/src/startDask.py
+++ b/examples/train/dask-lightgbm/src/startDask.py
@ -0,0 +1,221 @@
 import os
 import argparse
 import time
 from dask.distributed import Client, get_task_stream
 import sys, uuid
 import threading
 import subprocess
 import socket
 import mlflow
 from bokeh.io import export_png # dashboard 保存
 from notebook.notebookapp import list_running_servers
 def flush(proc, proc_log):
    while True:
        proc_out = proc.stdout.readline()
        if proc_out == "" and proc.poll() is not None:
            proc_log.close()
            break
        elif proc_out:
            sys.stdout.write(proc_out)
            proc_log.write(proc_out)
            proc_log.flush()
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--jupyter_token", default=uuid.uuid1().hex)
    parser.add_argument("--script")
    args, unparsed = parser.parse_known_args()
    for k, v in os.environ.items():
        if k.startswith("MLFLOW"):
            print(k, v)
    MLFLOW_RUN_ID = os.getenv("MLFLOW_RUN_ID")
    # Dashboard の保存　
    client = Client()
    # 環境変数から、Daskの起動時に必要な情報を取得する
    print(
        "- env: AZ_BATCHAI_JOB_MASTER_NODE_IP: ",
        os.environ.get("AZ_BATCHAI_JOB_MASTER_NODE_IP"),
    )
    print(
        "- env: AZ_BATCHAI_IS_CURRENT_NODE_MASTER: ",
        os.environ.get("AZ_BATCHAI_IS_CURRENT_NODE_MASTER"),
    )
    print("- env: AZ_BATCHAI_NODE_IP: ", os.environ.get("AZ_BATCHAI_NODE_IP"))
    print("- env: AZ_BATCH_HOST_LIST: ", os.environ.get("AZ_BATCH_HOST_LIST"))
    print("- env: AZ_BATCH_NODE_LIST: ", os.environ.get("AZ_BATCH_NODE_LIST"))
    print("- env: MASTER_ADDR: ", os.environ.get("MASTER_ADDR"))
    print("- env: MASTER_PORT: ", os.environ.get("MASTER_PORT"))
    print("- env: RANK: ", os.environ.get("RANK"))
    print("- env: LOCAL_RANK: ", os.environ.get("LOCAL_RANK"))
    print("- env: NODE_RANK: ", os.environ.get("NODE_RANK"))
    print("- env: WORLD_SIZE: ", os.environ.get("WORLD_SIZE"))
    rank = os.environ.get("RANK")
    ip = socket.gethostbyname(socket.gethostname())
    master = os.environ.get("MASTER_ADDR")
    master_port = os.environ.get("MASTER_PORT")
    print("- my rank is ", rank)
    print("- my ip is ", ip)
    print("- master is ", master)
    print("- master port is ", master_port)
    scheduler = master + ":8786"
    dashboard = master + ":8787"
    print("- scheduler is ", scheduler)
    print("- dashboard is ", dashboard)
    print("args: ", args)
    print("unparsed: ", unparsed)
    print("- my rank is ", rank)
    print("- my ip is ", ip)
    if not os.path.exists("logs"):
        os.makedirs("logs")
    print("free disk space on /tmp")
    os.system(f"df -P /tmp")
    mlflow.log_param("WORLD_SIZE", os.environ.get("WORLD_SIZE"))
    # RANK 0 での処理
    if str(rank) == "0":
        mlflow.log_param("headnode", ip)
        mlflow.log_param(
            "cluster",
            "scheduler: {scheduler}, dashboard: {dashboard}".format(
                scheduler=scheduler, dashboard=dashboard
            ),
        )
        cmd = (
            "jupyter lab --ip 0.0.0.0 --port 8888"
            + " --NotebookApp.token={token}"
            + " --allow-root --no-browser"
        ).format(token=args.jupyter_token)
        os.environ["MLFLOW_RUN_ID"] = MLFLOW_RUN_ID
        jupyter_log = open("logs/jupyter_log.txt", "w")
        jupyter_proc = subprocess.Popen(
            cmd.split(),
            universal_newlines=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        jupyter_flush = threading.Thread(target=flush, args=(jupyter_proc, jupyter_log))
        jupyter_flush.start()
        # while not list(list_running_servers()):
        #    time.sleep(5)
        # jupyter_servers = list(list_running_servers())
        # assert (len(jupyter_servers) == 1), "more than one jupyter server is running"
        mlflow.log_param(
            "jupyter", "ip: {ip_addr}, port: {port}".format(ip_addr=ip, port="8888")
        )
        mlflow.log_param("jupyter-token", args.jupyter_token)
        cmd = (
            "dask-scheduler "
            + "--port "
            + scheduler.split(":")[1]
            + " --dashboard-address "
            + dashboard
        )
        print(cmd)
        os.environ["MLFLOW_RUN_ID"] = MLFLOW_RUN_ID
        scheduler_log = open("logs/scheduler_log.txt", "w")
        scheduler_proc = subprocess.Popen(
            cmd.split(),
            universal_newlines=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        scheduler_flush = threading.Thread(
            target=flush, args=(scheduler_proc, scheduler_log)
        )
        scheduler_flush.start()
        cmd = "dask-worker " + scheduler
        print(cmd)
        os.environ["MLFLOW_RUN_ID"] = MLFLOW_RUN_ID
        worker_log = open("logs/worker_{rank}_log.txt".format(rank=rank), "w")
        worker_proc = subprocess.Popen(
            cmd.split(),
            universal_newlines=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        worker_flush = threading.Thread(target=flush, args=(worker_proc, worker_log))
        worker_flush.start()
        print("### OUTPUT STREAM ###")
        with get_task_stream(client, plot='save', filename='task_stream.html') as ts:
            futs = client.map(lambda x: time.sleep(x**2), range(5))
            results = client.gather(futs)
        if args.script:
            command_line = " ".join(["python", args.script] + unparsed)
            print("Launching:", command_line)
            os.environ["MLFLOW_RUN_ID"] = MLFLOW_RUN_ID
            driver_log = open("logs/driver_log.txt", "w")
            driver_proc = subprocess.Popen(
                command_line.split(),
                universal_newlines=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )
            driver_flush = threading.Thread(
                target=flush, args=(driver_proc, driver_log)
            )
            driver_flush.start()
            # Wait until process terminates (without using p.wait())
            # while driver_proc.poll() is None:
            #    # Process hasn't exited yet, let's wait some
            #    time.sleep(0.5)
            print("waiting for driver process to terminate")
            driver_proc.wait()
            exit_code = driver_proc.returncode
            print("process ended with code", exit_code)
            print("killing scheduler, worker and jupyter")
            jupyter_proc.kill()
            scheduler_proc.kill()
            worker_proc.kill()
            exit(exit_code)
            export_png(ts.figure, filename="./outputs/plot_{rank}.png")
        else:
            flush(scheduler_proc, scheduler_log)
    # RANK 0 以外の処理
    else:
        cmd = "dask-worker " + scheduler
        print(cmd)
        os.environ["MLFLOW_RUN_ID"] = MLFLOW_RUN_ID
        worker_log = open("logs/worker_{rank}_log.txt".format(rank=rank), "w")
        worker_proc = subprocess.Popen(
            cmd.split(),
            universal_newlines=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        flush(worker_proc, worker_log)
--- a/examples/train/dask-lightgbm/src/train-lgb-dask.py
+++ b/examples/train/dask-lightgbm/src/train-lgb-dask.py
@ -0,0 +1,66 @@
 import argparse
 import pickle
 from re import VERBOSE
 import time
 import dask.dataframe as dd
 import joblib
 import lightgbm as lgb
 import mlflow
 from dask.distributed import Client, LocalCluster, performance_report, wait
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_path")
    args = parser.parse_args()
    dataset_path = args.dataset_path
    OUTOUT_DIR = './outputs'
    print("loading data")
    df = dd.read_csv(f"{dataset_path}/*.csv", parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"])
    df["total"] = df["total_amount"] + df["tolls_amount"] + df["tip_amount"] + df["extra"]
    dX = df.drop(["store_and_fwd_flag", "tpep_pickup_datetime", "tpep_dropoff_datetime", "total", "total_amount", "tolls_amount", "tip_amount", "extra"], axis=1)
    dy = df["total"]
    print("initializing a Dask cluster")
    #cluster = LocalCluster(dashboard_address=':9999') # port 8787 is already used by R studio server
    #client = Client(cluster)
    client = Client("localhost:8786")
    print("created a Dask LocalCluster")
    dX.persist()
    dy.persist()
    wait(dX)
    wait(dy)
    print("distributing training data on the Dask cluster")
    print("beginning training")
    dask_model = lgb.DaskLGBMRegressor(n_estimators=100)
    with performance_report(filename="./outputs/dask-report.html"):
        start = time.time()
        dask_model.fit(dX, dy, verbose=5)
        elapsed_time = time.time() - start
        print("elapsed time: {}".format(elapsed_time))
        mlflow.log_metric("training_time", elapsed_time)
        assert dask_model.fitted_
    # Save sklearn Estimator Model
    sklearn_model = dask_model.to_local()
    joblib.dump(sklearn_model, "./outputs/dask-sklearn-model.joblib")
    # Save Dask LightGBM Model
    with open("./outputs/dask-model.pkl", "wb") as f:
        pickle.dump(dask_model, f)
    print("done training")
--- a/examples/train/nni-hyperband/README.md
+++ b/examples/train/nni-hyperband/README.md
@ -0,0 +1,18 @@
 # HyperParameter Tuning HyperBand with NNI
 This example shows how to use NNI to perform hyperparameter tuning with HyperBand on Azure Machine Learning.
 ## Prerequisites
 - Azure Machine Learning Workspace
    - Compute Clusters for parallel training
    - Compute Instance with Azure ML CLI 2.0 and NNI library installed
 ## HPO with NNI
 Neural Network Intelligence (NNI) is a library that provides a unified interface for hyperparameter optimization. Many tuning algorithm is included. See the following link for more details in reference section.
 ## Reference
 - [Neural Network Intelligence (NNI)](https://github.com/microsoft/nni)
--- a/examples/train/nni-hyperband/config_hyperband.yml
+++ b/examples/train/nni-hyperband/config_hyperband.yml
@ -0,0 +1,22 @@
 searchSpaceFile: search_space.json
 trialCommand: python3 mnist.py
 trialCodeDirectory: src           # The path of trial code. By default it's ".", which means the same directory of this config file.
 trialGpuNumber: 0
 trialConcurrency: 10
 maxExperimentDuration: 10h
 maxTrialNumber: 1000
 advisor:
  name: Hyperband
  classArgs:
    R: 100                    # the maximum trial budget (could be the number of mini-batches or epochs) can be
                              # allocated to a trial. Each trial should use trial budget to control how long it runs.
    eta: 3                    # proportion of discarded trials
    optimize_mode: maximize   # maximize or minimize
    exec_mode: parallelism    # serial or parallelism
 TrainingService:
  platform: aml
  dockerImage: msranni/nni
  subscriptionId: 82a5d8d3-5322-4c49-b9d6-da6e00be5d57
  resourceGroup: azureml-automl
  workspaceName: azureml-automl
  computeTarget: cpuclusters
--- a/examples/train/nni-hyperband/src/mnist.py
+++ b/examples/train/nni-hyperband/src/mnist.py
@ -0,0 +1,146 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 """
 NNI example trial code.
 - Experiment type: Hyper-parameter Optimization
 - Trial framework: Tensorflow v2.x (Keras API)
 - Model: LeNet-5
 - Dataset: MNIST
 """
 import logging
 import tensorflow as tf
 from tensorflow.keras import Model
 from tensorflow.keras.callbacks import Callback
 from tensorflow.keras.layers import (Conv2D, Dense, Dropout, Flatten, MaxPool2D)
 from tensorflow.keras.optimizers import Adam
 import nni
 _logger = logging.getLogger('mnist_example')
 _logger.setLevel(logging.INFO)
 class MnistModel(Model):
    """
    LeNet-5 Model with customizable hyper-parameters
    """
    def __init__(self, conv_size, hidden_size, dropout_rate):
        """
        Initialize hyper-parameters.
        Parameters
        ----------
        conv_size : int
            Kernel size of convolutional layers.
        hidden_size : int
            Dimensionality of last hidden layer.
        dropout_rate : float
            Dropout rate between two fully connected (dense) layers, to prevent co-adaptation.
        """
        super().__init__()
        self.conv1 = Conv2D(filters=32, kernel_size=conv_size, activation='relu')
        self.pool1 = MaxPool2D(pool_size=2)
        self.conv2 = Conv2D(filters=64, kernel_size=conv_size, activation='relu')
        self.pool2 = MaxPool2D(pool_size=2)
        self.flatten = Flatten()
        self.fc1 = Dense(units=hidden_size, activation='relu')
        self.dropout = Dropout(rate=dropout_rate)
        self.fc2 = Dense(units=10, activation='softmax')
    def call(self, x):
        """Override ``Model.call`` to build LeNet-5 model."""
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.dropout(x)
        return self.fc2(x)
 class ReportIntermediates(Callback):
    """
    Callback class for reporting intermediate accuracy metrics.
    This callback sends accuracy to NNI framework every 100 steps,
    so you can view the learning curve on web UI.
    If an assessor is configured in experiment's YAML file,
    it will use these metrics for early stopping.
    """
    def on_epoch_end(self, epoch, logs=None):
        """Reports intermediate accuracy to NNI framework"""
        # TensorFlow 2.0 API reference claims the key is `val_acc`, but in fact it's `val_accuracy`
        if 'val_acc' in logs:
            nni.report_intermediate_result(logs['val_acc'])
        else:
            nni.report_intermediate_result(logs['val_accuracy'])
 def load_dataset():
    """Download and reformat MNIST dataset"""
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0
    x_train = x_train[..., tf.newaxis]
    x_test = x_test[..., tf.newaxis]
    return (x_train, y_train), (x_test, y_test)
 def main(params):
    """
    Main program:
      - Build network
      - Prepare dataset
      - Train the model
      - Report accuracy to tuner
    """
    model = MnistModel(
        conv_size=params['conv_size'],
        hidden_size=params['hidden_size'],
        dropout_rate=params['dropout_rate']
    )
    optimizer = Adam(learning_rate=params['learning_rate'])
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    _logger.info('Model built')
    (x_train, y_train), (x_test, y_test) = load_dataset()
    _logger.info('Dataset loaded')
    model.fit(
        x_train,
        y_train,
        batch_size=params['batch_size'],
        epochs=10,
        verbose=0,
        callbacks=[ReportIntermediates()],
        validation_data=(x_test, y_test)
    )
    _logger.info('Training completed')
    loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
    nni.report_final_result(accuracy)  # send final accuracy to NNI tuner and web UI
    _logger.info('Final accuracy reported: %s', accuracy)
 if __name__ == '__main__':
    params = {
        'dropout_rate': 0.5,
        'conv_size': 5,
        'hidden_size': 1024,
        'batch_size': 32,
        'learning_rate': 1e-4,
    }
    # fetch hyper-parameters from HPO tuner
    # comment out following two lines to run the code without NNI framework
    tuned_params = nni.get_next_parameter()
    params.update(tuned_params)
    _logger.info('Hyper-parameters: %s', params)
    main(params)
--- a/examples/train/pytorch-ddp/README.md
+++ b/examples/train/pytorch-ddp/README.md
@ -0,0 +1,13 @@
 # PyTorch Distributed Data Parallel (DDP)
 This example shows how to use Distributed Data Parallel (DDP) with PyTorch on Azure Machine Learning.
 ## Prerequisites
 - Azure Machine Learning Workspace
    - Compute Clusters with GPU for distributed training
    - Compute Instance with Azure ML CLI 2.0 installed
 ## Reference
 - [PyTorch Distributed Data Parallel (DDP)][1]
 [1]: https://pytorch.org/docs/stable/distributed.html
--- a/examples/train/pytorch-ddp/dataprep.py
+++ b/examples/train/pytorch-ddp/dataprep.py
@ -0,0 +1,16 @@
 import urllib
 import urllib.request
 import tarfile
 import os
 url = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
 filename = 'cifar-10-python.tar.gz'
 data_root = 'data'
 filepath = os.path.join(data_root, filename)
 if not os.path.isdir(data_root):
    os.makedirs(data_root, exist_ok=True)
    urllib.request.urlretrieve(url, filepath)
    with tarfile.open(filepath, "r:gz") as tar:
        tar.extractall(path=data_root)
    os.remove(filepath)  # delete tar.gz file after extraction
--- a/examples/train/pytorch-ddp/job.yml
+++ b/examples/train/pytorch-ddp/job.yml
@ -0,0 +1,26 @@
 $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
 code: 
  local_path: src
 command: >-
  python train.py 
  --epochs 1000
  --data-dir {inputs.cifar}
 inputs:
  cifar:
    data:
      local_path: data
    mode: mount
 environment: azureml:AzureML-pytorch-1.9-ubuntu18.04-py37-cuda11-gpu:3
 compute:
  target: azureml:gpuclusters2
  instance_count: 2
 distribution:
  type: pytorch 
  process_count: 4
 experiment_name: pytorch-cifar-distributed-example
 description: Train a basic convolutional neural network (CNN) with PyTorch on the CIFAR-10 dataset, distributed via PyTorch.
--- a/examples/train/pytorch-ddp/src/train.py
+++ b/examples/train/pytorch-ddp/src/train.py
@ -0,0 +1,261 @@
 # Copyright (c) 2017 Facebook, Inc. All rights reserved.
 # BSD 3-Clause License
 #
 # Script adapted from: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py
 # ==============================================================================
 # imports
 import os
 import mlflow
 import argparse
 import torch
 import torchvision
 import torchvision.transforms as transforms
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
 # define network architecture
 class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 3)
        self.conv3 = nn.Conv2d(64, 128, 3)
        self.fc1 = nn.Linear(128 * 6 * 6, 120)
        self.dropout = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(-1, 128 * 6 * 6)
        x = self.dropout(F.relu(self.fc1(x)))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
 # define functions
 def train(train_loader, model, criterion, optimizer, epoch, device, print_freq, rank):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item()
    # mlflow loss logging
    mlflow.log_metric(f"train-loss-{rank}", running_loss)
 def evaluate(test_loader, model, device):
    classes = (
        "plane",
        "car",
        "bird",
        "cat",
        "deer",
        "dog",
        "frog",
        "horse",
        "ship",
        "truck",
    )
    model.eval()
    correct = 0
    total = 0
    class_correct = list(0.0 for i in range(10))
    class_total = list(0.0 for i in range(10))
    with torch.no_grad():
        for data in test_loader:
            images, labels = data[0].to(device), data[1].to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            c = (predicted == labels).squeeze()
            for i in range(10):
                label = labels[i]
                class_correct[label] += c[i].item()
                class_total[label] += 1
    # print total test set accuracy
    print(
        "Accuracy of the network on the 10000 test images: %d %%"
        % (100 * correct / total)
    )
    mlflow.log_metric("Accuracy of test images", 100 * correct / total)
    # print test accuracy for each of the classes
    for i in range(10):
        print(
            "Accuracy of %5s : %2d %%"
            % (classes[i], 100 * class_correct[i] / class_total[i])
        )
        mlflow.log_metric(f"{classes[i]}", 100 * class_correct[i] / class_total[i])
 def main(args):
    # get PyTorch environment variables
    world_size = int(os.environ["WORLD_SIZE"])
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    distributed = world_size > 1
    # set device
    if distributed:
        device = torch.device("cuda", local_rank)
    else:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # initialize distributed process group using default env:// method
    if distributed:
        torch.distributed.init_process_group(backend="nccl")
    # define train and test dataset DataLoaders
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )
    train_set = torchvision.datasets.CIFAR10(
        root=args.data_dir, train=True, download=False, transform=transform
    )
    if distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_set)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers,
        sampler=train_sampler,
    )
    test_set = torchvision.datasets.CIFAR10(
        root=args.data_dir, train=False, download=False, transform=transform
    )
    test_loader = torch.utils.data.DataLoader(
        test_set, batch_size=args.batch_size, shuffle=False, num_workers=args.workers
    )
    model = Net().to(device)
    # wrap model with DDP
    if distributed:
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank
        )
    # define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(
        model.parameters(), lr=args.learning_rate, momentum=args.momentum
    )
    # train the model
    for epoch in range(args.epochs):
        print("Rank %d: Starting epoch %d" % (rank, epoch))
        if distributed:
            train_sampler.set_epoch(epoch)
        model.train()
        train(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            device,
            args.print_freq,
            rank,
        )
    print("Rank %d: Finished Training" % (rank))
    if not distributed or rank == 0:
        # log model
        mlflow.pytorch.log_model(model, "./model")
        # evaluate on full test dataset
        evaluate(test_loader, model, device)
 def parse_args():
    # setup argparse
    parser = argparse.ArgumentParser()
    # add arguments
    parser.add_argument(
        "--data-dir", type=str, help="directory containing CIFAR-10 dataset"
    )
    parser.add_argument("--epochs", default=10, type=int, help="number of epochs")
    parser.add_argument(
        "--batch-size",
        default=16,
        type=int,
        help="mini batch size for each gpu/process",
    )
    parser.add_argument(
        "--workers",
        default=2,
        type=int,
        help="number of data loading workers for each gpu/process",
    )
    parser.add_argument(
        "--learning-rate", default=0.001, type=float, help="learning rate"
    )
    parser.add_argument("--momentum", default=0.9, type=float, help="momentum")
    parser.add_argument(
        "--print-freq",
        default=200,
        type=int,
        help="frequency of printing training statistics",
    )
    # parse args
    args = parser.parse_args()
    # mlflow logging
    mlflow.log_param("batch size", args.batch_size)
    mlflow.log_param("learning rate", args.learning_rate)
    mlflow.log_param("momentum", args.momentum)
    # return args
    return args
 # run script
 if __name__ == "__main__":
    # add space in logs
    print("*" * 60)
    print("\n\n")
    # parse args
    args = parse_args()
    # call main function
    main(args)
    # add space in logs
    print("*" * 60)
    print("\n\n")
--- a/examples/train/ray-flaml/README.md
+++ b/examples/train/ray-flaml/README.md
@ -0,0 +1,17 @@
 # FLAML AutoML with RAY
 This example shows how to use FLAML to train a model on a dataset using RAY on Azure Machine Learning.
 ## Prerequisites
 - Azure Machine Learning Workspace
    - Compute Clusters for Ray
    - Compute Instance with Azure ML CLI 2.0 installed
 ## FLAML with RAY
 FLAML is a lightweight Python library that finds accurate machine learning models automatically, efficiently and economically. FLAML support Ray Tune for distributed search.
 ## Reference
 - [FLAML: A Framework for Learning from Data](https://github.com/microsoft/FLAML)
--- a/examples/train/ray-flaml/azureml-tensorboard.ipynb
+++ b/examples/train/ray-flaml/azureml-tensorboard.ipynb
@ -0,0 +1,91 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from azureml.core import Workspace, Run\n",
    "from azureml.tensorboard import Tensorboard\n",
    "ws = Workspace.from_config()\n",
    "run = Run.get(workspace=ws, run_id=\"cd0a70d1-aa17-4991-a9a5-98dfd0142663\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://automl-client-6006.japaneast.instances.azureml.ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'https://automl-client-6006.japaneast.instances.azureml.ms'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tb = Tensorboard([run], local_root=\"logs/azureml\", port=6006)\n",
    "tb.start()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "tb.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "be3ccc1aacd5cd0ada9eab9372c3d7f901636bca88db42a566c3f238cceb324c"
  },
  "kernelspec": {
   "display_name": "Python 3.6.13 64-bit ('ray': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/examples/train/ray-flaml/conda.yml
+++ b/examples/train/ray-flaml/conda.yml
@ -0,0 +1,13 @@
 name: ray
 channels:
  - defaults
  - conda-forge
 dependencies:
  - python=3.7
  - pip:
      - flaml[notebook, blendsearch, ray, azureml]==0.6.9
      - azureml-tensorboard
      - ipykernel
      - matplotlib
      - tensorboardX<=2.2
      - mpi4py
--- a/examples/train/ray-flaml/job.yml
+++ b/examples/train/ray-flaml/job.yml
@ -0,0 +1,28 @@
 $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
 code: 
  local_path: src
 # This is the command that will start up the ray cluster and run the script `train-automl-flaml.py` with the following parameters.
 command: >-
  python startRay.py
  --script train-automl-flaml.py
 environment: 
  conda_file: file:conda.yml
  docker: 
    image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04
 compute:
  target: azureml:cpuclusters
  instance_count: 3
 distribution:
  # The job below is currently launched with `type: pytorch` since that 
  # gives the full flexibility of assigning the work to the
  # no pytorch is actually used in this job
  type: pytorch
 experiment_name: train-automl-flaml
 description: FLAML AutoML on Ray Cluster using Azure Machine Learning Compute Cluster
--- a/examples/train/ray-flaml/src/startRay.py
+++ b/examples/train/ray-flaml/src/startRay.py
@ -0,0 +1,100 @@
 import threading
 import sys
 import subprocess
 import os
 import argparse
 from mpi4py import MPI
 def flush(proc, proc_log):
    while True:
        proc_out = proc.stdout.readline()
        if proc_out == "" and proc.poll() is not None:
            proc_log.close()
            break
        elif proc_out:
            sys.stdout.write(proc_out)
            proc_log.write(proc_out)
            proc_log.flush()
 if __name__ == "__main__":
    comm = MPI.COMM_WORLD
    mpi_rank = comm.Get_rank()
    print("mpi rank:", mpi_rank)
    parser = argparse.ArgumentParser()
    parser.add_argument("--port", default="6379", type=int)
    parser.add_argument("--script", type=str)
    args = parser.parse_args()
    # this scirpt is for flaml automl training
    script = args.script
    # port for ray head node
    port = args.port
    print("head port is ", port)
    head_ip = os.environ.get("MASTER_ADDR")
    print("head address is ", head_ip)
    rank = os.environ.get("RANK")
    print("my rank is ", rank)
    # TODO:Get Password from Azure KeyVault
    password = "password"
    # Ray Head Node
    if str(rank) == "0":
        head_log = open("logs/worker_{rank}_log.txt".format(rank=rank), "w")
        cmd = f"ray start --head --port={port} --redis-password={password} --dashboard-port=9999"
        print(cmd)
        head_proc = subprocess.Popen(
            cmd.split(),
            universal_newlines=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        head_flush = threading.Thread(
            target=flush, args=(head_proc, head_log)
        )
        head_flush.start()
        python_log = open("logs/python_{rank}_log.txt".format(rank=rank), "w")
        command_line = f"python {script} --redis-password={password}"
        driver_proc = subprocess.Popen(
            command_line.split(),
            universal_newlines=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        driver_flush = threading.Thread(
            target=flush, args=(driver_proc, python_log)
        )
        driver_flush.start()
        driver_proc.wait()  
        head_proc.kill()
        driver_proc.kill()
        print("### Head Job Finished")
    # Ray Worker Node
    else:
        worker_log = open("logs/worker_{rank}_log.txt".format(rank=rank), "w")
        cmd = f"ray start --address={head_ip}:{port} --redis-password {password}"
        print(cmd)
        worker_proc = subprocess.Popen(
            cmd.split(),
            universal_newlines=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        flush(worker_proc, worker_log)
        # worker_flush = threading.Thread(
        #     target=flush, args=(worker_proc, worker_log)
        # )
        # worker_flush.start()
        # worker_proc.wait()
--- a/examples/train/ray-flaml/src/train-automl-flaml.py
+++ b/examples/train/ray-flaml/src/train-automl-flaml.py
@ -0,0 +1,76 @@
 import argparse
 import mlflow
 import ray
 from flaml import AutoML
 from sklearn.datasets import load_diabetes
 from tensorboardX import SummaryWriter
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--redis-password", default='password')
    args = parser.parse_args()
    password = args.redis_password
    ray.init('auto', _redis_password=f'{password}')
    # Initialize an AutoML instance
    automl = AutoML()
    # Specify automl goal and constraint
    automl_settings = {
        "time_budget": 300,  # in seconds
        "metric": 'mse',
        "task": 'regression',
        "log_type": 'all',
        "n_concurrent_trials": 3,
        "log_file_name": "./outputs/diabetes.log",
        "log_training_metric": True,
        "log_type": 'all',
        "append_log": True,
    }
    X_train, y_train = load_diabetes(return_X_y=True)
    # Train with labeled input data
    # TODO: mlflow logging to Azure ML
    mlflow.log_param("n_concurrent_trials", automl_settings['n_concurrent_trials'])
    mlflow.log_param("task", automl_settings['task'])
    mlflow.log_param("metric", automl_settings['metric'])
    mlflow.log_param("time_budget", automl_settings['time_budget'])
    try:
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
    except Exception as e:
        print(e)
    finally:
        print('Best ML leaner:', automl.best_estimator)
        print('Best hyperparmeter config:', automl.best_config)
        print('Best MSE: ', automl.best_loss)
        from flaml.data import get_output_from_log
        time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \
            get_output_from_log(filename=automl_settings['log_file_name'], time_budget=240)
        print(time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history)
        with SummaryWriter(comment='azureml', log_dir="logs/azureml/") as writer:
            for config, metric in zip(config_history, metric_history):
                hparam_dict_learner = {key: value for key, value in config.items() if key == 'Current Learner'}
                hparam_dict_param = config['Current Hyper-parameters']['ml']
                writer.add_hparams(hparam_dict=dict(**hparam_dict_learner, **hparam_dict_param), metric_dict=metric)
                mlflow.log_metric("mse", metric['train_loss'])
        import matplotlib.pyplot as plt
        import numpy as np
        fig = plt.figure()
        plt.title('Learning Curve')
        plt.xlabel('Wall Clock Time (s)')
        plt.ylabel('mse')
        plt.scatter(time_history, np.array(valid_loss_history))
        plt.step(time_history, np.array(best_valid_loss_history), where='post')
        plt.savefig("figure.png")
        mlflow.log_figure(fig, "figure.png")
        plt.show()
        ray.shutdown()