Mirror of https://github.com/microsoft/hi-ml.git
ENH: Single node runs for SDK v2 can be non-distributed (#905)
Add an option to run single node SDK v2 jobs as non-distributed jobs. Distributed execution remains the default, because Kubernetes compute requires it.
Parent: 036e2311f8
Commit: f51a40860c
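For orientation, a minimal usage sketch of the new option (not part of the commit; the config file and script paths are placeholders, and the argument names mirror those used in the test added further down, assuming the usual health_azure import):

from pathlib import Path

from health_azure import submit_to_azure_if_needed

# Opt out of the MPI wrapper for a single node SDK v2 job. The default remains True,
# which is what Kubernetes compute needs.
run_info = submit_to_azure_if_needed(
    workspace_config_file="config.json",
    entry_script=Path("my_training_script.py"),
    snapshot_root_directory=Path.cwd(),
    submit_to_azureml=True,
    strictly_aml_v1=False,
    use_mpi_run_for_single_node_jobs=False,
)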
@@ -24,7 +24,7 @@ from azure.ai.ml.constants import InputOutputModes
 from azure.ai.ml.entities import Command, Data
 from azure.ai.ml.entities import Environment as EnvironmentV2
 from azure.ai.ml.entities import Job, Sweep, UserIdentityConfiguration
-from azure.ai.ml.entities._job.distribution import MpiDistribution, PyTorchDistribution
+from azure.ai.ml.entities._job.distribution import DistributionConfiguration, MpiDistribution, PyTorchDistribution
 from azure.ai.ml.sweep import Choice
 from azureml._base_sdk_common import user_agent
 from azureml.core import ComputeTarget, Environment, Experiment, Run, RunConfiguration, ScriptRunConfig, Workspace
@@ -427,6 +427,7 @@ def submit_run_v2(
     hyperparam_args: Optional[Dict[str, Any]] = None,
     num_nodes: int = 1,
     pytorch_processes_per_node: Optional[int] = None,
+    use_mpi_run_for_single_node_jobs: bool = True,
     display_name: Optional[str] = None,
 ) -> Job:
     """
@@ -454,6 +455,8 @@ def submit_run_v2(
     :param pytorch_processes_per_node: For plain PyTorch multi-GPU processing: The number of processes per node.
         If supplied, it will run a command job with the "pytorch" framework (rather than "Python"), and using "nccl"
         as the communication backend.
+    :param use_mpi_run_for_single_node_jobs: If True, even single node jobs will be run as distributed MPI jobs.
+        This is required for Kubernetes compute. If False, single node jobs will not be run as distributed jobs.
     :param display_name: The name for the run that will be displayed in the AML UI. If not provided, a random
         display name will be generated by AzureML.
     :return: An AzureML Run object.
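As a quick illustration of the distribution flavours this docstring refers to, a sketch only, using the entities imported at the top of this diff:

from azure.ai.ml.entities._job.distribution import MpiDistribution, PyTorchDistribution

# Default for single node jobs (use_mpi_run_for_single_node_jobs=True): an MPI run with one
# process per node, so that GPUs stay visible on Kubernetes compute.
single_node_mpi = MpiDistribution(process_count_per_instance=1)

# pytorch_processes_per_node=4: a plain PyTorch job with four processes per node, which the
# docstring above describes as the "pytorch" framework with "nccl" as the communication backend.
pytorch_multi_gpu = PyTorchDistribution(process_count_per_instance=4)

# use_mpi_run_for_single_node_jobs=False and no PyTorch processes: no distribution at all.
non_distributed = None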
@@ -484,11 +487,13 @@ def submit_run_v2(
         raise ValueError("pytorch_processes_per_node must be >= 1")

     def create_command_job(cmd: str) -> Command:
+        distribution: Optional[DistributionConfiguration] = None
         if pytorch_processes_per_node is None:
             # On AML managed compute, we can set distribution to None for single node jobs.
             # However, on Kubernetes compute, single node jobs don't see any GPUs. GPUs are visible for MpiDistribution
             # jobs, so we set MpiDistribution even for single node jobs.
-            distribution: Union[MpiDistribution, PyTorchDistribution] = MpiDistribution(process_count_per_instance=1)
+            if use_mpi_run_for_single_node_jobs:
+                distribution = MpiDistribution(process_count_per_instance=1)
         else:
             distribution = PyTorchDistribution(process_count_per_instance=pytorch_processes_per_node)
         return command(
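The sketch below shows where the selected distribution object ends up: it is handed to the SDK v2 command() builder together with instance_count, which is exactly what the tests further down assert on. The source directory, command string, environment and compute names are placeholders, not values taken from this commit:

from azure.ai.ml import command
from azure.ai.ml.entities._job.distribution import MpiDistribution

# With use_mpi_run_for_single_node_jobs=False, distribution=None would be passed here instead,
# giving a plain, non-distributed single node job.
job = command(
    code=".",                                 # placeholder source directory
    command="python my_training_script.py",   # placeholder entry command
    environment="my-environment@latest",      # placeholder environment reference
    compute="my-cluster",                     # placeholder compute target
    instance_count=1,
    distribution=MpiDistribution(process_count_per_instance=1),
)
# An azure.ai.ml.MLClient instance would then submit it via ml_client.jobs.create_or_update(job).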
@@ -745,6 +750,7 @@ def submit_to_azure_if_needed(  # type: ignore
     strictly_aml_v1: bool = False,
     identity_based_auth: bool = False,
     pytorch_processes_per_node_v2: Optional[int] = None,
+    use_mpi_run_for_single_node_jobs: bool = True,
     display_name: Optional[str] = None,
 ) -> AzureRunInfo:  # pragma: no cover
     """
@@ -813,6 +819,9 @@ def submit_to_azure_if_needed(  # type: ignore
     :param pytorch_processes_per_node_v2: For plain PyTorch multi-GPU processing: The number of processes per node. This
         is only supported with AML SDK v2, and ignored in v1. If supplied, the job will be submitted as using the
         "pytorch" framework (rather than "Python"), and using "nccl" as the communication backend.
+    :param use_mpi_run_for_single_node_jobs: If True, even single node jobs with SDK v2 will be run as distributed MPI
+        jobs. This is required for Kubernetes compute. If False, single node jobs will not be run as distributed jobs.
+        This setting only affects jobs submitted with SDK v2 (when `strictly_aml_v1=False`)
     :param display_name: The name for the run that will be displayed in the AML UI. If not provided, a random
         display name will be generated by AzureML.
     :return: If the script is submitted to AzureML then we terminate python as the script should be executed in AzureML,
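For contrast with the single node case, a sketch of the multi-GPU path this docstring describes (values are illustrative; num_nodes is assumed to be a parameter of submit_to_azure_if_needed, as suggested by the pass-through in the hunk below):

from pathlib import Path

from health_azure import submit_to_azure_if_needed

# Two nodes with four GPU processes each: submitted via SDK v2 as a "pytorch" job using NCCL.
# use_mpi_run_for_single_node_jobs is irrelevant here, since a PyTorch distribution is used.
run_info = submit_to_azure_if_needed(
    workspace_config_file="config.json",
    entry_script=Path("my_training_script.py"),
    snapshot_root_directory=Path.cwd(),
    submit_to_azureml=True,
    strictly_aml_v1=False,
    num_nodes=2,
    pytorch_processes_per_node_v2=4,
)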
@@ -981,6 +990,7 @@ def submit_to_azure_if_needed(  # type: ignore
            hyperparam_args=hyperparam_args,
            num_nodes=num_nodes,
            pytorch_processes_per_node=pytorch_processes_per_node_v2,
+           use_mpi_run_for_single_node_jobs=use_mpi_run_for_single_node_jobs,
        )

        if after_submission is not None:
@@ -2023,7 +2023,8 @@ def test_submit_to_azure_v2_distributed() -> None:
     assert call_kwargs.get("num_nodes") == num_nodes
     assert call_kwargs.get("pytorch_processes_per_node") == processes_per_node

-    # Single node job: The "distribution" argument of "command" should be set to None
+    # Single node job: The "distribution" argument of "command" should be set to MpiRun, to ensure that it
+    # runs fine on Kubernetes compute.
     with patch("health_azure.himl.command") as mock_command:
         _ = himl.submit_to_azure_if_needed(
             workspace_config_file="mockconfig.json",
@@ -2037,6 +2038,21 @@ def test_submit_to_azure_v2_distributed() -> None:
     assert call_kwargs.get("instance_count") == 1
     assert call_kwargs.get("distribution") == MpiDistribution(process_count_per_instance=1)

+    # Single node job: The "distribution" argument of "command" should be set to None if we are passing a flag
+    with patch("health_azure.himl.command") as mock_command:
+        _ = himl.submit_to_azure_if_needed(
+            workspace_config_file="mockconfig.json",
+            entry_script=Path(__file__),
+            snapshot_root_directory=Path.cwd(),
+            submit_to_azureml=True,
+            strictly_aml_v1=False,
+            use_mpi_run_for_single_node_jobs=False,
+        )
+    mock_command.assert_called_once()
+    _, call_kwargs = mock_command.call_args
+    assert call_kwargs.get("instance_count") == 1
+    assert call_kwargs.get("distribution") == None
+
     with pytest.raises(ValueError, match="num_nodes must be >= 1"):
         _ = himl.submit_to_azure_if_needed(
             workspace_config_file="mockconfig.json",