BUG: Allow relative path for entry script for SDK v2 submission (#904)

When submitting a script with a path relative to current working
directory, SDK v2 raises an error, even though the script exists.

Closes #890
This commit is contained in:
Anton Schwaighofer 2023-11-02 14:40:27 +00:00 коммит произвёл GitHub
Родитель 1a754d8fcc
Коммит f3525192e1
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 144 добавлений и 37 удалений

Просмотреть файл

@ -64,6 +64,9 @@ from health_azure.utils import (
run_duration_string_to_seconds,
to_azure_friendly_string,
wait_for_job_completion,
sanitize_snapshoot_directory,
sanitize_entry_script,
_is_module_calling_syntax,
)
logger = logging.getLogger('health_azure')
@ -340,8 +343,8 @@ def create_crossval_hyperparam_args_v2(
def create_script_run(
script_params: List[str],
snapshot_root_directory: Optional[Path] = None,
entry_script: Optional[PathOrString] = None,
snapshot_root_directory: Optional[Path],
entry_script: Optional[PathOrString],
) -> ScriptRunConfig:
"""
Creates an AzureML ScriptRunConfig object, that holds the information about the snapshot, the entry script, and
@ -355,31 +358,9 @@ def create_script_run(
executed.
:return:
"""
if snapshot_root_directory is None:
print("No snapshot root directory given. All files in the current working directory will be copied to AzureML.")
snapshot_root_directory = Path.cwd()
else:
print(f"All files in this folder will be copied to AzureML: {snapshot_root_directory}")
if entry_script is None:
entry_script = Path(sys.argv[0])
print("No entry script given. The current main Python file will be executed in AzureML.")
elif isinstance(entry_script, str):
entry_script = Path(entry_script)
if entry_script.is_absolute():
try:
# The entry script always needs to use Linux path separators, even when submitting from Windows
entry_script_relative = entry_script.relative_to(snapshot_root_directory).as_posix()
except ValueError:
raise ValueError(
"The entry script must be inside of the snapshot root directory. "
f"Snapshot root: {snapshot_root_directory}, entry script: {entry_script}"
)
else:
entry_script_relative = str(entry_script)
print(f"This command will be run in AzureML: {entry_script_relative} {' '.join(script_params)}")
return ScriptRunConfig(
source_directory=str(snapshot_root_directory), script=entry_script_relative, arguments=script_params
)
snapshot_root = sanitize_snapshoot_directory(snapshot_root_directory)
entry_script_relative = sanitize_entry_script(entry_script, snapshot_root)
return ScriptRunConfig(source_directory=str(snapshot_root), script=entry_script_relative, arguments=script_params)
def effective_experiment_name(experiment_name: Optional[str], entry_script: Optional[PathOrString] = None) -> str:
@ -398,7 +379,7 @@ def effective_experiment_name(experiment_name: Optional[str], entry_script: Opti
elif experiment_name:
raw_value = experiment_name
elif entry_script is not None:
if str(entry_script)[:3] == "-m ":
if _is_module_calling_syntax(str(entry_script)):
raw_value = str(entry_script)[3:]
else:
raw_value = Path(entry_script).stem
@ -461,18 +442,15 @@ def submit_run_v2(
display name will be generated by AzureML.
:return: An AzureML Run object.
"""
snapshot_root_directory = snapshot_root_directory or Path.cwd()
root_dir = Path(snapshot_root_directory)
root_dir = sanitize_snapshoot_directory(snapshot_root_directory)
entry_script_relative = sanitize_entry_script(entry_script, root_dir)
if str(entry_script)[:2] != "-m":
entry_script = Path(entry_script).relative_to(root_dir).as_posix()
experiment_name = effective_experiment_name(experiment_name, entry_script)
experiment_name = effective_experiment_name(experiment_name, entry_script_relative)
script_params = script_params or []
script_param_str = create_v2_job_command_line_args_from_params(script_params)
cmd = " ".join(["python", str(entry_script), script_param_str])
cmd = " ".join(["python", str(entry_script_relative), script_param_str])
print(f"The following command will be run in AzureML: {cmd}")

Просмотреть файл

@ -11,6 +11,7 @@ import logging
import os
import re
import shutil
import sys
import tempfile
import time
from collections import defaultdict
@ -2066,3 +2067,42 @@ def filter_v2_input_output_args(args: List[str]) -> List[str]:
any integer.
"""
return [a for a in args if not re.match(V2_INPUT_DATASET_PATTERN, a) and not re.match(V2_OUTPUT_DATASET_PATTERN, a)]
def _is_module_calling_syntax(entry_script: str) -> bool:
"""Returns True if the entry script is of the format seen when calling Python like 'python -m Foo.bar'"""
return entry_script.startswith("-m ")
def sanitize_snapshoot_directory(snapshot_root_directory: Optional[PathOrString]) -> Path:
"""Sets the default values for the snapshoot root directory, which is the current working directory."""
if snapshot_root_directory is None:
print("No snapshot root directory given. All files in the current working directory will be copied to AzureML.")
return Path.cwd()
else:
print(f"All files in this folder will be copied to AzureML: {snapshot_root_directory}")
return Path(snapshot_root_directory)
def sanitize_entry_script(entry_script: Optional[PathOrString], snapshot_root: Path) -> str:
if entry_script is None:
print("No entry script given. The current main Python file will be executed in AzureML.")
entry_script_path = Path(sys.argv[0])
elif isinstance(entry_script, str):
if _is_module_calling_syntax(entry_script):
return entry_script
entry_script_path = Path(entry_script)
elif isinstance(entry_script, Path):
entry_script_path = entry_script
else:
raise ValueError(f"entry_script must be a string or Path, but got {type(entry_script)}")
if entry_script_path.is_absolute():
try:
entry_script_path = entry_script_path.relative_to(snapshot_root)
except ValueError:
raise ValueError(
"The entry script must be inside of the snapshot root directory. "
f"Snapshot root: {snapshot_root}, entry script: {entry_script}"
)
# The entry script always needs to use Linux path separators, even when submitting from Windows
return str(entry_script_path.as_posix())

Просмотреть файл

@ -42,12 +42,15 @@ from health_azure.utils import (
download_files_by_suffix,
download_file_if_necessary,
resolve_workspace_config_path,
sanitize_snapshoot_directory,
sanitize_entry_script,
)
from testazure.test_himl import RunTarget, render_and_run_test_script
from testazure.utils_testazure import (
DEFAULT_IGNORE_FOLDERS,
DEFAULT_WORKSPACE,
MockRun,
change_working_directory,
create_unittest_run_object,
experiment_for_unittests,
repository_root,
@ -2143,3 +2146,39 @@ def test_resolve_workspace_config_path_missing(tmp_path: Path) -> None:
mocked_file = tmp_path / "foo.json"
with pytest.raises(FileNotFoundError, match="Workspace config file does not exist"):
resolve_workspace_config_path(mocked_file)
@pytest.mark.fast
def test_sanitize_snapshot_root(tmp_path: Path) -> None:
"""Test if the snapshot root directory will default to the current working directory if not provided.
Otherwise, pass through unchanged.
"""
with change_working_directory(tmp_path):
assert sanitize_snapshoot_directory(None) == tmp_path
folder = tmp_path / "foo"
assert sanitize_snapshoot_directory(folder) == folder
assert sanitize_snapshoot_directory(str(folder)) == folder
@pytest.mark.fast
def test_sanitize_entry_script(tmp_path: Path) -> None:
script = "some_script"
# If no entry script is given, derive from sys.argv
with patch("sys.argv", [script]):
assert sanitize_entry_script(None, tmp_path) == script
# If an entry script is given and it is valid, pass through unchanged
assert sanitize_entry_script(script, tmp_path) == script
# If the entry script is in a subfolder, we want to get the script with folder back, as a string.
folder = "folder"
script_with_folder = tmp_path / folder / script
assert sanitize_entry_script(script_with_folder, tmp_path) == f"{folder}/{script}"
# If the entry script is of the format "-m Foo.bar", return unchanged.
entry_module = "-m Foo.bar"
assert sanitize_entry_script(entry_module, tmp_path) == entry_module
# Error case: Invalid argument
with pytest.raises(ValueError, match="entry_script must be a string or Path"):
assert sanitize_entry_script([], tmp_path) # type: ignore
# Error case: Entry script is not in the snapshot
with pytest.raises(ValueError, match="entry script must be inside of the snapshot root"):
assert sanitize_entry_script(script_with_folder, tmp_path / "other_folder")

Просмотреть файл

@ -448,15 +448,19 @@ def test_invalid_entry_script(tmp_path: Path) -> None:
)
assert "entry script must be inside of the snapshot root directory" in str(e)
# If no info is provided, the script run should copy all of the current working directory, and read the entry
# script from sys.argv
with mock.patch("sys.argv", ["foo"]):
script_params = himl._get_script_params()
script_run = himl.create_script_run(script_params)
script_run = himl.create_script_run(script_params, snapshot_root_directory=None, entry_script=None)
assert script_run.source_directory == str(Path.cwd())
assert script_run.script == "foo"
assert script_run.arguments == []
# Entry scripts where the path is not absolute should be left unchanged
script_run = himl.create_script_run(entry_script="some_string", script_params=["--foo"])
script_run = himl.create_script_run(
script_params=["--foo"], snapshot_root_directory=None, entry_script="some_string"
)
assert script_run.script == "some_string"
assert script_run.arguments == ["--foo"]
@ -1865,6 +1869,52 @@ def test_submitting_script_with_sdk_v2(tmp_path: Path, wait_for_completion: bool
assert after_submission_called, "after_submission callback was not called"
def test_submitting_script_with_sdk_v2_accepts_relative_path(tmp_path: Path) -> None:
"""
Test that submission of a script with AML V2 works when the script path is relative to the current working folder.
"""
# Create a minimal script in a temp folder. Script
script_name = "test_script.py"
test_script = tmp_path / script_name
test_script.write_text("print('hello world')")
conda_env_path = create_empty_conda_env(tmp_path)
with patch.multiple(
"health_azure.himl",
get_ml_client=DEFAULT,
create_python_environment_v2=DEFAULT,
register_environment_v2=DEFAULT,
):
with patch("health_azure.himl.command", side_effect=NotImplementedError) as mock_command:
# Try calling the submission code with both relative and absolute paths. In either case, the entry script
# should be converted to a single string that is relative to the snapshot root directory.
for index, entry_script in enumerate([script_name, test_script]):
with pytest.raises(NotImplementedError):
himl.submit_to_azure_if_needed(
entry_script=entry_script, # type: ignore
conda_environment_file=conda_env_path,
snapshot_root_directory=tmp_path,
submit_to_azureml=True,
strictly_aml_v1=False,
)
assert mock_command.call_count == index + 1
_, call_kwargs = mock_command.call_args
# The constructed command is to start python with the given entry script, given as a relative path
expected_command = "python " + script_name
assert call_kwargs.get("command").startswith(expected_command), "Incorrect script argument"
# Submission should fail with an error if the entry script is not inside the snapshot root
with pytest.raises(ValueError, match="entry script must be inside of the snapshot root"):
with pytest.raises(NotImplementedError):
himl.submit_to_azure_if_needed(
entry_script=test_script,
conda_environment_file=conda_env_path,
snapshot_root_directory=tmp_path / "subfolder",
submit_to_azureml=True,
strictly_aml_v1=False,
)
def test_submitting_script_with_sdk_v2_passes_display_name(tmp_path: Path) -> None:
"""
Test that submission of a script with SDK v2 passes the display_name parameter to the "command" function