Fix for invalid "." dataset location in AzureML (#161)

* Catch "." and raise an exception
2021-11-23 15:40:59 +00:00 · 2021-11-23 15:40:59 +00:00 · 145e7dc9a2
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -21,6 +21,7 @@ the section headers (Added/Changed/...) and incrementing the package version.
 - ([164](https://github.com/microsoft/hi-ml/pull/164)) Look in more locations for std out from AzureML run.

 ### Fixed
+- ([#161](https://github.com/microsoft/hi-ml/pull/161)) Empty string as target folder for a dataset creates an invalid mounting path for the dataset in AzureML (fixes #160)

 ### Removed

--- a/hi-ml-azure/src/health_azure/datasets.py
+++ b/hi-ml-azure/src/health_azure/datasets.py
@ -95,7 +95,7 @@ class DatasetConfig:
            Defaults: False (downloading) for datasets that are script inputs, True (mounting) for datasets that are
            script outputs.
        :param target_folder: The folder into which the dataset should be downloaded or mounted. If left empty, a
-            random folder on /tmp will be chosen.
+            random folder on /tmp will be chosen. Do NOT use "." as the target_folder.
        :param local_folder: The folder on the local machine at which the dataset is available. This
            is used only for runs outside of AzureML. If this is empty then the target_folder will be used to
            mount or download the dataset.
@ -109,8 +109,11 @@ class DatasetConfig:
        self.datastore = datastore
        self.version = version
        self.use_mounting = use_mounting
-        self.target_folder = Path(target_folder) if target_folder is not None else None
-        self.local_folder = Path(local_folder) if local_folder is not None else None
+        # If target_folder is "" then convert to None
+        self.target_folder = Path(target_folder) if target_folder else None
+        if str(self.target_folder) == ".":
+            raise ValueError("Can't mount or download a dataset to the current working directory.")
+        self.local_folder = Path(local_folder) if local_folder else None

    def to_input_dataset_local(self, workspace: Optional[Workspace]) -> Tuple[Optional[Path], Optional[MountContext]]:
        """
@ -137,7 +140,6 @@ class DatasetConfig:
        azureml_dataset = get_or_create_dataset(workspace=workspace,
                                                dataset_name=self.name,
                                                datastore_name=self.datastore)
-
        target_path = self.target_folder or Path(tempfile.mkdtemp())
        use_mounting = self.use_mounting if self.use_mounting is not None else False
        if use_mounting:
@ -170,7 +172,9 @@ class DatasetConfig:
                                                dataset_name=self.name,
                                                datastore_name=self.datastore)
        named_input = azureml_dataset.as_named_input(_input_dataset_key(index=dataset_index))
-        path_on_compute = str(self.target_folder) if self.target_folder is not None else None
+        # If running on Windows then self.target_folder may be a WindowsPath; make sure it is
+        # in POSIX format for Azure.
+        path_on_compute = self.target_folder.as_posix() if self.target_folder is not None else None
        use_mounting = False if self.use_mounting is None else self.use_mounting
        if use_mounting:
            status += "mounted at "
@ -202,7 +206,7 @@ class DatasetConfig:
                                          destination=(datastore, self.name + "/"))
        # TODO: Can we get tags into here too?
        dataset = dataset.register_on_complete(name=self.name)
-        if self.target_folder is not None:
+        if self.target_folder:
            raise ValueError("Output datasets can't have a target_folder set.")
        use_mounting = True if self.use_mounting is None else self.use_mounting
        if use_mounting:
--- a/hi-ml-azure/testazure/testazure/test_datasets.py
+++ b/hi-ml-azure/testazure/testazure/test_datasets.py
@ -5,15 +5,18 @@
 """
 Test the data input and output functionality
 """
+from pathlib import Path
 from unittest import mock
-from azureml.exceptions._azureml_exception import UserErrorException
+from health_azure.utils import PathOrString

 import pytest
+from azureml._restclient.exceptions import ServiceException
 from azureml.core import Dataset
 from azureml.data import FileDataset, OutputFileDatasetConfig
 from azureml.data.azure_storage_datastore import AzureBlobDatastore
 from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
-from azureml._restclient.exceptions import ServiceException
+from azureml.exceptions._azureml_exception import UserErrorException
+
 from health_azure.datasets import (DatasetConfig, _input_dataset_key, _output_dataset_key,
                                   _replace_string_datasets, get_datastore, get_or_create_dataset)
 from testazure.util import DEFAULT_DATASTORE, DEFAULT_WORKSPACE
@ -81,6 +84,36 @@ def test_dataset_input() -> None:
    assert aml_dataset.mode == "mount"


+@pytest.mark.parametrize("target_folder", [
+    "",
+    None,
+])
+def test_dataset_input_target_empty(target_folder: PathOrString) -> None:
+    """
+    Leaving the target folder empty should NOT create a path_on_compute that is "."
+    """
+    workspace = DEFAULT_WORKSPACE.workspace
+    # This dataset must exist in the workspace already, or at least in blob storage.
+    dataset_config = DatasetConfig(name="hello_world", datastore=DEFAULT_DATASTORE, target_folder=target_folder)
+    aml_dataset = dataset_config.to_input_dataset(workspace=workspace, dataset_index=1)
+    assert isinstance(aml_dataset, DatasetConsumptionConfig)
+    assert aml_dataset.path_on_compute is None
+
+
+@pytest.mark.parametrize("target_folder", [
+    ".",
+    Path(),
+    Path("."),
+])
+def test_dataset_invalid_target(target_folder: PathOrString) -> None:
+    """
+    Passing in "." as a target_folder should raise an exception.
+    """
+    with pytest.raises(ValueError) as ex:
+        DatasetConfig(name="hello_world", datastore=DEFAULT_DATASTORE, target_folder=target_folder)
+    assert "current working directory" in str(ex)
+
+
 def test_dataset_output() -> None:
    """
    Test turning a dataset setup object to an actual AML output dataset.