Ignore local dataset argument when running inside AML runs (#238)

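This PR modifies `mount_or_download_dataset` so that the `local_dataset` argument is ignored inside AzureML (AML) runs; it is now only used for local (offline) runs. Inside AML, `azure_dataset_id` must be set, otherwise a `ValueError` is raised.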
2020-09-22 17:01:54 +02:00 · 2020-09-22 17:01:54 +02:00 · a112b399fe
--- a/InnerEye/ML/deep_learning_config.py
+++ b/InnerEye/ML/deep_learning_config.py
@ -190,8 +190,7 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
                                              "usually set from the class name.")

    random_seed: int = param.Integer(42, doc="The seed to use for all random number generators.")
-    azure_dataset_id: Optional[str] = param.String(None, allow_None=True,
-                                                   doc="The ID of the dataset to use. This dataset must exist as a "
+    azure_dataset_id: str = param.String(doc="If provided, the ID of the dataset to use. This dataset must exist as a "
                                                       "folder of the same name in the 'datasets' "
                                                       "container in the datasets storage account.")
    local_dataset: Optional[Path] = param.ClassSelector(class_=Path,
--- a/InnerEye/ML/run_ml.py
+++ b/InnerEye/ML/run_ml.py
@ -393,23 +393,27 @@ class MLRunner:
        mounted or downloaded.
        Returns the path of the dataset on the executing machine.
        """
-        local_dataset = self.model_config.local_dataset
-        if local_dataset:
-            expected_dir = Path(local_dataset)
-            if not expected_dir.is_dir():
-                raise FileNotFoundError(f"The model uses a dataset in {expected_dir}, but that does not exist.")
-            logging.info(f"Model training will use the local dataset provided in {expected_dir}")
-            return expected_dir
        azure_dataset_id = self.model_config.azure_dataset_id
-        if not azure_dataset_id:
-            raise ValueError("The model must contain either local_dataset or azure_dataset_id.")
+
        if is_offline_run_context(RUN_CONTEXT):
            # The present run is outside of AzureML: If local_dataset is set, use that as the path to the data.
            # Otherwise, download the dataset specified by the azure_dataset_id
+            local_dataset = self.model_config.local_dataset
+            if (not azure_dataset_id) and (local_dataset is None):
+                raise ValueError("The model must contain either local_dataset or azure_dataset_id.")
+            if local_dataset:
+                expected_dir = Path(local_dataset)
+                if not expected_dir.is_dir():
+                    raise FileNotFoundError(f"The model uses a dataset in {expected_dir}, but that does not exist.")
+                logging.info(f"Model training will use the local dataset provided in {expected_dir}")
+                return expected_dir
            return download_dataset(azure_dataset_id=azure_dataset_id,
                                    target_folder=self.project_root / fixed_paths.DATASETS_DIR_NAME,
                                    azure_config=self.azure_config)
+
        # Inside of AzureML, datasets can be either mounted or downloaded.
+        if not azure_dataset_id:
+            raise ValueError("The model must contain azure_dataset_id for running on AML")
        mounted = try_to_mount_input_dataset(RUN_CONTEXT)
        if not mounted:
            raise ValueError("Unable to mount or download input dataset.")
--- a/Tests/ML/models/architectures/test_image_encoder_with_mlp.py
+++ b/Tests/ML/models/architectures/test_image_encoder_with_mlp.py
@ -61,6 +61,7 @@ class ImageEncoder(ScalarModelBase):
            l_rate=1e-1,
            use_mixed_precision=True,
            aggregation_type=aggregation_type,
+            azure_dataset_id="test-dataset",
            **kwargs
        )
        self.encode_channels_jointly = encode_channels_jointly
--- a/azure-pipelines/build-pr.yml
+++ b/azure-pipelines/build-pr.yml
@ -2,7 +2,7 @@ name: PR-$(Date:yyyyMMdd)$(Rev:-r)
 variables:
  model: 'BasicModel2Epochs'
  train: 'True'
-  more_switches: '--log_level=DEBUG'
+  more_switches: '--log_level=DEBUG --local_dataset=F:\local'
  run_recovery_id: ''
  tags: 'PR'
  user_friendly_name: 'PR build'