From a2c27e19d726826dfdf8c0ea1b2c96c7bc89882a Mon Sep 17 00:00:00 2001
From: Javier
Date: Thu, 3 Dec 2020 10:44:05 +0000
Subject: [PATCH] Remove blobxfer (#330)

* Remove blobxfer
* Update CHANGELOG.md
* Remove configs that are not required
* Remove from environment.yml
* Fix numba issue
* Improve CHANGELOG.md
* Fix tests
* Remove configs that are not required
---
 CHANGELOG.md                                  |   1 +
 InnerEye/Azure/azure_config.py                |  14 --
 InnerEye/Azure/azure_util.py                  |  10 +-
 .../Statistics/report_structure_extremes.py   |  32 +---
 InnerEye/Common/fixed_paths.py                |   2 -
 InnerEye/ML/run_ml.py                         |  55 +-----
 InnerEye/ML/utils/blobxfer_util.py            | 177 ------------------
 InnerEye/settings.yml                         |   1 -
 Tests/Azure/test_parsing.py                   |   3 +-
 Tests/Common/test_commandline_parsing.py      |   7 +-
 Tests/ML/test_download_upload.py              |  43 -----
 environment.yml                               |   2 +-
 12 files changed, 13 insertions(+), 334 deletions(-)
 delete mode 100644 InnerEye/ML/utils/blobxfer_util.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index da971138..507478ba 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,7 @@ folder structure is present irrespective of using InnerEye as a submodule or not
 environment will be contained in the model.
 
 ### Removed
+- Removed blobxfer completely. AzureML datastores for reading datasets make the configs 'datasets_storage_account' and 'datasets_storage_account_key' obsolete; they are no longer supported.
 
 ### Deprecated
 
diff --git a/InnerEye/Azure/azure_config.py b/InnerEye/Azure/azure_config.py
index 468defb0..dc7a0769 100755
--- a/InnerEye/Azure/azure_config.py
+++ b/InnerEye/Azure/azure_config.py
@@ -59,13 +59,6 @@ class AzureConfig(GenericConfig):
     subscription_id: str = param.String(doc="The ID of your Azure subscription.")
     tenant_id: str = param.String(doc="The Azure tenant ID.")
     application_id: str = param.String(doc="Optional: The ID of the Service Principal for authentication to Azure.")
-    datasets_storage_account: str = \
-        param.String(doc="Optional: The blob storage account to use when downloading datasets for use outside of "
-                         "AzureML. This storage account must be the same as the one configured as a 'datastore' "
-                         "in AzureML.")
-    datasets_storage_account_key: str = \
-        param.String(doc="Optional: The access key for the storage account that holds the datasets. "
-                         "This is only used for downloading datasets outside of AzureML.")
     datasets_container: str = param.String(doc="Optional: The blob storage container with the datasets.")
     azureml_datastore: str = param.String(doc="The name of the AzureML datastore that holds the input training data. "
                                               "This must be created manually, and point to a folder inside the "
@@ -196,13 +189,6 @@ class AzureConfig(GenericConfig):
         config.project_root = project_root
         return config
 
-    def get_dataset_storage_account_key(self) -> Optional[str]:
-        """
-        Gets the storage account key for the storage account that holds the dataset.
-        """
-        secrets_handler = SecretsHandling(project_root=self.project_root)
-        return secrets_handler.get_secret_from_environment(fixed_paths.DATASETS_ACCOUNT_KEY, allow_missing=True)
-
     def get_workspace(self) -> Workspace:
         """
         Return a workspace object for an existing Azure Machine Learning Workspace (or default from YAML).
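Note: with blobxfer gone, dataset access runs exclusively through AzureML datastores and datasets. A minimal sketch of the consuming side using only azureml-core calls, mirroring the code path that this patch keeps in `download_dataset` (run_ml.py); the workspace config path and dataset name below are placeholders, not values from this repo:

```python
from azureml.core import Dataset, Workspace

# Resolve the AzureML workspace; "config.json" is a placeholder for however the
# workspace is configured (this repo resolves it via AzureConfig.get_workspace()).
workspace = Workspace.from_config("config.json")

# Look up a dataset registered against the workspace's datastore
# (azureml_datastore in InnerEye/settings.yml). No storage-account key is needed.
dataset = Dataset.get_by_name(workspace, name="my-dataset")  # hypothetical name

# Same call the retained download_dataset() makes for FileDataset instances.
dataset.download(target_path="datasets/my-dataset", overwrite=False)
```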
diff --git a/InnerEye/Azure/azure_util.py b/InnerEye/Azure/azure_util.py
index 0033d801..44869216 100644
--- a/InnerEye/Azure/azure_util.py
+++ b/InnerEye/Azure/azure_util.py
@@ -248,15 +248,6 @@ def get_run_id(run: Optional[Run] = None) -> str:
     return run_context.id
 
 
-def storage_account_from_full_name(full_account_name: str) -> str:
-    """
-    Extracts the actual storage account name from the full name, like "/subscriptions/abc123../something/account_name"
-    :param full_account_name: Full name of account
-    :return: Storage account name
-    """
-    return full_account_name.split("/")[-1]
-
-
 def get_cross_validation_split_index(run: Run) -> int:
     """
     Gets the cross validation index from the run's tags or returns the default
@@ -425,6 +416,7 @@ def is_run_and_child_runs_completed(run: Run) -> bool:
     :param run: The AzureML run to check.
     :return: True if the run and all child runs completed successfully.
     """
+
     def is_completed(run: Run) -> bool:
         status = run.get_status()
         if run.status == RunStatus.COMPLETED:
diff --git a/InnerEye/Common/Statistics/report_structure_extremes.py b/InnerEye/Common/Statistics/report_structure_extremes.py
index f3a4c6cc..10740d04 100644
--- a/InnerEye/Common/Statistics/report_structure_extremes.py
+++ b/InnerEye/Common/Statistics/report_structure_extremes.py
@@ -26,19 +26,16 @@ This means: slices 79 to 107 inclusive in the z direction are missing, i.e. ther
 
 import csv
 import os
-import sys
 from pathlib import Path
 from typing import Dict, Iterator, List, Optional, Set, TextIO, Tuple
 
 import numpy as np
 import param
-from azure.storage.blob import BlockBlobService
 
 from InnerEye.Azure.azure_config import AzureConfig
 from InnerEye.Common import fixed_paths
 from InnerEye.Common.common_util import logging_to_stdout
 from InnerEye.Common.generic_parsing import GenericConfig
-from InnerEye.ML.utils.blobxfer_util import download_blobs
 from InnerEye.ML.utils.io_util import read_image_as_array_with_header
 
 MISSING_SLICE_MARKER = "Ms:"
@@ -78,7 +75,8 @@ def report_structure_extremes(dataset_dir: str, azure_config: AzureConfig) -> No
     :param azure_config: An object with all necessary information for accessing Azure.
     :param dataset_dir: directory containing subject subdirectories with integer names.
     """
-    download_dataset_directory(azure_config, dataset_dir)
+    if not os.path.isdir(dataset_dir):
+        raise ValueError(f"Invalid path: {dataset_dir}")
     subjects: Set[int] = set()
     series_map = None
     institution_map = None
@@ -122,37 +120,11 @@ def report_structure_extremes(dataset_dir: str, azure_config: AzureConfig) -> No
             if index % 25 == 0:
                 print(f"Processed {index} subjects")
     print(f"Processed all {len(subjects)} subjects")
-    upload_to_dataset_directory(azure_config, dataset_dir, files_created)
     # If we found any structures with missing slices, raise an exception, which should be
     # uncaught where necessary to make any appropriate build step fail.
     if n_missing > 0:
         raise ValueError(f"Found {n_missing} structures with missing slices")
 
-
-def download_dataset_directory(azure_config: AzureConfig, dataset_dir: str) -> bool:
-    if os.path.isdir(dataset_dir):
-        return False
-    account_key = azure_config.get_dataset_storage_account_key()
-    blobs_root_path = os.path.join(azure_config.datasets_container, os.path.basename(dataset_dir)) + "/"
-    sys.stdout.write(f"Downloading data to {dataset_dir} ...")
-    assert account_key is not None  # for mypy
-    download_blobs(azure_config.datasets_storage_account, account_key, blobs_root_path, Path(dataset_dir))
-    sys.stdout.write("done\n")
-    return True
-
-
-def upload_to_dataset_directory(azure_config: AzureConfig, dataset_dir: str, files: Set[str]) -> None:
-    if not files:
-        return
-    account_key = azure_config.get_dataset_storage_account_key()
-    block_blob_service = BlockBlobService(account_name=azure_config.datasets_storage_account, account_key=account_key)
-    container_name = os.path.join(azure_config.datasets_container, os.path.basename(dataset_dir))
-    for path in files:
-        blob_name = path[len(dataset_dir) + 1:]
-        block_blob_service.create_blob_from_path(container_name, blob_name, path)
-        print(f"Uploaded {path} to {azure_config.datasets_storage_account}:{container_name}/{blob_name}")
-
-
 def report_structure_extremes_for_subject(subj_dir: str, series_id: str) -> Iterator[str]:
     """
     :param subj_dir: subject directory, containing .nii.gz files
diff --git a/InnerEye/Common/fixed_paths.py b/InnerEye/Common/fixed_paths.py
index 5587fe3c..6f0ee01d 100755
--- a/InnerEye/Common/fixed_paths.py
+++ b/InnerEye/Common/fixed_paths.py
@@ -54,8 +54,6 @@ PRIVATE_SETTINGS_FILE = "InnerEyePrivateSettings.yml"
 # Names of secrets stored as environment variables or in the PROJECT_SECRETS_FILE:
 # Secret for the Service Principal
 SERVICE_PRINCIPAL_KEY = "APPLICATION_KEY"
-# The access key for the Azure storage account that holds the datasets.
-DATASETS_ACCOUNT_KEY = "DATASETS_ACCOUNT_KEY"
 
 INNEREYE_PACKAGE_ROOT = repository_root_directory(INNEREYE_PACKAGE_NAME)
 SETTINGS_YAML_FILE_NAME = "settings.yml"
diff --git a/InnerEye/ML/run_ml.py b/InnerEye/ML/run_ml.py
index eba76c55..644de2ea 100644
--- a/InnerEye/ML/run_ml.py
+++ b/InnerEye/ML/run_ml.py
@@ -5,6 +5,7 @@
 import copy
 import logging
 import shutil
+import time
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -25,7 +26,7 @@ from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, \
 from InnerEye.Common import fixed_paths
 from InnerEye.Common.build_config import ExperimentResultLocation, build_information_to_dot_net_json_file
 from InnerEye.Common.common_util import ModelProcessing, is_windows, logging_section, print_exception
-from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME, PROJECT_SECRETS_FILE
+from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME
 from InnerEye.ML.common import DATASET_CSV_FILE_NAME, ModelExecutionMode
 from InnerEye.ML.config import SegmentationModelBase
 from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER, \
@@ -38,7 +39,6 @@ from InnerEye.ML.model_training import model_train
 from InnerEye.ML.runner import ModelDeploymentHookSignature, Runner, get_all_environment_files
 from InnerEye.ML.scalar_config import ScalarModelBase
 from InnerEye.ML.utils import ml_util
-from InnerEye.ML.utils.blobxfer_util import download_blobs
 from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
 from InnerEye.ML.utils.ml_util import make_pytorch_reproducible
 from InnerEye.ML.visualizers import activation_maps
@@ -61,45 +61,6 @@ def try_to_mount_input_dataset(run_context: Any) -> Optional[Path]:
     return None
 
 
-def download_dataset_via_blobxfer(dataset_id: str,
-                                  azure_config: AzureConfig,
-                                  target_folder: Path) -> Optional[Path]:
-    """
-    Attempts to downloads a dataset from the Azure storage account for datasets, with download happening via
-    blobxfer. This is only possible if the datasets storage account and keyword are present in the `azure_config`.
-    The function returns None if the required settings were not present.
-    :param dataset_id: The folder of the dataset, expected in the container given by azure_config.datasets_container.
-    :param azure_config: The object with all Azure-related settings.
-    :param target_folder: The local folder into which the dataset should be downloaded.
-    :return: The folder that contains the downloaded dataset. Returns None if the datasets account name or password
-        were not present.
- """ - datasets_account_key = azure_config.get_dataset_storage_account_key() - if not datasets_account_key: - logging.info("No account key for the dataset storage account was found.") - logging.info(f"We checked in environment variables and in the file {PROJECT_SECRETS_FILE}") - return None - if (not azure_config.datasets_container) or (not azure_config.datasets_storage_account): - logging.info("Datasets storage account or container missing.") - return None - target_folder.mkdir(exist_ok=True) - result_folder = target_folder / dataset_id - # only download if hasn't already been downloaded - if result_folder.is_dir(): - logging.info(f"Folder already exists, skipping download: {result_folder}") - return result_folder - with logging_section(f"Downloading dataset {dataset_id}"): - download_blobs( - account=azure_config.datasets_storage_account, - account_key=datasets_account_key, - # When specifying the blobs root path, ensure that there is a slash at the end, otherwise - # all datasets with that dataset_id as a prefix get downloaded. - blobs_root_path=f"{azure_config.datasets_container}/{dataset_id}/", - destination=result_folder - ) - return result_folder - - def download_dataset(azure_dataset_id: str, target_folder: Path, azure_config: AzureConfig) -> Path: @@ -109,20 +70,11 @@ def download_dataset(azure_dataset_id: str, AzureML dataset attached to the given AzureML workspace. The dataset is downloaded into the `target_folder`, in a subfolder that has the same name as the dataset. If there already appears to be such a folder, and the folder contains a dataset.csv file, no download is started. - :param local_dataset: The path to an existing local dataset. :param azure_dataset_id: The name of a dataset that is registered in the AzureML workspace. :param target_folder: The folder in which to download the dataset from Azure. :param azure_config: All Azure-related configuration options. :return: A path on the local machine that contains the dataset. """ - try: - downloaded_via_blobxfer = download_dataset_via_blobxfer(dataset_id=azure_dataset_id, - azure_config=azure_config, - target_folder=target_folder) - if downloaded_via_blobxfer: - return downloaded_via_blobxfer - except Exception as ex: - print_exception(ex, message="Unable to download dataset via blobxfer.") logging.info("Trying to download dataset via AzureML datastore now.") azure_dataset = get_or_create_dataset(azure_config, azure_dataset_id) if not isinstance(azure_dataset, FileDataset): @@ -136,7 +88,10 @@ def download_dataset(azure_dataset_id: str, return expected_dataset_path logging.info("Starting to download the dataset - WARNING, this could take very long!") with logging_section("Downloading dataset"): + t0 = time.perf_counter() azure_dataset.download(target_path=str(expected_dataset_path), overwrite=False) + t1 = time.perf_counter() - t0 + logging.info(f"Azure dataset '{azure_dataset_id}' downloaded in {t1} seconds") logging.info(f"Azure dataset '{azure_dataset_id}' is now available in {expected_dataset_path}") return expected_dataset_path diff --git a/InnerEye/ML/utils/blobxfer_util.py b/InnerEye/ML/utils/blobxfer_util.py deleted file mode 100644 index d5bb716d..00000000 --- a/InnerEye/ML/utils/blobxfer_util.py +++ /dev/null @@ -1,177 +0,0 @@ -# ------------------------------------------------------------------------------------------ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
-# ------------------------------------------------------------------------------------------
-from __future__ import annotations
-
-import logging
-import time
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Optional
-
-import blobxfer
-import blobxfer.models.azure as azmodels
-from blobxfer.api import AzureStorageCredentials, ConcurrencyOptions, DownloadOptions, GeneralOptions, SkipOnOptions, \
-    UploadOptions
-from blobxfer.models.options import FileProperties, Timeout, VectoredIo
-from blobxfer.models.upload import VectoredIoDistributionMode
-
-from InnerEye.Azure.azure_util import storage_account_from_full_name, to_azure_friendly_container_path
-
-# Azure storage is extremely talkative, printing out each client request (thousands of them)
-logger = logging.getLogger('azure.storage')
-logger.setLevel(logging.WARNING)
-# Blobxfer also prints at single line per file at least.
-logger = logging.getLogger('blobxfer')
-logger.setLevel(logging.WARNING)
-
-
-@dataclass
-class BlobXFerConfig:
-    """
-    Class to hold Bloxfer configurations and helpers functions to download and
-    upload functions to Azure.
-    """
-    account_name: str
-    account_key: str
-    concurrency: ConcurrencyOptions
-    timeout: Timeout
-    general: GeneralOptions
-    file_properties: FileProperties
-    skipon_options: SkipOnOptions
-
-    @staticmethod
-    def create_default(account: str, account_key: str) -> BlobXFerConfig:
-        """
-        Returns the default configuration.
-
-        :param account: Name of the Azure storage account
-        :param account_key: Key to this storage account
-        :return: default Blobxferconfig
-        """
-        concurrency = ConcurrencyOptions(crypto_processes=2, md5_processes=2, disk_threads=16,
-                                         transfer_threads=16, action=1)
-        timeout = Timeout(connect=20, read=60, max_retries=3)
-        general = GeneralOptions(concurrency, progress_bar=False, verbose=False, timeout=timeout, quiet=True)
-        file_properties = FileProperties(attributes=False, cache_control=None, content_type=None,
-                                         lmt=False, md5=None)
-        skipon_options = SkipOnOptions(filesize_match=True, lmt_ge=True, md5_match=True)
-        return BlobXFerConfig(
-            account_name=storage_account_from_full_name(account),
-            account_key=account_key,
-            concurrency=concurrency,
-            timeout=timeout,
-            general=general,
-            file_properties=file_properties,
-            skipon_options=skipon_options
-        )
-
-    def get_download_options(self, num_folders_to_strip: int = 0) -> DownloadOptions:
-        """
-        Returns a BloxFer DownloadOptions object.
-
-        :param num_folders_to_strip: The filenames will be stripped off their leading directories, up to this level e.g.
-        if original path is 'container/foo/1.txt' and number_folders_to_strip is 2, and destination folder is 'bar',
-        the downloaded file will be 'bar/1.txt'
-        """
-        return DownloadOptions(check_file_md5=True,
-                               chunk_size_bytes=4194304,
-                               delete_extraneous_destination=False,
-                               delete_only=False,
-                               max_single_object_concurrency=8,
-                               mode=azmodels.StorageModes.Auto,
-                               overwrite=True,
-                               recursive=True,
-                               rename=False,
-                               restore_file_properties=self.file_properties,
-                               rsa_private_key=None,
-                               strip_components=num_folders_to_strip)
-
-    def get_upload_options(self, num_folders_to_strip: int = 0) -> UploadOptions:
-        """
-        Returns a UploadOptions object.
-
-        :param num_folders_to_strip: The filenames will be stripped off their leading directories, up to this level e.g.
-        if original path is 'container/foo/1.txt' and number_folders_to_strip is 2, and destination folder is 'bar',
-        the downloaded file will be 'bar/1.txt'
-        """
-        return UploadOptions(
-            access_tier=None,
-            one_shot_bytes=33554432,
-            rsa_public_key=None,
-            stdin_as_page_blob_size=0,
-            store_file_properties=self.file_properties,
-            vectored_io=VectoredIo(
-                stripe_chunk_size_bytes=0,
-                distribution_mode=VectoredIoDistributionMode.Disabled
-            ),
-            chunk_size_bytes=4194304,
-            delete_extraneous_destination=False,
-            delete_only=False,
-            mode=azmodels.StorageModes.Auto,
-            overwrite=True,
-            recursive=True,
-            rename=False,
-            strip_components=num_folders_to_strip
-        )
-
-    def get_credentials(self) -> AzureStorageCredentials:
-        credentials = AzureStorageCredentials(self.general)
-        credentials.add_storage_account(self.account_name, self.account_key, endpoint="core.windows.net")
-        return credentials
-
-
-def download_blobs(account: str, account_key: str, blobs_root_path: str, destination: Path,
-                   is_file: bool = False, config: Optional[BlobXFerConfig] = None) -> Path:
-    """
-    Download a given set of files in Azure blob storage to the local destination path, via blobxfer.
-    :param account: The name of the storage account to access the files.
-    :param account_key: The key for the storage account.
-    :param blobs_root_path: The path of the files that should be downloaded. This must be in format
-    'container/file_prefix/', ending with a slash (will be added if not provided and is_file is False).
-    :param destination: The destination folder for the copied files on the local machine.
-    :param is_file: If True then only a single file is required to be downloaded
-    :param config: BlobXFerConfig to use for download configuration, use default presets if None.
-    The filenames will be stripped off their leading directories, up to the level given by blobs_root_path.
-    For example, if blobs_root_path is 'container/foo/'
-    and contains a file 'container/foo/1.txt', and destination is 'bar', the downloaded file will be 'bar/1.txt'
-    """
-    if not config:
-        config = BlobXFerConfig.create_default(account=account, account_key=account_key)
-
-    start_time = time.time()
-    # the account name can be an Azure Resource ID so extract the name from it if this is the case
-    logging.info(f"Downloading '{blobs_root_path}' from storage account {config.account_name} to {destination}")
-    blobs_root_path = to_azure_friendly_container_path(Path(blobs_root_path))
-    if not (blobs_root_path.endswith("/") or is_file):
-        blobs_root_path += "/"
-    blobs_root_path_dirs = blobs_root_path.rstrip("/").split("/")
-    num_folders_to_strip = len(blobs_root_path_dirs) - 1
-
-    blobs_path_without_container = "/".join(blobs_root_path_dirs[1:])
-    logging.info(f"Cleaned download path: '{blobs_root_path}' from storage account {config.account_name}")
-
-    download = config.get_download_options(num_folders_to_strip)
-    local_path = blobxfer.api.LocalDestinationPath(str(destination))
-    # noinspection PyTypeChecker
-    download_spec = blobxfer.api.DownloadSpecification(download, config.skipon_options, local_path)
-    source = blobxfer.api.AzureSourcePath()
-    source.add_path_with_storage_account(blobs_root_path, config.account_name)
-    if not is_file:
-        source.add_includes([f"{blobs_path_without_container}/*"])
-    download_spec.add_azure_source_path(source)
-    # noinspection PyTypeChecker
-    downloader = blobxfer.api.Downloader(config.general, config.get_credentials(), download_spec)
-    downloader.start()
-    elapsed = time.time() - start_time
-    logging.info(f"Finished downloading in {elapsed:0.2f}sec.")
-
-    if is_file:
-        destination = destination / Path(blobs_root_path).name
-        if destination.exists():
-            return destination
-        raise ValueError(f"Unable to download {blobs_root_path} from "
-                         f"storage account {config.account_name} to {destination}")
-    else:
-        return destination
diff --git a/InnerEye/settings.yml b/InnerEye/settings.yml
index 7d0143cf..05080f37 100644
--- a/InnerEye/settings.yml
+++ b/InnerEye/settings.yml
@@ -2,7 +2,6 @@ variables:
   tenant_id: '72f988bf-86f1-41af-91ab-2d7cd011db47'
   subscription_id: ''
   application_id: ''
-  datasets_storage_account: 'innereyepublicdatasets'
   datasets_container: 'datasets'
   azureml_datastore: 'innereyedatasets'
   resource_group: 'InnerEye-DeepLearning'
diff --git a/Tests/Azure/test_parsing.py b/Tests/Azure/test_parsing.py
index ccc5c2d9..b9d1b2ba 100644
--- a/Tests/Azure/test_parsing.py
+++ b/Tests/Azure/test_parsing.py
@@ -86,7 +86,7 @@ def test_create_runner_parser(with_config: bool) -> None:
     azure_parser = create_runner_parser(SegmentationModelBase if with_config else None)
     args_list = ["--model=Lung", "--train=False", "--l_rate=100.0",
                  "--unknown=1", "--subscription_id", "Test1", "--tenant_id=Test2",
-                 "--application_id", "Test3", "--datasets_storage_account=Test4",
+                 "--application_id", "Test3",
                  "--log_level=INFO",
                  # Normally we don't use extra index URLs in InnerEye, hence this won't be set in YAML.
                  "--pip_extra_index_url=foo"]
@@ -96,7 +96,6 @@ def test_create_runner_parser(with_config: bool) -> None:
     # These values have been set on the commandline, to values that are not the parser defaults.
     non_default_args = {
-        "datasets_storage_account": "Test4",
         "train": False,
         "model": "Lung",
         "subscription_id": "Test1",
diff --git a/Tests/Common/test_commandline_parsing.py b/Tests/Common/test_commandline_parsing.py
index 7ece0f3a..431b34d3 100644
--- a/Tests/Common/test_commandline_parsing.py
+++ b/Tests/Common/test_commandline_parsing.py
@@ -34,7 +34,7 @@ def test_create_ml_runner_args(is_default_namespace: bool,
     args_list = [f"--model={model_name}", "--train=True", "--l_rate=100.0",
                  "--norm_method=Simple Norm",
                  "--subscription_id", "Test1", "--tenant_id=Test2",
-                 "--application_id", "Test3", "--datasets_storage_account=Test4", "--datasets_container", "Test5",
+                 "--application_id", "Test3", "--datasets_container", "Test5",
                  "--pytest_mark", "gpu", f"--output_to={outputs_folder}"]
     if not is_default_namespace:
         args_list.append(f"--model_configs_namespace={model_configs_namespace}")
@@ -45,7 +45,6 @@ def test_create_ml_runner_args(is_default_namespace: bool,
     runner.parse_and_load_model()
     azure_config = runner.azure_config
     model_config = runner.model_config
-    assert azure_config.datasets_storage_account == "Test4"
     assert azure_config.model == model_name
     assert model_config.l_rate == 100.0
     assert model_config.norm_method == PhotometricNormalizationMethod.SimpleNorm
@@ -60,7 +59,7 @@ def test_create_ml_runner_args(is_default_namespace: bool,
     assert model_config.outputs_folder == (project_root / DEFAULT_AML_UPLOAD_DIR)
     assert model_config.logs_folder == (project_root / DEFAULT_LOGS_DIR_NAME)
 
-    assert not hasattr(model_config, "datasets_storage_account")
+    assert not hasattr(model_config, "datasets_container")
     assert azure_config.pytest_mark == "gpu"
 
 
@@ -126,7 +125,6 @@ def test_parsing_with_custom_yaml(test_output_dirs: OutputFolderForTests) -> Non
     yaml_file = test_output_dirs.root_dir / "custom.yml"
     yaml_file.write_text("""variables:
   tenant_id: 'foo'
-  datasets_storage_account: 'account'
   start_epoch: 7
   random_seed: 1
 """)
@@ -143,7 +141,6 @@ def test_parsing_with_custom_yaml(test_output_dirs: OutputFolderForTests) -> Non
     assert loader_result is not None
     assert runner.azure_config is not None
     # This is only present in yaml
-    assert runner.azure_config.datasets_storage_account == "account"
     # This is present in yaml and command line, and the latter should be used.
     assert runner.azure_config.tenant_id == "bar"
     # Settings in model config: start_epoch is only in yaml
diff --git a/Tests/ML/test_download_upload.py b/Tests/ML/test_download_upload.py
index bd6c1731..d399796e 100644
--- a/Tests/ML/test_download_upload.py
+++ b/Tests/ML/test_download_upload.py
@@ -12,11 +12,9 @@ from InnerEye.Azure.azure_util import fetch_child_runs, fetch_run, get_results_b
 from InnerEye.Common import common_util, fixed_paths
 from InnerEye.Common.common_util import OTHER_RUNS_SUBDIR_NAME, logging_section, logging_to_stdout
 from InnerEye.Common.output_directories import OutputFolderForTests
-from InnerEye.ML import run_ml
 from InnerEye.ML.common import CHECKPOINT_FILE_SUFFIX, DATASET_CSV_FILE_NAME
 from InnerEye.ML.model_config_base import ModelConfigBase
 from InnerEye.ML.run_ml import MLRunner
-from InnerEye.ML.utils.blobxfer_util import download_blobs
 from InnerEye.ML.utils.run_recovery import RunRecovery
 from Tests.Common.test_util import DEFAULT_ENSEMBLE_RUN_RECOVERY_ID, DEFAULT_RUN_RECOVERY_ID
 from Tests.ML.util import get_default_azure_config
@@ -128,44 +126,3 @@ def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> Non
     for file in ["ct", "esophagus", "heart", "lung_l", "lung_r", "spinalcord"]:
         f = (sub_folder / file).with_suffix(".nii.gz")
         assert f.is_file()
-
-
-def test_download_dataset_via_blobxfer(test_output_dirs: OutputFolderForTests) -> None:
-    azure_config = get_default_azure_config()
-    result_path = run_ml.download_dataset_via_blobxfer(dataset_id="test-dataset",
-                                                       azure_config=azure_config,
-                                                       target_folder=test_output_dirs.root_dir)
-    assert result_path
-    assert result_path.is_dir()
-    dataset_csv = Path(result_path) / DATASET_CSV_FILE_NAME
-    assert dataset_csv.exists()
-
-
-@pytest.mark.parametrize("is_file", [True, False])
-def test_download_blobxfer(test_output_dirs: OutputFolderForTests, is_file: bool, runner_config: AzureConfig) -> None:
-    """
-    Test for a bug in early versions of download_blobs: download is happening via prefixes, but because of
-    stripping leading directory names, blobs got overwritten.
-    """
-    root = test_output_dirs.root_dir
-    account_key = runner_config.get_dataset_storage_account_key()
-    assert account_key is not None
-    # Expected test data in Azure blobs:
-    # folder1/folder1.txt with content "folder1.txt"
-    # folder1_with_suffix/folder2.txt with content "folder2.txt"
-    # folder1_with_suffix/folder1.txt with content "this comes from folder2"
-    # with bug present, folder1_with_suffix/folder1.txt will overwrite folder1/folder1.txt
-    blobs_root_path = "data-for-testsuite/folder1"
-    if is_file:
-        blobs_root_path += "/folder1.txt"
-    download_blobs(runner_config.datasets_storage_account, account_key, blobs_root_path, root, is_file)
-
-    folder1 = root / "folder1.txt"
-    assert folder1.exists()
-    if not is_file:
-        otherfile = root / "otherfile.txt"
-        folder2 = root / "folder2.txt"
-        assert folder1.read_text().strip() == "folder1.txt"
-        assert otherfile.exists()
-        assert otherfile.read_text().strip() == "folder1.txt"
-        assert not folder2.exists()
diff --git a/environment.yml b/environment.yml
index d91bafd4..fc309c3d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -14,7 +14,6 @@ dependencies:
       - azureml-mlflow==1.17.0
       - azureml-sdk==1.17.0
       - azureml-tensorboard==1.17.0
-      - blobxfer==1.9.4
       - conda-merge==0.1.5
       - dataclasses-json==0.5.2
       - flake8==3.8.3
@@ -29,6 +28,7 @@ dependencies:
       - mypy==0.770
       - mypy-extensions==0.4.3
       - numpy==1.19.1
+      - numba==0.51.2
       - pandas==1.1.0
       - papermill==2.2.2
       - param==1.9.3
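Note: with 'datasets_storage_account' and 'datasets_storage_account_key' removed, the one-time setup that replaces them is registering the blob container as an AzureML datastore and registering dataset folders against it. A hedged sketch with azureml-core: the account, container, and datastore names are the values deleted from InnerEye/settings.yml above, while the account key and dataset folder are placeholders:

```python
from azureml.core import Dataset, Workspace
from azureml.core.datastore import Datastore

workspace = Workspace.from_config()

# One-time registration by an administrator: afterwards, runs read data through
# the datastore and never handle the storage-account key (previously supplied
# via the now-removed DATASETS_ACCOUNT_KEY secret).
datastore = Datastore.register_azure_blob_container(
    workspace=workspace,
    datastore_name="innereyedatasets",        # azureml_datastore in settings.yml
    container_name="datasets",                # datasets_container in settings.yml
    account_name="innereyepublicdatasets",    # the removed datasets_storage_account
    account_key="<storage-account-key>")      # placeholder, kept out of settings.yml

# Expose one folder of the container as a named FileDataset, so that
# download_dataset() in run_ml.py can resolve it by name.
dataset = Dataset.File.from_files(path=(datastore, "my-dataset/**"))  # hypothetical folder
dataset.register(workspace=workspace, name="my-dataset")
```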