* Remove blobxfer

* Update CHANGELOG.md

* Remove configs that are not required

* Remove from environment.yml

* Fix numba issue

* Improve CHANGELOG.md

* Fix tests

* Remove configs that are not required
Javier 2020-12-03 10:44:05 +00:00 committed by GitHub
Parent 014c74e34f
Commit a2c27e19d7
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 13 additions and 334 deletions

View file

@@ -24,6 +24,7 @@ folder structure is present irrespective of using InnerEye as a submodule or not
environment will be contained in the model.
### Removed
- Removed blobxfer completely. AzureML Data-stores for reading datasets make the following configs obsolete: 'datasets_storage_account' and 'datasets_storage_account_key'; they are no longer supported.
### Deprecated

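The CHANGELOG entry above replaces the blobxfer storage-account settings with AzureML data-stores. A minimal sketch of the new access pattern, assuming the azureml-core SDK; the datastore name comes from settings.yml below, while the dataset folder and target path are placeholders:

from azureml.core import Dataset, Datastore, Workspace

# Credentials live in the workspace and its registered datastore, so no storage
# account name or key is needed here.
workspace = Workspace.from_config()                       # reads the local config.json
datastore = Datastore.get(workspace, "innereyedatasets")  # the 'azureml_datastore' setting
dataset = Dataset.File.from_files(path=(datastore, "my_dataset/"))  # placeholder folder
dataset.download(target_path="/tmp/my_dataset", overwrite=False)
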
View file

@@ -59,13 +59,6 @@ class AzureConfig(GenericConfig):
subscription_id: str = param.String(doc="The ID of your Azure subscription.")
tenant_id: str = param.String(doc="The Azure tenant ID.")
application_id: str = param.String(doc="Optional: The ID of the Service Principal for authentication to Azure.")
datasets_storage_account: str = \
param.String(doc="Optional: The blob storage account to use when downloading datasets for use outside of "
"AzureML. This storage account must be the same as the one configured as a 'datastore' "
"in AzureML.")
datasets_storage_account_key: str = \
param.String(doc="Optional: The access key for the storage account that holds the datasets. "
"This is only used for downloading datasets outside of AzureML.")
datasets_container: str = param.String(doc="Optional: The blob storage container with the datasets.")
azureml_datastore: str = param.String(doc="The name of the AzureML datastore that holds the input training data. "
"This must be created manually, and point to a folder inside the "
@@ -196,13 +189,6 @@ class AzureConfig(GenericConfig):
config.project_root = project_root
return config
def get_dataset_storage_account_key(self) -> Optional[str]:
"""
Gets the storage account key for the storage account that holds the dataset.
"""
secrets_handler = SecretsHandling(project_root=self.project_root)
return secrets_handler.get_secret_from_environment(fixed_paths.DATASETS_ACCOUNT_KEY, allow_missing=True)
def get_workspace(self) -> Workspace:
"""
Return a workspace object for an existing Azure Machine Learning Workspace (or default from YAML).

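The surviving 'azureml_datastore' setting above must name a datastore that was created manually. A hedged sketch of that one-off registration with azureml-core; the account name and key are placeholders:

from azureml.core import Datastore, Workspace

workspace = Workspace.from_config()
# After this one-off registration, training runs only refer to the datastore by name.
Datastore.register_azure_blob_container(
    workspace=workspace,
    datastore_name="innereyedatasets",    # matches 'azureml_datastore' in settings.yml
    container_name="datasets",            # matches 'datasets_container'
    account_name="<storage-account>",     # placeholder
    account_key="<storage-account-key>",  # placeholder; a SAS token can be used instead
)
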
View file

@@ -248,15 +248,6 @@ def get_run_id(run: Optional[Run] = None) -> str:
return run_context.id
def storage_account_from_full_name(full_account_name: str) -> str:
"""
Extracts the actual storage account name from the full name, like "/subscriptions/abc123../something/account_name"
:param full_account_name: Full name of account
:return: Storage account name
"""
return full_account_name.split("/")[-1]
def get_cross_validation_split_index(run: Run) -> int:
"""
Gets the cross validation index from the run's tags or returns the default
@@ -425,6 +416,7 @@ def is_run_and_child_runs_completed(run: Run) -> bool:
:param run: The AzureML run to check.
:return: True if the run and all child runs completed successfully.
"""
def is_completed(run: Run) -> bool:
status = run.get_status()
if run.status == RunStatus.COMPLETED:

View file

@@ -26,19 +26,16 @@ This means: slices 79 to 107 inclusive in the z direction are missing, i.e. ther
import csv
import os
import sys
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Set, TextIO, Tuple
import numpy as np
import param
from azure.storage.blob import BlockBlobService
from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import logging_to_stdout
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.ML.utils.blobxfer_util import download_blobs
from InnerEye.ML.utils.io_util import read_image_as_array_with_header
MISSING_SLICE_MARKER = "Ms:"
@@ -78,7 +75,8 @@ def report_structure_extremes(dataset_dir: str, azure_config: AzureConfig) -> No
:param azure_config: An object with all necessary information for accessing Azure.
:param dataset_dir: directory containing subject subdirectories with integer names.
"""
download_dataset_directory(azure_config, dataset_dir)
if not os.path.isdir(dataset_dir):
raise ValueError(f"Invalid path: {dataset_dir}")
subjects: Set[int] = set()
series_map = None
institution_map = None
@@ -122,37 +120,11 @@ def report_structure_extremes(dataset_dir: str, azure_config: AzureConfig) -> No
if index % 25 == 0:
print(f"Processed {index} subjects")
print(f"Processed all {len(subjects)} subjects")
upload_to_dataset_directory(azure_config, dataset_dir, files_created)
# If we found any structures with missing slices, raise an exception, which should be
# uncaught where necessary to make any appropriate build step fail.
if n_missing > 0:
raise ValueError(f"Found {n_missing} structures with missing slices")
def download_dataset_directory(azure_config: AzureConfig, dataset_dir: str) -> bool:
if os.path.isdir(dataset_dir):
return False
account_key = azure_config.get_dataset_storage_account_key()
blobs_root_path = os.path.join(azure_config.datasets_container, os.path.basename(dataset_dir)) + "/"
sys.stdout.write(f"Downloading data to {dataset_dir} ...")
assert account_key is not None # for mypy
download_blobs(azure_config.datasets_storage_account, account_key, blobs_root_path, Path(dataset_dir))
sys.stdout.write("done\n")
return True
def upload_to_dataset_directory(azure_config: AzureConfig, dataset_dir: str, files: Set[str]) -> None:
if not files:
return
account_key = azure_config.get_dataset_storage_account_key()
block_blob_service = BlockBlobService(account_name=azure_config.datasets_storage_account, account_key=account_key)
container_name = os.path.join(azure_config.datasets_container, os.path.basename(dataset_dir))
for path in files:
blob_name = path[len(dataset_dir) + 1:]
block_blob_service.create_blob_from_path(container_name, blob_name, path)
print(f"Uploaded {path} to {azure_config.datasets_storage_account}:{container_name}/{blob_name}")
def report_structure_extremes_for_subject(subj_dir: str, series_id: str) -> Iterator[str]:
"""
:param subj_dir: subject directory, containing <structure>.nii.gz files

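Since the blobxfer download/upload helpers are gone, report_structure_extremes now expects the dataset folder to exist on local disk. A usage sketch, assuming it runs in the same module as the function above and that the folder was fetched beforehand (for example with the AzureML dataset download sketched earlier); the path is a placeholder:

from InnerEye.Azure.azure_config import AzureConfig

# An AzureConfig with defaults is enough here: the function no longer touches blob storage.
azure_config = AzureConfig()
dataset_dir = "/data/my_dataset"  # placeholder; must already exist, otherwise ValueError is raised
report_structure_extremes(dataset_dir=dataset_dir, azure_config=azure_config)
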
View file

@@ -54,8 +54,6 @@ PRIVATE_SETTINGS_FILE = "InnerEyePrivateSettings.yml"
# Names of secrets stored as environment variables or in the PROJECT_SECRETS_FILE:
# Secret for the Service Principal
SERVICE_PRINCIPAL_KEY = "APPLICATION_KEY"
# The access key for the Azure storage account that holds the datasets.
DATASETS_ACCOUNT_KEY = "DATASETS_ACCOUNT_KEY"
INNEREYE_PACKAGE_ROOT = repository_root_directory(INNEREYE_PACKAGE_NAME)
SETTINGS_YAML_FILE_NAME = "settings.yml"

View file

@@ -5,6 +5,7 @@
import copy
import logging
import shutil
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
@@ -25,7 +26,7 @@ from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, \
from InnerEye.Common import fixed_paths
from InnerEye.Common.build_config import ExperimentResultLocation, build_information_to_dot_net_json_file
from InnerEye.Common.common_util import ModelProcessing, is_windows, logging_section, print_exception
from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME, PROJECT_SECRETS_FILE
from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME
from InnerEye.ML.common import DATASET_CSV_FILE_NAME, ModelExecutionMode
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER, \
@@ -38,7 +39,6 @@ from InnerEye.ML.model_training import model_train
from InnerEye.ML.runner import ModelDeploymentHookSignature, Runner, get_all_environment_files
from InnerEye.ML.scalar_config import ScalarModelBase
from InnerEye.ML.utils import ml_util
from InnerEye.ML.utils.blobxfer_util import download_blobs
from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
from InnerEye.ML.utils.ml_util import make_pytorch_reproducible
from InnerEye.ML.visualizers import activation_maps
@@ -61,45 +61,6 @@ def try_to_mount_input_dataset(run_context: Any) -> Optional[Path]:
return None
def download_dataset_via_blobxfer(dataset_id: str,
azure_config: AzureConfig,
target_folder: Path) -> Optional[Path]:
"""
Attempts to download a dataset from the Azure storage account for datasets, with download happening via
blobxfer. This is only possible if the datasets storage account and key are present in the `azure_config`.
The function returns None if the required settings were not present.
:param dataset_id: The folder of the dataset, expected in the container given by azure_config.datasets_container.
:param azure_config: The object with all Azure-related settings.
:param target_folder: The local folder into which the dataset should be downloaded.
:return: The folder that contains the downloaded dataset. Returns None if the datasets account name or password
were not present.
"""
datasets_account_key = azure_config.get_dataset_storage_account_key()
if not datasets_account_key:
logging.info("No account key for the dataset storage account was found.")
logging.info(f"We checked in environment variables and in the file {PROJECT_SECRETS_FILE}")
return None
if (not azure_config.datasets_container) or (not azure_config.datasets_storage_account):
logging.info("Datasets storage account or container missing.")
return None
target_folder.mkdir(exist_ok=True)
result_folder = target_folder / dataset_id
# only download if hasn't already been downloaded
if result_folder.is_dir():
logging.info(f"Folder already exists, skipping download: {result_folder}")
return result_folder
with logging_section(f"Downloading dataset {dataset_id}"):
download_blobs(
account=azure_config.datasets_storage_account,
account_key=datasets_account_key,
# When specifying the blobs root path, ensure that there is a slash at the end, otherwise
# all datasets with that dataset_id as a prefix get downloaded.
blobs_root_path=f"{azure_config.datasets_container}/{dataset_id}/",
destination=result_folder
)
return result_folder
def download_dataset(azure_dataset_id: str,
target_folder: Path,
azure_config: AzureConfig) -> Path:
@@ -109,20 +70,11 @@ def download_dataset(azure_dataset_id: str,
AzureML dataset attached to the given AzureML workspace. The dataset is downloaded into the `target_folder`,
in a subfolder that has the same name as the dataset. If there already appears to be such a folder, and the folder
contains a dataset.csv file, no download is started.
:param local_dataset: The path to an existing local dataset.
:param azure_dataset_id: The name of a dataset that is registered in the AzureML workspace.
:param target_folder: The folder in which to download the dataset from Azure.
:param azure_config: All Azure-related configuration options.
:return: A path on the local machine that contains the dataset.
"""
try:
downloaded_via_blobxfer = download_dataset_via_blobxfer(dataset_id=azure_dataset_id,
azure_config=azure_config,
target_folder=target_folder)
if downloaded_via_blobxfer:
return downloaded_via_blobxfer
except Exception as ex:
print_exception(ex, message="Unable to download dataset via blobxfer.")
logging.info("Trying to download dataset via AzureML datastore now.")
azure_dataset = get_or_create_dataset(azure_config, azure_dataset_id)
if not isinstance(azure_dataset, FileDataset):
@@ -136,7 +88,10 @@ def download_dataset(azure_dataset_id: str,
return expected_dataset_path
logging.info("Starting to download the dataset - WARNING, this could take very long!")
with logging_section("Downloading dataset"):
t0 = time.perf_counter()
azure_dataset.download(target_path=str(expected_dataset_path), overwrite=False)
t1 = time.perf_counter() - t0
logging.info(f"Azure dataset '{azure_dataset_id}' downloaded in {t1} seconds")
logging.info(f"Azure dataset '{azure_dataset_id}' is now available in {expected_dataset_path}")
return expected_dataset_path

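download_dataset above delegates to a get_or_create_dataset helper defined elsewhere in the repository. As a hedged sketch only, not the project's actual implementation, such a helper typically resolves the name against the workspace and falls back to registering a FileDataset rooted in the datastore:

from azureml.core import Dataset, Datastore, Workspace
from azureml.data.file_dataset import FileDataset

from InnerEye.Azure.azure_config import AzureConfig


def get_or_create_dataset_sketch(azure_config: AzureConfig, azure_dataset_id: str) -> FileDataset:
    workspace: Workspace = azure_config.get_workspace()
    try:
        # Re-use the dataset if it is already registered under this name.
        return Dataset.get_by_name(workspace, name=azure_dataset_id)
    except Exception:
        # Otherwise point a FileDataset at the matching folder in the datastore and register it.
        datastore = Datastore.get(workspace, azure_config.azureml_datastore)
        dataset = Dataset.File.from_files(path=(datastore, azure_dataset_id))
        return dataset.register(workspace, name=azure_dataset_id)
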
View file

@@ -1,177 +0,0 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from __future__ import annotations
import logging
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
import blobxfer
import blobxfer.models.azure as azmodels
from blobxfer.api import AzureStorageCredentials, ConcurrencyOptions, DownloadOptions, GeneralOptions, SkipOnOptions, \
UploadOptions
from blobxfer.models.options import FileProperties, Timeout, VectoredIo
from blobxfer.models.upload import VectoredIoDistributionMode
from InnerEye.Azure.azure_util import storage_account_from_full_name, to_azure_friendly_container_path
# Azure storage is extremely talkative, printing out each client request (thousands of them)
logger = logging.getLogger('azure.storage')
logger.setLevel(logging.WARNING)
# Blobxfer also prints at single line per file at least.
logger = logging.getLogger('blobxfer')
logger.setLevel(logging.WARNING)
@dataclass
class BlobXFerConfig:
"""
Class to hold blobxfer configuration and helper functions for downloading from and
uploading to Azure.
"""
account_name: str
account_key: str
concurrency: ConcurrencyOptions
timeout: Timeout
general: GeneralOptions
file_properties: FileProperties
skipon_options: SkipOnOptions
@staticmethod
def create_default(account: str, account_key: str) -> BlobXFerConfig:
"""
Returns the default configuration.
:param account: Name of the Azure storage account
:param account_key: Key to this storage account
:return: default BlobXFerConfig
"""
concurrency = ConcurrencyOptions(crypto_processes=2, md5_processes=2, disk_threads=16,
transfer_threads=16, action=1)
timeout = Timeout(connect=20, read=60, max_retries=3)
general = GeneralOptions(concurrency, progress_bar=False, verbose=False, timeout=timeout, quiet=True)
file_properties = FileProperties(attributes=False, cache_control=None, content_type=None,
lmt=False, md5=None)
skipon_options = SkipOnOptions(filesize_match=True, lmt_ge=True, md5_match=True)
return BlobXFerConfig(
account_name=storage_account_from_full_name(account),
account_key=account_key,
concurrency=concurrency,
timeout=timeout,
general=general,
file_properties=file_properties,
skipon_options=skipon_options
)
def get_download_options(self, num_folders_to_strip: int = 0) -> DownloadOptions:
"""
Returns a blobxfer DownloadOptions object.
:param num_folders_to_strip: The filenames will be stripped off their leading directories, up to this level e.g.
if original path is 'container/foo/1.txt' and number_folders_to_strip is 2, and destination folder is 'bar',
the downloaded file will be 'bar/1.txt'
"""
return DownloadOptions(check_file_md5=True,
chunk_size_bytes=4194304,
delete_extraneous_destination=False,
delete_only=False,
max_single_object_concurrency=8,
mode=azmodels.StorageModes.Auto,
overwrite=True,
recursive=True,
rename=False,
restore_file_properties=self.file_properties,
rsa_private_key=None,
strip_components=num_folders_to_strip)
def get_upload_options(self, num_folders_to_strip: int = 0) -> UploadOptions:
"""
Returns a UploadOptions object.
:param num_folders_to_strip: The filenames will be stripped off their leading directories, up to this level e.g.
if original path is 'container/foo/1.txt' and number_folders_to_strip is 2, and destination folder is 'bar',
the downloaded file will be 'bar/1.txt'
"""
return UploadOptions(
access_tier=None,
one_shot_bytes=33554432,
rsa_public_key=None,
stdin_as_page_blob_size=0,
store_file_properties=self.file_properties,
vectored_io=VectoredIo(
stripe_chunk_size_bytes=0,
distribution_mode=VectoredIoDistributionMode.Disabled
),
chunk_size_bytes=4194304,
delete_extraneous_destination=False,
delete_only=False,
mode=azmodels.StorageModes.Auto,
overwrite=True,
recursive=True,
rename=False,
strip_components=num_folders_to_strip
)
def get_credentials(self) -> AzureStorageCredentials:
credentials = AzureStorageCredentials(self.general)
credentials.add_storage_account(self.account_name, self.account_key, endpoint="core.windows.net")
return credentials
def download_blobs(account: str, account_key: str, blobs_root_path: str, destination: Path,
is_file: bool = False, config: Optional[BlobXFerConfig] = None) -> Path:
"""
Download a given set of files in Azure blob storage to the local destination path, via blobxfer.
:param account: The name of the storage account to access the files.
:param account_key: The key for the storage account.
:param blobs_root_path: The path of the files that should be downloaded. This must be in format
'container/file_prefix/', ending with a slash (will be added if not provided and is_file is False).
:param destination: The destination folder for the copied files on the local machine.
:param is_file: If True then only a single file is required to be downloaded
:param config: BlobXFerConfig to use for download configuration, use default presets if None.
The filenames will be stripped off their leading directories, up to the level given by blobs_root_path.
For example, if blobs_root_path is 'container/foo/'
and contains a file 'container/foo/1.txt', and destination is 'bar', the downloaded file will be 'bar/1.txt'
"""
if not config:
config = BlobXFerConfig.create_default(account=account, account_key=account_key)
start_time = time.time()
# the account name can be an Azure Resource ID so extract the name from it if this is the case
logging.info(f"Downloading '{blobs_root_path}' from storage account {config.account_name} to {destination}")
blobs_root_path = to_azure_friendly_container_path(Path(blobs_root_path))
if not (blobs_root_path.endswith("/") or is_file):
blobs_root_path += "/"
blobs_root_path_dirs = blobs_root_path.rstrip("/").split("/")
num_folders_to_strip = len(blobs_root_path_dirs) - 1
blobs_path_without_container = "/".join(blobs_root_path_dirs[1:])
logging.info(f"Cleaned download path: '{blobs_root_path}' from storage account {config.account_name}")
download = config.get_download_options(num_folders_to_strip)
local_path = blobxfer.api.LocalDestinationPath(str(destination))
# noinspection PyTypeChecker
download_spec = blobxfer.api.DownloadSpecification(download, config.skipon_options, local_path)
source = blobxfer.api.AzureSourcePath()
source.add_path_with_storage_account(blobs_root_path, config.account_name)
if not is_file:
source.add_includes([f"{blobs_path_without_container}/*"])
download_spec.add_azure_source_path(source)
# noinspection PyTypeChecker
downloader = blobxfer.api.Downloader(config.general, config.get_credentials(), download_spec)
downloader.start()
elapsed = time.time() - start_time
logging.info(f"Finished downloading in {elapsed:0.2f}sec.")
if is_file:
destination = destination / Path(blobs_root_path).name
if destination.exists():
return destination
raise ValueError(f"Unable to download {blobs_root_path} from "
f"storage account {config.account_name} to {destination}")
else:
return destination

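The num_folders_to_strip / strip_components behaviour described in the deleted docstrings above is the part most easily misread. A standalone illustration in plain Python (no blobxfer involved) of what stripping two leading components does to a blob path:

from pathlib import Path


def strip_components(blob_path: str, count: int) -> Path:
    # Drop the first 'count' path components, mirroring blobxfer's strip_components option.
    return Path(*Path(blob_path).parts[count:])


# 'container/foo/1.txt' with two components stripped lands at 'bar/1.txt' under destination 'bar'.
assert strip_components("container/foo/1.txt", 2) == Path("1.txt")
print(Path("bar") / strip_components("container/foo/1.txt", 2))  # bar/1.txt
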
View file

@@ -2,7 +2,6 @@ variables:
tenant_id: '72f988bf-86f1-41af-91ab-2d7cd011db47'
subscription_id: ''
application_id: ''
datasets_storage_account: 'innereyepublicdatasets'
datasets_container: 'datasets'
azureml_datastore: 'innereyedatasets'
resource_group: 'InnerEye-DeepLearning'

View file

@@ -86,7 +86,7 @@ def test_create_runner_parser(with_config: bool) -> None:
azure_parser = create_runner_parser(SegmentationModelBase if with_config else None)
args_list = ["--model=Lung", "--train=False", "--l_rate=100.0",
"--unknown=1", "--subscription_id", "Test1", "--tenant_id=Test2",
"--application_id", "Test3", "--datasets_storage_account=Test4",
"--application_id", "Test3",
"--log_level=INFO",
# Normally we don't use extra index URLs in InnerEye, hence this won't be set in YAML.
"--pip_extra_index_url=foo"]
@@ -96,7 +96,6 @@ def test_create_runner_parser(with_config: bool) -> None:
# These values have been set on the commandline, to values that are not the parser defaults.
non_default_args = {
"datasets_storage_account": "Test4",
"train": False,
"model": "Lung",
"subscription_id": "Test1",

View file

@@ -34,7 +34,7 @@ def test_create_ml_runner_args(is_default_namespace: bool,
args_list = [f"--model={model_name}", "--train=True", "--l_rate=100.0",
"--norm_method=Simple Norm", "--subscription_id", "Test1", "--tenant_id=Test2",
"--application_id", "Test3", "--datasets_storage_account=Test4", "--datasets_container", "Test5",
"--application_id", "Test3", "--datasets_container", "Test5",
"--pytest_mark", "gpu", f"--output_to={outputs_folder}"]
if not is_default_namespace:
args_list.append(f"--model_configs_namespace={model_configs_namespace}")
@@ -45,7 +45,6 @@ def test_create_ml_runner_args(is_default_namespace: bool,
runner.parse_and_load_model()
azure_config = runner.azure_config
model_config = runner.model_config
assert azure_config.datasets_storage_account == "Test4"
assert azure_config.model == model_name
assert model_config.l_rate == 100.0
assert model_config.norm_method == PhotometricNormalizationMethod.SimpleNorm
@@ -60,7 +59,7 @@ def test_create_ml_runner_args(is_default_namespace: bool,
assert model_config.outputs_folder == (project_root / DEFAULT_AML_UPLOAD_DIR)
assert model_config.logs_folder == (project_root / DEFAULT_LOGS_DIR_NAME)
assert not hasattr(model_config, "datasets_storage_account")
assert not hasattr(model_config, "datasets_container")
assert azure_config.pytest_mark == "gpu"
@@ -126,7 +125,6 @@ def test_parsing_with_custom_yaml(test_output_dirs: OutputFolderForTests) -> Non
yaml_file = test_output_dirs.root_dir / "custom.yml"
yaml_file.write_text("""variables:
tenant_id: 'foo'
datasets_storage_account: 'account'
start_epoch: 7
random_seed: 1
""")
@@ -143,7 +141,6 @@ def test_parsing_with_custom_yaml(test_output_dirs: OutputFolderForTests) -> Non
assert loader_result is not None
assert runner.azure_config is not None
# This is only present in yaml
assert runner.azure_config.datasets_storage_account == "account"
# This is present in yaml and command line, and the latter should be used.
assert runner.azure_config.tenant_id == "bar"
# Settings in model config: start_epoch is only in yaml

View file

@@ -12,11 +12,9 @@ from InnerEye.Azure.azure_util import fetch_child_runs, fetch_run, get_results_b
from InnerEye.Common import common_util, fixed_paths
from InnerEye.Common.common_util import OTHER_RUNS_SUBDIR_NAME, logging_section, logging_to_stdout
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML import run_ml
from InnerEye.ML.common import CHECKPOINT_FILE_SUFFIX, DATASET_CSV_FILE_NAME
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.run_ml import MLRunner
from InnerEye.ML.utils.blobxfer_util import download_blobs
from InnerEye.ML.utils.run_recovery import RunRecovery
from Tests.Common.test_util import DEFAULT_ENSEMBLE_RUN_RECOVERY_ID, DEFAULT_RUN_RECOVERY_ID
from Tests.ML.util import get_default_azure_config
@@ -128,44 +126,3 @@ def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> Non
for file in ["ct", "esophagus", "heart", "lung_l", "lung_r", "spinalcord"]:
f = (sub_folder / file).with_suffix(".nii.gz")
assert f.is_file()
def test_download_dataset_via_blobxfer(test_output_dirs: OutputFolderForTests) -> None:
azure_config = get_default_azure_config()
result_path = run_ml.download_dataset_via_blobxfer(dataset_id="test-dataset",
azure_config=azure_config,
target_folder=test_output_dirs.root_dir)
assert result_path
assert result_path.is_dir()
dataset_csv = Path(result_path) / DATASET_CSV_FILE_NAME
assert dataset_csv.exists()
@pytest.mark.parametrize("is_file", [True, False])
def test_download_blobxfer(test_output_dirs: OutputFolderForTests, is_file: bool, runner_config: AzureConfig) -> None:
"""
Test for a bug in early versions of download_blobs: download is happening via prefixes, but because of
stripping leading directory names, blobs got overwritten.
"""
root = test_output_dirs.root_dir
account_key = runner_config.get_dataset_storage_account_key()
assert account_key is not None
# Expected test data in Azure blobs:
# folder1/folder1.txt with content "folder1.txt"
# folder1_with_suffix/folder2.txt with content "folder2.txt"
# folder1_with_suffix/folder1.txt with content "this comes from folder2"
# with bug present, folder1_with_suffix/folder1.txt will overwrite folder1/folder1.txt
blobs_root_path = "data-for-testsuite/folder1"
if is_file:
blobs_root_path += "/folder1.txt"
download_blobs(runner_config.datasets_storage_account, account_key, blobs_root_path, root, is_file)
folder1 = root / "folder1.txt"
assert folder1.exists()
if not is_file:
otherfile = root / "otherfile.txt"
folder2 = root / "folder2.txt"
assert folder1.read_text().strip() == "folder1.txt"
assert otherfile.exists()
assert otherfile.read_text().strip() == "folder1.txt"
assert not folder2.exists()

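The deleted test above documents the bug it guarded against: a blob prefix without a trailing slash also matches sibling folders, and once the leading directories are stripped their files collide locally. A plain-Python illustration of that collision:

from pathlib import Path

blobs = [
    "data-for-testsuite/folder1/folder1.txt",
    "data-for-testsuite/folder1_with_suffix/folder1.txt",
    "data-for-testsuite/folder1_with_suffix/folder2.txt",
]
prefix = "data-for-testsuite/folder1"  # no trailing slash: also matches folder1_with_suffix
matched = [b for b in blobs if b.startswith(prefix)]
# Stripping the two leading components maps both folder1.txt blobs onto the same local file name.
local_names = [str(Path(*Path(b).parts[2:])) for b in matched]
print(local_names)  # ['folder1.txt', 'folder1.txt', 'folder2.txt'] -> the first file gets overwritten
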
View file

@@ -14,7 +14,6 @@ dependencies:
- azureml-mlflow==1.17.0
- azureml-sdk==1.17.0
- azureml-tensorboard==1.17.0
- blobxfer==1.9.4
- conda-merge==0.1.5
- dataclasses-json==0.5.2
- flake8==3.8.3
@@ -29,6 +28,7 @@ dependencies:
- mypy==0.770
- mypy-extensions==0.4.3
- numpy==1.19.1
- numba==0.51.2
- pandas==1.1.0
- papermill==2.2.2
- param==1.9.3