Remove blobxfer (#330)
* Remove blobxfer
* Update CHANGELOG.md
* Remove configs that are not required
* Remove from environment.yml
* Fix numba issue
* Improve CHANGELOG.md
* Fix tests
* Remove configs that are not required
Parent: 014c74e34f
Commit: a2c27e19d7
@@ -24,6 +24,7 @@ folder structure is present irrespective of using InnerEye as a submodule or not
environment will be contained in the model.

### Removed
- Removed blobxfer completely. AzureML datastores for reading datasets make the following configs obsolete: 'datasets_storage_account' and 'datasets_storage_account_key'; they are no longer supported.

### Deprecated

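For orientation, the replacement path goes through AzureML datastores and FileDatasets instead of direct storage-account access via blobxfer. A minimal sketch of that flow in Python, assuming a workspace and a manually registered datastore as described by the AzureConfig fields further down in this diff (the function name and folder layout here are illustrative, not code from this commit):

from pathlib import Path

from azureml.core import Dataset, Datastore, Workspace


def download_via_datastore(workspace: Workspace, datastore_name: str, dataset_id: str, target: Path) -> Path:
    # Look up the datastore that was registered manually (the azureml_datastore setting in AzureConfig).
    datastore = Datastore.get(workspace, datastore_name)
    # Turn the dataset folder on that datastore into a FileDataset and materialize it locally.
    file_dataset = Dataset.File.from_files(path=(datastore, dataset_id))
    destination = target / dataset_id
    file_dataset.download(target_path=str(destination), overwrite=False)
    return destination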
@@ -59,13 +59,6 @@ class AzureConfig(GenericConfig):
    subscription_id: str = param.String(doc="The ID of your Azure subscription.")
    tenant_id: str = param.String(doc="The Azure tenant ID.")
    application_id: str = param.String(doc="Optional: The ID of the Service Principal for authentication to Azure.")
    datasets_storage_account: str = \
        param.String(doc="Optional: The blob storage account to use when downloading datasets for use outside of "
                         "AzureML. This storage account must be the same as the one configured as a 'datastore' "
                         "in AzureML.")
    datasets_storage_account_key: str = \
        param.String(doc="Optional: The access key for the storage account that holds the datasets. "
                         "This is only used for downloading datasets outside of AzureML.")
    datasets_container: str = param.String(doc="Optional: The blob storage container with the datasets.")
    azureml_datastore: str = param.String(doc="The name of the AzureML datastore that holds the input training data. "
                                              "This must be created manually, and point to a folder inside the "
@@ -196,13 +189,6 @@ class AzureConfig(GenericConfig):
        config.project_root = project_root
        return config

    def get_dataset_storage_account_key(self) -> Optional[str]:
        """
        Gets the storage account key for the storage account that holds the dataset.
        """
        secrets_handler = SecretsHandling(project_root=self.project_root)
        return secrets_handler.get_secret_from_environment(fixed_paths.DATASETS_ACCOUNT_KEY, allow_missing=True)

    def get_workspace(self) -> Workspace:
        """
        Return a workspace object for an existing Azure Machine Learning Workspace (or default from YAML).
@@ -248,15 +248,6 @@ def get_run_id(run: Optional[Run] = None) -> str:
    return run_context.id


def storage_account_from_full_name(full_account_name: str) -> str:
    """
    Extracts the actual storage account name from the full name, like "/subscriptions/abc123../something/account_name"
    :param full_account_name: Full name of account
    :return: Storage account name
    """
    return full_account_name.split("/")[-1]


def get_cross_validation_split_index(run: Run) -> int:
    """
    Gets the cross validation index from the run's tags or returns the default
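As a quick illustration of the helper being removed here: it simply keeps the last path component of a storage account resource ID. The resource ID below is made up for the example:

from InnerEye.Azure.azure_util import storage_account_from_full_name

# Illustrative values only; the function returns whatever follows the final "/".
full_name = "/subscriptions/abc123/resourceGroups/rg/providers/Microsoft.Storage/storageAccounts/myaccount"
assert storage_account_from_full_name(full_name) == "myaccount"
# A plain account name passes through unchanged.
assert storage_account_from_full_name("myaccount") == "myaccount"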
@@ -425,6 +416,7 @@ def is_run_and_child_runs_completed(run: Run) -> bool:
    :param run: The AzureML run to check.
    :return: True if the run and all child runs completed successfully.
    """

    def is_completed(run: Run) -> bool:
        status = run.get_status()
        if run.status == RunStatus.COMPLETED:
@@ -26,19 +26,16 @@ This means: slices 79 to 107 inclusive in the z direction are missing, i.e. ther

import csv
import os
import sys
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Set, TextIO, Tuple

import numpy as np
import param
from azure.storage.blob import BlockBlobService

from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import logging_to_stdout
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.ML.utils.blobxfer_util import download_blobs
from InnerEye.ML.utils.io_util import read_image_as_array_with_header

MISSING_SLICE_MARKER = "Ms:"
@@ -78,7 +75,8 @@ def report_structure_extremes(dataset_dir: str, azure_config: AzureConfig) -> None:
    :param azure_config: An object with all necessary information for accessing Azure.
    :param dataset_dir: directory containing subject subdirectories with integer names.
    """
    download_dataset_directory(azure_config, dataset_dir)
    if not os.path.isdir(dataset_dir):
        raise ValueError(f"Invalid path: {dataset_dir}")
    subjects: Set[int] = set()
    series_map = None
    institution_map = None
@@ -122,37 +120,11 @@ def report_structure_extremes(dataset_dir: str, azure_config: AzureConfig) -> None:
        if index % 25 == 0:
            print(f"Processed {index} subjects")
    print(f"Processed all {len(subjects)} subjects")
    upload_to_dataset_directory(azure_config, dataset_dir, files_created)
    # If we found any structures with missing slices, raise an exception, which should be
    # uncaught where necessary to make any appropriate build step fail.
    if n_missing > 0:
        raise ValueError(f"Found {n_missing} structures with missing slices")


def download_dataset_directory(azure_config: AzureConfig, dataset_dir: str) -> bool:
    if os.path.isdir(dataset_dir):
        return False
    account_key = azure_config.get_dataset_storage_account_key()
    blobs_root_path = os.path.join(azure_config.datasets_container, os.path.basename(dataset_dir)) + "/"
    sys.stdout.write(f"Downloading data to {dataset_dir} ...")
    assert account_key is not None  # for mypy
    download_blobs(azure_config.datasets_storage_account, account_key, blobs_root_path, Path(dataset_dir))
    sys.stdout.write("done\n")
    return True


def upload_to_dataset_directory(azure_config: AzureConfig, dataset_dir: str, files: Set[str]) -> None:
    if not files:
        return
    account_key = azure_config.get_dataset_storage_account_key()
    block_blob_service = BlockBlobService(account_name=azure_config.datasets_storage_account, account_key=account_key)
    container_name = os.path.join(azure_config.datasets_container, os.path.basename(dataset_dir))
    for path in files:
        blob_name = path[len(dataset_dir) + 1:]
        block_blob_service.create_blob_from_path(container_name, blob_name, path)
        print(f"Uploaded {path} to {azure_config.datasets_storage_account}:{container_name}/{blob_name}")


def report_structure_extremes_for_subject(subj_dir: str, series_id: str) -> Iterator[str]:
    """
    :param subj_dir: subject directory, containing <structure>.nii.gz files
@@ -54,8 +54,6 @@ PRIVATE_SETTINGS_FILE = "InnerEyePrivateSettings.yml"
# Names of secrets stored as environment variables or in the PROJECT_SECRETS_FILE:
# Secret for the Service Principal
SERVICE_PRINCIPAL_KEY = "APPLICATION_KEY"
# The access key for the Azure storage account that holds the datasets.
DATASETS_ACCOUNT_KEY = "DATASETS_ACCOUNT_KEY"

INNEREYE_PACKAGE_ROOT = repository_root_directory(INNEREYE_PACKAGE_NAME)
SETTINGS_YAML_FILE_NAME = "settings.yml"
@@ -5,6 +5,7 @@
import copy
import logging
import shutil
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

@@ -25,7 +26,7 @@ from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, \
from InnerEye.Common import fixed_paths
from InnerEye.Common.build_config import ExperimentResultLocation, build_information_to_dot_net_json_file
from InnerEye.Common.common_util import ModelProcessing, is_windows, logging_section, print_exception
from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME, PROJECT_SECRETS_FILE
from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME
from InnerEye.ML.common import DATASET_CSV_FILE_NAME, ModelExecutionMode
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER, \
@@ -38,7 +39,6 @@ from InnerEye.ML.model_training import model_train
from InnerEye.ML.runner import ModelDeploymentHookSignature, Runner, get_all_environment_files
from InnerEye.ML.scalar_config import ScalarModelBase
from InnerEye.ML.utils import ml_util
from InnerEye.ML.utils.blobxfer_util import download_blobs
from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
from InnerEye.ML.utils.ml_util import make_pytorch_reproducible
from InnerEye.ML.visualizers import activation_maps
@@ -61,45 +61,6 @@ def try_to_mount_input_dataset(run_context: Any) -> Optional[Path]:
    return None


def download_dataset_via_blobxfer(dataset_id: str,
                                  azure_config: AzureConfig,
                                  target_folder: Path) -> Optional[Path]:
    """
    Attempts to download a dataset from the Azure storage account for datasets, with the download happening via
    blobxfer. This is only possible if the datasets storage account and key are present in the `azure_config`.
    The function returns None if the required settings were not present.
    :param dataset_id: The folder of the dataset, expected in the container given by azure_config.datasets_container.
    :param azure_config: The object with all Azure-related settings.
    :param target_folder: The local folder into which the dataset should be downloaded.
    :return: The folder that contains the downloaded dataset. Returns None if the datasets account name or password
        were not present.
    """
    datasets_account_key = azure_config.get_dataset_storage_account_key()
    if not datasets_account_key:
        logging.info("No account key for the dataset storage account was found.")
        logging.info(f"We checked in environment variables and in the file {PROJECT_SECRETS_FILE}")
        return None
    if (not azure_config.datasets_container) or (not azure_config.datasets_storage_account):
        logging.info("Datasets storage account or container missing.")
        return None
    target_folder.mkdir(exist_ok=True)
    result_folder = target_folder / dataset_id
    # only download if it hasn't already been downloaded
    if result_folder.is_dir():
        logging.info(f"Folder already exists, skipping download: {result_folder}")
        return result_folder
    with logging_section(f"Downloading dataset {dataset_id}"):
        download_blobs(
            account=azure_config.datasets_storage_account,
            account_key=datasets_account_key,
            # When specifying the blobs root path, ensure that there is a slash at the end, otherwise
            # all datasets with that dataset_id as a prefix get downloaded.
            blobs_root_path=f"{azure_config.datasets_container}/{dataset_id}/",
            destination=result_folder
        )
    return result_folder


def download_dataset(azure_dataset_id: str,
                     target_folder: Path,
                     azure_config: AzureConfig) -> Path:
@@ -109,20 +70,11 @@ def download_dataset(azure_dataset_id: str,
    AzureML dataset attached to the given AzureML workspace. The dataset is downloaded into the `target_folder`,
    in a subfolder that has the same name as the dataset. If there already appears to be such a folder, and the folder
    contains a dataset.csv file, no download is started.
    :param local_dataset: The path to an existing local dataset.
    :param azure_dataset_id: The name of a dataset that is registered in the AzureML workspace.
    :param target_folder: The folder in which to download the dataset from Azure.
    :param azure_config: All Azure-related configuration options.
    :return: A path on the local machine that contains the dataset.
    """
    try:
        downloaded_via_blobxfer = download_dataset_via_blobxfer(dataset_id=azure_dataset_id,
                                                                azure_config=azure_config,
                                                                target_folder=target_folder)
        if downloaded_via_blobxfer:
            return downloaded_via_blobxfer
    except Exception as ex:
        print_exception(ex, message="Unable to download dataset via blobxfer.")
    logging.info("Trying to download dataset via AzureML datastore now.")
    azure_dataset = get_or_create_dataset(azure_config, azure_dataset_id)
    if not isinstance(azure_dataset, FileDataset):
@@ -136,7 +88,10 @@ def download_dataset(azure_dataset_id: str,
        return expected_dataset_path
    logging.info("Starting to download the dataset - WARNING, this could take very long!")
    with logging_section("Downloading dataset"):
        t0 = time.perf_counter()
        azure_dataset.download(target_path=str(expected_dataset_path), overwrite=False)
        t1 = time.perf_counter() - t0
        logging.info(f"Azure dataset '{azure_dataset_id}' downloaded in {t1} seconds")
    logging.info(f"Azure dataset '{azure_dataset_id}' is now available in {expected_dataset_path}")
    return expected_dataset_path
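A hedged usage sketch of the code path that remains after this change: download_dataset now always goes through the AzureML dataset/datastore route shown above. The dataset name and the config helper are taken from the tests in this diff; the target folder is illustrative:

from pathlib import Path

from InnerEye.ML.run_ml import download_dataset
from Tests.ML.util import get_default_azure_config

# Downloads the registered AzureML dataset into a subfolder of target_folder named after the dataset.
azure_config = get_default_azure_config()
dataset_folder = download_dataset(azure_dataset_id="test-dataset",
                                  target_folder=Path("/tmp/datasets"),
                                  azure_config=azure_config)
print(f"Dataset available in {dataset_folder}")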
@@ -1,177 +0,0 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from __future__ import annotations

import logging
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import blobxfer
import blobxfer.models.azure as azmodels
from blobxfer.api import AzureStorageCredentials, ConcurrencyOptions, DownloadOptions, GeneralOptions, SkipOnOptions, \
    UploadOptions
from blobxfer.models.options import FileProperties, Timeout, VectoredIo
from blobxfer.models.upload import VectoredIoDistributionMode

from InnerEye.Azure.azure_util import storage_account_from_full_name, to_azure_friendly_container_path

# Azure storage is extremely talkative, printing out each client request (thousands of them)
logger = logging.getLogger('azure.storage')
logger.setLevel(logging.WARNING)
# Blobxfer also prints at least a single line per file.
logger = logging.getLogger('blobxfer')
logger.setLevel(logging.WARNING)

@dataclass
class BlobXFerConfig:
    """
    Class to hold blobxfer configuration options and helper functions for downloading from and uploading to Azure.
    """
    account_name: str
    account_key: str
    concurrency: ConcurrencyOptions
    timeout: Timeout
    general: GeneralOptions
    file_properties: FileProperties
    skipon_options: SkipOnOptions

    @staticmethod
    def create_default(account: str, account_key: str) -> BlobXFerConfig:
        """
        Returns the default configuration.

        :param account: Name of the Azure storage account
        :param account_key: Key to this storage account
        :return: default BlobXFerConfig
        """
        concurrency = ConcurrencyOptions(crypto_processes=2, md5_processes=2, disk_threads=16,
                                         transfer_threads=16, action=1)
        timeout = Timeout(connect=20, read=60, max_retries=3)
        general = GeneralOptions(concurrency, progress_bar=False, verbose=False, timeout=timeout, quiet=True)
        file_properties = FileProperties(attributes=False, cache_control=None, content_type=None,
                                         lmt=False, md5=None)
        skipon_options = SkipOnOptions(filesize_match=True, lmt_ge=True, md5_match=True)
        return BlobXFerConfig(
            account_name=storage_account_from_full_name(account),
            account_key=account_key,
            concurrency=concurrency,
            timeout=timeout,
            general=general,
            file_properties=file_properties,
            skipon_options=skipon_options
        )

    def get_download_options(self, num_folders_to_strip: int = 0) -> DownloadOptions:
        """
        Returns a blobxfer DownloadOptions object.

        :param num_folders_to_strip: The filenames will be stripped of their leading directories, up to this level,
            e.g. if the original path is 'container/foo/1.txt', num_folders_to_strip is 2, and the destination folder
            is 'bar', the downloaded file will be 'bar/1.txt'.
        """
        return DownloadOptions(check_file_md5=True,
                               chunk_size_bytes=4194304,
                               delete_extraneous_destination=False,
                               delete_only=False,
                               max_single_object_concurrency=8,
                               mode=azmodels.StorageModes.Auto,
                               overwrite=True,
                               recursive=True,
                               rename=False,
                               restore_file_properties=self.file_properties,
                               rsa_private_key=None,
                               strip_components=num_folders_to_strip)

    def get_upload_options(self, num_folders_to_strip: int = 0) -> UploadOptions:
        """
        Returns an UploadOptions object.

        :param num_folders_to_strip: The filenames will be stripped of their leading directories, up to this level,
            e.g. if the original path is 'container/foo/1.txt', num_folders_to_strip is 2, and the destination folder
            is 'bar', the uploaded file will be 'bar/1.txt'.
        """
        return UploadOptions(
            access_tier=None,
            one_shot_bytes=33554432,
            rsa_public_key=None,
            stdin_as_page_blob_size=0,
            store_file_properties=self.file_properties,
            vectored_io=VectoredIo(
                stripe_chunk_size_bytes=0,
                distribution_mode=VectoredIoDistributionMode.Disabled
            ),
            chunk_size_bytes=4194304,
            delete_extraneous_destination=False,
            delete_only=False,
            mode=azmodels.StorageModes.Auto,
            overwrite=True,
            recursive=True,
            rename=False,
            strip_components=num_folders_to_strip
        )

    def get_credentials(self) -> AzureStorageCredentials:
        credentials = AzureStorageCredentials(self.general)
        credentials.add_storage_account(self.account_name, self.account_key, endpoint="core.windows.net")
        return credentials

def download_blobs(account: str, account_key: str, blobs_root_path: str, destination: Path,
                   is_file: bool = False, config: Optional[BlobXFerConfig] = None) -> Path:
    """
    Download a given set of files in Azure blob storage to the local destination path, via blobxfer.
    :param account: The name of the storage account to access the files.
    :param account_key: The key for the storage account.
    :param blobs_root_path: The path of the files that should be downloaded. This must be in format
        'container/file_prefix/', ending with a slash (will be added if not provided and is_file is False).
    :param destination: The destination folder for the copied files on the local machine.
    :param is_file: If True then only a single file is required to be downloaded
    :param config: BlobXFerConfig to use for download configuration, use default presets if None.
    The filenames will be stripped off their leading directories, up to the level given by blobs_root_path.
    For example, if blobs_root_path is 'container/foo/'
    and contains a file 'container/foo/1.txt', and destination is 'bar', the downloaded file will be 'bar/1.txt'
    """
    if not config:
        config = BlobXFerConfig.create_default(account=account, account_key=account_key)

    start_time = time.time()
    # the account name can be an Azure Resource ID so extract the name from it if this is the case
    logging.info(f"Downloading '{blobs_root_path}' from storage account {config.account_name} to {destination}")
    blobs_root_path = to_azure_friendly_container_path(Path(blobs_root_path))
    if not (blobs_root_path.endswith("/") or is_file):
        blobs_root_path += "/"
    blobs_root_path_dirs = blobs_root_path.rstrip("/").split("/")
    num_folders_to_strip = len(blobs_root_path_dirs) - 1

    blobs_path_without_container = "/".join(blobs_root_path_dirs[1:])
    logging.info(f"Cleaned download path: '{blobs_root_path}' from storage account {config.account_name}")

    download = config.get_download_options(num_folders_to_strip)
    local_path = blobxfer.api.LocalDestinationPath(str(destination))
    # noinspection PyTypeChecker
    download_spec = blobxfer.api.DownloadSpecification(download, config.skipon_options, local_path)
    source = blobxfer.api.AzureSourcePath()
    source.add_path_with_storage_account(blobs_root_path, config.account_name)
    if not is_file:
        source.add_includes([f"{blobs_path_without_container}/*"])
    download_spec.add_azure_source_path(source)
    # noinspection PyTypeChecker
    downloader = blobxfer.api.Downloader(config.general, config.get_credentials(), download_spec)
    downloader.start()
    elapsed = time.time() - start_time
    logging.info(f"Finished downloading in {elapsed:0.2f}sec.")

    if is_file:
        destination = destination / Path(blobs_root_path).name
        if destination.exists():
            return destination
        raise ValueError(f"Unable to download {blobs_root_path} from "
                         f"storage account {config.account_name} to {destination}")
    else:
        return destination
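For reference, a sketch of how the module deleted above was typically driven; the account name and key are placeholders, and the call mirrors the usages removed elsewhere in this diff:

from pathlib import Path

from InnerEye.ML.utils.blobxfer_util import BlobXFerConfig, download_blobs

# Placeholder credentials; in the old code these came from AzureConfig and the secrets handling.
config = BlobXFerConfig.create_default(account="mystorageaccount", account_key="<storage-account-key>")
# blobs_root_path is 'container/prefix/', so one leading folder is stripped from the blob names
# and everything under 'my-dataset/' lands directly in the destination folder.
download_blobs(account="mystorageaccount",
               account_key="<storage-account-key>",
               blobs_root_path="datasets/my-dataset/",
               destination=Path("/tmp/my-dataset"),
               config=config)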
@@ -2,7 +2,6 @@ variables:
  tenant_id: '72f988bf-86f1-41af-91ab-2d7cd011db47'
  subscription_id: ''
  application_id: ''
  datasets_storage_account: 'innereyepublicdatasets'
  datasets_container: 'datasets'
  azureml_datastore: 'innereyedatasets'
  resource_group: 'InnerEye-DeepLearning'
@@ -86,7 +86,7 @@ def test_create_runner_parser(with_config: bool) -> None:
    azure_parser = create_runner_parser(SegmentationModelBase if with_config else None)
    args_list = ["--model=Lung", "--train=False", "--l_rate=100.0",
                 "--unknown=1", "--subscription_id", "Test1", "--tenant_id=Test2",
                 "--application_id", "Test3", "--datasets_storage_account=Test4",
                 "--application_id", "Test3",
                 "--log_level=INFO",
                 # Normally we don't use extra index URLs in InnerEye, hence this won't be set in YAML.
                 "--pip_extra_index_url=foo"]
@@ -96,7 +96,6 @@ def test_create_runner_parser(with_config: bool) -> None:

    # These values have been set on the commandline, to values that are not the parser defaults.
    non_default_args = {
        "datasets_storage_account": "Test4",
        "train": False,
        "model": "Lung",
        "subscription_id": "Test1",
@@ -34,7 +34,7 @@ def test_create_ml_runner_args(is_default_namespace: bool,

    args_list = [f"--model={model_name}", "--train=True", "--l_rate=100.0",
                 "--norm_method=Simple Norm", "--subscription_id", "Test1", "--tenant_id=Test2",
                 "--application_id", "Test3", "--datasets_storage_account=Test4", "--datasets_container", "Test5",
                 "--application_id", "Test3", "--datasets_container", "Test5",
                 "--pytest_mark", "gpu", f"--output_to={outputs_folder}"]
    if not is_default_namespace:
        args_list.append(f"--model_configs_namespace={model_configs_namespace}")
@@ -45,7 +45,6 @@ def test_create_ml_runner_args(is_default_namespace: bool,
    runner.parse_and_load_model()
    azure_config = runner.azure_config
    model_config = runner.model_config
    assert azure_config.datasets_storage_account == "Test4"
    assert azure_config.model == model_name
    assert model_config.l_rate == 100.0
    assert model_config.norm_method == PhotometricNormalizationMethod.SimpleNorm
@@ -60,7 +59,7 @@ def test_create_ml_runner_args(is_default_namespace: bool,
    assert model_config.outputs_folder == (project_root / DEFAULT_AML_UPLOAD_DIR)
    assert model_config.logs_folder == (project_root / DEFAULT_LOGS_DIR_NAME)

    assert not hasattr(model_config, "datasets_storage_account")
    assert not hasattr(model_config, "datasets_container")
    assert azure_config.pytest_mark == "gpu"
@@ -126,7 +125,6 @@ def test_parsing_with_custom_yaml(test_output_dirs: OutputFolderForTests) -> None:
    yaml_file = test_output_dirs.root_dir / "custom.yml"
    yaml_file.write_text("""variables:
    tenant_id: 'foo'
    datasets_storage_account: 'account'
    start_epoch: 7
    random_seed: 1
    """)
@@ -143,7 +141,6 @@ def test_parsing_with_custom_yaml(test_output_dirs: OutputFolderForTests) -> None:
    assert loader_result is not None
    assert runner.azure_config is not None
    # This is only present in yaml
    assert runner.azure_config.datasets_storage_account == "account"
    # This is present in yaml and command line, and the latter should be used.
    assert runner.azure_config.tenant_id == "bar"
    # Settings in model config: start_epoch is only in yaml
@@ -12,11 +12,9 @@ from InnerEye.Azure.azure_util import fetch_child_runs, fetch_run, get_results_b
from InnerEye.Common import common_util, fixed_paths
from InnerEye.Common.common_util import OTHER_RUNS_SUBDIR_NAME, logging_section, logging_to_stdout
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML import run_ml
from InnerEye.ML.common import CHECKPOINT_FILE_SUFFIX, DATASET_CSV_FILE_NAME
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.run_ml import MLRunner
from InnerEye.ML.utils.blobxfer_util import download_blobs
from InnerEye.ML.utils.run_recovery import RunRecovery
from Tests.Common.test_util import DEFAULT_ENSEMBLE_RUN_RECOVERY_ID, DEFAULT_RUN_RECOVERY_ID
from Tests.ML.util import get_default_azure_config
@@ -128,44 +126,3 @@ def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> None:
    for file in ["ct", "esophagus", "heart", "lung_l", "lung_r", "spinalcord"]:
        f = (sub_folder / file).with_suffix(".nii.gz")
        assert f.is_file()


def test_download_dataset_via_blobxfer(test_output_dirs: OutputFolderForTests) -> None:
    azure_config = get_default_azure_config()
    result_path = run_ml.download_dataset_via_blobxfer(dataset_id="test-dataset",
                                                       azure_config=azure_config,
                                                       target_folder=test_output_dirs.root_dir)
    assert result_path
    assert result_path.is_dir()
    dataset_csv = Path(result_path) / DATASET_CSV_FILE_NAME
    assert dataset_csv.exists()


@pytest.mark.parametrize("is_file", [True, False])
def test_download_blobxfer(test_output_dirs: OutputFolderForTests, is_file: bool, runner_config: AzureConfig) -> None:
    """
    Test for a bug in early versions of download_blobs: download is happening via prefixes, but because of
    stripping leading directory names, blobs got overwritten.
    """
    root = test_output_dirs.root_dir
    account_key = runner_config.get_dataset_storage_account_key()
    assert account_key is not None
    # Expected test data in Azure blobs:
    # folder1/folder1.txt with content "folder1.txt"
    # folder1_with_suffix/folder2.txt with content "folder2.txt"
    # folder1_with_suffix/folder1.txt with content "this comes from folder2"
    # with bug present, folder1_with_suffix/folder1.txt will overwrite folder1/folder1.txt
    blobs_root_path = "data-for-testsuite/folder1"
    if is_file:
        blobs_root_path += "/folder1.txt"
    download_blobs(runner_config.datasets_storage_account, account_key, blobs_root_path, root, is_file)

    folder1 = root / "folder1.txt"
    assert folder1.exists()
    if not is_file:
        otherfile = root / "otherfile.txt"
        folder2 = root / "folder2.txt"
        assert folder1.read_text().strip() == "folder1.txt"
        assert otherfile.exists()
        assert otherfile.read_text().strip() == "folder1.txt"
        assert not folder2.exists()
@@ -14,7 +14,6 @@ dependencies:
- azureml-mlflow==1.17.0
- azureml-sdk==1.17.0
- azureml-tensorboard==1.17.0
- blobxfer==1.9.4
- conda-merge==0.1.5
- dataclasses-json==0.5.2
- flake8==3.8.3
@@ -29,6 +28,7 @@ dependencies:
- mypy==0.770
- mypy-extensions==0.4.3
- numpy==1.19.1
- numba==0.51.2
- pandas==1.1.0
- papermill==2.2.2
- param==1.9.3