Mirror of https://github.com/microsoft/hi-ml.git

Adding cross-validation to the hi-ml runner (#198)

* Adding cross-validation
* Simplified model config loading for models in the histopathology folder
* Editor setup for VSCode
* Simplified pytest usage, also from within VSCode

Parent: 92d2f22650
Commit: 314433cb95
@@ -0,0 +1,16 @@
/.git
/.github
/azure-pipelines
/docs
/.idea
.pytest_cache
.mypy_cache
logs
outputs
config.json
*.egg-info
# Temporary files generated from conda merging
temp_environment-*
.config
.vscode
node_modules
@@ -2,7 +2,7 @@
omit =
    **/pytest
    **/__init__.py
    */hello_container_2.py
    **/temp_config_for_unittests.py

[html]
skip_empty = true
@@ -75,6 +75,8 @@ jobs:
    strategy:
      matrix:
        folder: [ hi-ml, hi-ml-azure, hi-ml-histopathology ]
      # This will let all parts of the matrix run, to collect as many errors as possible, rather than aborting after first fail
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:

@@ -85,7 +87,7 @@ jobs:
        with:
          python-version: ${{ env.pythonVersion }}

      - name: Install dependencies
      - name: Install hi-ml-azure in editable mode
        if: ${{ matrix.folder != 'hi-ml-azure' }}
        run: |
          cd hi-ml-azure

@@ -93,6 +95,14 @@ jobs:
          # Install local package in editable mode
          make pip_local

      - name: Install hi-ml in editable mode
        if: ${{ matrix.folder != 'hi-ml-azure' && matrix.folder != 'hi-ml' }}
        run: |
          cd hi-ml

          # Install local package in editable mode
          make pip_local

      - name: Test with pytest, fast only
        run: |
          cd ${{ matrix.folder }}

@@ -108,7 +118,7 @@ jobs:
    needs: [ pytest_fast ]
    strategy:
      matrix:
        folder: [ hi-ml, hi-ml-azure, hi-ml-histopathology ]
        folder: [ hi-ml, hi-ml-azure ]
    steps:
      - uses: actions/checkout@v2
        with:

@@ -134,7 +144,7 @@ jobs:
    needs: [ build-python ]
    strategy:
      matrix:
        folder: [ hi-ml, hi-ml-azure, hi-ml-histopathology ]
        folder: [ hi-ml, hi-ml-azure ]
        packageName: [ '*.whl', '*.tar.gz' ]
    steps:
      - uses: actions/checkout@v2
@@ -199,6 +209,61 @@ jobs:
        with:
          folder: ${{ matrix.folder }}

  test-project-folders:
    runs-on: ubuntu-18.04
    needs: [ build-python ]
    strategy:
      matrix:
        folder: [ hi-ml-histopathology ]
        packageName: [ '*.whl' ]
    steps:
      - uses: actions/checkout@v2
        with:
          lfs: true

      - name: Set up Python ${{ env.pythonVersion }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ env.pythonVersion }}

      - name: PIP upgrade
        run: |
          cd hi-ml-azure
          make pip_upgrade

      - name: Download hi-ml-azure
        id: download_himlazure
        uses: ./.github/actions/download_package_artifacts
        with:
          folder: hi-ml-azure
          package_type: ${{ matrix.packageName }}

      - name: Download hi-ml
        id: download_himl
        uses: ./.github/actions/download_package_artifacts
        with:
          folder: hi-ml
          package_type: ${{ matrix.packageName }}

      - name: Install dependencies
        run: |
          cd hi-ml-azure
          pip install ${{ steps.download_himlazure.outputs.package_filename }}
          cd ../hi-ml
          pip install ${{ steps.download_himl.outputs.package_filename }}

      - name: Run tests
        run: |
          cd ${{ matrix.folder }}
          make pip_local
          make pytest_and_coverage

      - name: Upload coverage artifacts
        uses: ./.github/actions/upload_coverage_artifacts
        if: ${{ matrix.packageName == '*.whl' }}
        with:
          folder: ${{ matrix.folder }}

  build-coverage:
    runs-on: ubuntu-18.04
@@ -150,4 +150,11 @@ node_modules/
!.github/actions/format_coverage/dist/
package-lock.json

logs/
logs/

# Temporary files generated from conda merging
temp_environment-*
# Temporary files from unittesting
temp_config_for_unittests.py
# Temp file from building requirements for histo
temp_requirements.txt
@@ -2,7 +2,6 @@
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/hi-ml-azure/src" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/hi-ml/src" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/hi-ml-azure/testazure" isTestSource="false" />
@@ -0,0 +1,8 @@
{
    "recommendations": [
        "njpwerner.autodocstring",
        "doi.fileheadercomment",
        "ms-python.python",
        "ms-python.vscode-pylance"
    ]
}
@@ -11,6 +11,54 @@
            "program": "${file}",
            "console": "integratedTerminal"
        },
        {
            "name": "Python: Run the HelloWorld model locally",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/hi-ml/src/health_ml/runner.py",
            "args": [
                "--model=HelloWorld"
            ],
            "console": "integratedTerminal"
        },
        {
            "name": "Python: Run the HelloWorld model in AzureML",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/hi-ml/src/health_ml/runner.py",
            "args": [
                "--model=HelloWorld",
                "--azureml",
                "--cluster=lite-testing-ds2",
            ],
            "console": "integratedTerminal"
        },
        {
            "name": "Python: Run HelloWorld with cross-validation",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/hi-ml/src/health_ml/runner.py",
            "args": [
                "--model=HelloWorld",
                "--azureml",
                "--cluster=lite-testing-ds2",
                "--crossval_count=2"
            ],
            "console": "integratedTerminal"
        },
        {
            "name": "Python: Run DeepSMILECrck in AzureML",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/hi-ml/src/health_ml/runner.py",
            "cwd": "${workspaceFolder}/hi-ml-histopathology",
            "args": [
                "--model=histopathology.configs.classification.DeepSMILECrck",
                "--azureml",
                "--cluster=lite-testing-ds2",
            ],
            "console": "integratedTerminal"
        },
        {
            "name": "Elevate: HI-ML-AZURE Local",
            "type": "python",
@@ -1,4 +1,5 @@
{
    "autoDocstring.docstringFormat": "sphinx",
    "fileHeaderComment.parameter": {
        "*": {
            "line": "-------------------------------------------------------------------------------------------",

@@ -49,4 +50,20 @@
    "[python]": {
        "editor.rulers": [120],
    },
    "python.analysis.extraPaths": [
        "./hi-ml-azure/src",
        "./hi-ml-azure/testazure",
        "./hi-ml/src",
        "./hi-ml/testhiml",
        "./hi-ml-histopathology/src",
        "./hi-ml-histopathology/testhisto",
        "./hi-ml-histopathology/testSSL",
    ],
    "terminal.integrated.defaultProfile.windows": "Command Prompt",
    "terminal.integrated.env.windows": {
        "PYTHONPATH": "${workspaceFolder}/hi-ml/src:${workspaceFolder}/hi-ml-azure/src:${workspaceFolder}/hi-ml-histopathology/src"
    },
    "terminal.integrated.env.linux": {
        "PYTHONPATH": "${workspaceFolder}/hi-ml/src:${workspaceFolder}/hi-ml-azure/src:${workspaceFolder}/hi-ml-histopathology/src"
    }
}
@@ -20,12 +20,17 @@ the environment file since it is necessary for the augmentations.
- ([#181](https://github.com/microsoft/hi-ml/pull/181)) Add computational pathology tools in hi-ml-histopathology folder.
- ([#187](https://github.com/microsoft/hi-ml/pull/187)) Add mean pooling layer for MIL.
- ([#186](https://github.com/microsoft/hi-ml/pull/186)) Add inference to hi-ml runner.
- ([#198](https://github.com/microsoft/hi-ml/pull/198)) Add cross-validation to hi-ml runner.
- ([#198](https://github.com/microsoft/hi-ml/pull/198)) Improved editor setup for VSCode.

### Changed
- ([#198](https://github.com/microsoft/hi-ml/pull/198)) The model config loader is now more flexible: it can accept either a fully qualified class name or just the top-level module name and class (like `histopathology.DeepSMILECrck`).
- ([#198](https://github.com/microsoft/hi-ml/pull/198)) The runner raises an error when the Conda environment file contains a pip include (`-r`) statement.

- ([#196](https://github.com/microsoft/hi-ml/pull/196)) Show the current workspace name in the error message.

### Fixed
- ([#198](https://github.com/microsoft/hi-ml/pull/198)) Dependencies for the histopathology folder are no longer specified in `test_requirements.txt`, but correctly in the histopathology Conda environment.
- ([#188](https://github.com/microsoft/hi-ml/pull/188)) Updated DeepSMILE models. They are now up to date with innereye-dl.
- ([#179](https://github.com/microsoft/hi-ml/pull/179)) HEDJitter was jittering the D channel as well. StainNormalization was relying on skimage.
- ([#195](https://github.com/microsoft/hi-ml/pull/195)) Fix a DeepMIL metrics bug whereby hard labels were used instead of probabilities.
README.md
@@ -6,13 +6,11 @@ This toolbox aims at providing low-level and high-level building blocks for Mach
practitioners. It helps to simplify and streamline work on deep learning models for healthcare and life sciences,
by providing tested components (data loaders, pre-processing), deep learning models, and cloud integration tools.

This toolbox is still in very early stages, and presently offers only the cloud integration components. ML components
will be added in the next few weeks.
This repository consists of two Python packages, as well as project-specific codebases:

This toolbox consists of two Python projects:

* [hi-ml-azure](https://pypi.org/project/hi-ml-azure/) - providing helper functions for running in AzureML.
* [hi-ml](https://pypi.org/project/hi-ml/) - providing ML components.
* PyPi package [hi-ml-azure](https://pypi.org/project/hi-ml-azure/) - providing helper functions for running in AzureML.
* PyPi package [hi-ml](https://pypi.org/project/hi-ml/) - providing ML components.
* hi-ml-histopathology: Models and workflows for working with histopathology images

## Getting started

@@ -24,6 +22,10 @@ For just the AzureML helper functions:

* Install from `pypi` via `pip`, by running `pip install hi-ml-azure`

For the histopathology workflows, please follow the instructions [here](hi-ml-histopathology/README.md).

If you would like to contribute to the code, please check the [developer guide](docs/source/developers.md).

## Documentation

The detailed package documentation, with examples and API reference, is on
@@ -12,6 +12,10 @@ If in doubt, reach out to the core `hi-ml` team before starting your work.

Please look through the existing folder structure to find a good home for your contribution.

## Setting up your dev environment

Please see the detailed instructions [here](developers.md).

## Submitting a Pull Request

If you'd like to submit a PR to the codebase, please ensure you:
@@ -1,5 +1,13 @@
# Notes for developers

## Development environment

We suggest using Visual Studio Code (VSCode), available for multiple platforms [here](https://code.visualstudio.com/).
On Windows systems, we recommend using WSL, the Windows Subsystem for Linux, because some PyTorch features are not available on Windows.
Inside VSCode, please install the extensions that are recommended for this project - they are available in `.vscode/extensions.json` in the
repository root.


## Creating a Conda environment

To create a separate Conda environment with all packages that `hi-ml` requires for running and testing,
@@ -15,6 +23,9 @@ outside the Conda environment. For WSL, these are the required steps (see also
[here](https://docs.microsoft.com/en-us/windows/dev-environment/javascript/nodejs-on-wsl)):
```shell
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.38.0/install.sh | bash
```
Close your terminal and re-open it, then run:
```shell
nvm install node
npm install -g pyright
```
@@ -103,3 +114,13 @@ To create a new package release, follow these steps:
* Click "Auto-generate release notes" to pull in the titles of the Pull Requests since the last release.
* Before the auto-generated "What's changed" section, add a few sentences that summarize what's new.
* Click "Publish release"


## Troubleshooting

### Debugging a test in VSCode fails on Windows

* Symptom: Debugging just does not seem to do anything.
* Check: The Debug Console shows the error `from _sqlite3 import *: ImportError: DLL load failed: The specified module could not be found.`
* Fix: [see here](https://stackoverflow.com/questions/54876404/unable-to-import-sqlite3-using-anaconda-python):
  Run `conda info --envs` to see where your Conda environment lives, then place `sqlite3.dll` into the `DLLs` folder inside of the environment.
@@ -11,17 +11,36 @@ use of these features:
This can be used by invoking the hi-ml runner and providing the name of the container class, like this:
`himl-runner --model=MyContainer`.

There is a fully working example [HelloContainer](../../hi-ml/src/health-ml/configs/hello_container.py), that
There is a fully working example [HelloContainer](../../hi-ml/src/health-ml/configs/hello_world.py), that
implements a simple 1-dimensional regression model from data stored in a CSV file. You can run that
from the command line by `himl-runner --model=HelloContainer`.
from the command line by `himl-runner --model=HelloWorld`.

# Running ML experiments in Azure ML
## Specifying the model to run

The `--model` argument specifies the name of a class that should be used for model training. The class needs to
be a subclass of `LightningContainer`, see below. There are different ways of telling the runner where to find
that class:
* If just providing a single class name, like `--model=HelloWorld`, the class is expected somewhere in the
  `health_ml.configs` namespace. It can be in any module/folder inside of that namespace.
* If the class is outside of the `health_ml.configs` namespace (as would be normal if using the `himl-runner` from a package),
  you need to provide some "hints" where to start searching. It is enough to provide the start of the namespace string:
  for example, `--model histopathology.PandaImageNetMIL` is effectively telling the runner to search for the
  `PandaImageNetMIL` class _anywhere_ in the `histopathology` namespace. You can think of this as
  `histopathology.*.PandaImageNetMIL`.
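To make these rules concrete, here is a hedged sketch (not part of this commit) of a minimal model config. The import path of `LightningContainer` and the class body are assumptions for illustration only:

```python
# Hedged sketch: a minimal model config class. The LightningContainer import
# path is an assumption; the method body is illustrative.
from health_ml.lightning_container import LightningContainer


class MyNewModel(LightningContainer):  # hypothetical example class
    def create_model(self):
        # Return the PyTorch Lightning model to train, as described in this document.
        ...
```

If a file containing this class lives anywhere inside the `health_ml.configs` namespace, the model can be run via `himl-runner --model=MyNewModel`; otherwise a namespace hint is needed, as described above.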
## Running ML experiments in Azure ML

To train in AzureML, add a `--azureml` flag. Use the flag `--cluster` to specify the name of the cluster
in your Workspace that you want to submit the job to. So the whole command would look like:
`himl-runner --model=HelloContainer --cluster=my_cluster_name --azureml`. You can also specify `--num_nodes` if
you wish to distribute the model training.

When starting the runner, you need to do that from a directory that contains all the code that your experiment needs:
the current working directory will be used as the root of all data that will be copied to AzureML to run your experiment.
(The only exception to this rule is if you start the runner from within an enlistment of the HI-ML GitHub repository.)

AzureML needs to know which Python/Conda environment it should use. For that, the runner expects a file `environment.yml`
in the current working directory, that contains a Conda environment definition.

## Setup - creating your model config file
@@ -140,7 +159,7 @@ By default, config files will be looked for in the folder "health_ml.configs". T
that live elsewhere, use a fully qualified name for the parameter `--model` - e.g. "MyModule.Configs.my_config.py"


### Outputting files during training
## Outputting files during training

The Lightning model returned by `create_model` needs to write its output files to the current working directory.
When running inside of AzureML, the output folders will be directly under the project root. If not running inside
@@ -150,7 +169,7 @@ When running in AzureML, the folder structure will be set up such that all files
to the current working directory are later uploaded to Azure blob storage at the end of the AzureML job. The files
will also be later available via the AzureML UI.

### Trainer arguments
## Trainer arguments
All arguments that control the PyTorch Lightning `Trainer` object are defined in the class `TrainerParams`. A
`LightningContainer` object inherits from this class. The most essential one is the `max_epochs` field, which controls
the `max_epochs` argument of the `Trainer`.
@@ -1,3 +1,5 @@
# This environment definition contains all packages to run hi-ml and hi-ml-azure development work, building and
# testing
name: himl
channels:
  - defaults

@@ -5,7 +7,10 @@ channels:
dependencies:
  - pip=20.1.1
  - python=3.7.3
  - pytorch=1.8.0
  - pytorch=1.10.0
  - cudatoolkit=11.3.1
  - pip:
    - -r hi-ml-azure/run_requirements.txt
    - -r hi-ml/run_requirements.txt
    - -r build_requirements.txt
    - -r test_requirements.txt
@@ -1,8 +1,10 @@
[pytest]
testpaths = testazure
norecursedirs = outputs
log_cli = True
log_cli_level = DEBUG
adopts = --strict-markers
addopts = --strict-markers
markers =
    fast: Tests that should run very fast, and can act as smoke tests to see if something goes terribly wrong.
    slow: Tests that are slow to run and not crucial to the build.
    timeout: Tests will terminate and fail if not completed within this length of time.
    timeout: Tests will terminate and fail if not completed within this length of time.
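For illustration, a hedged sketch of how the markers declared above are applied in test code (the test function is hypothetical); such tests can then be selected with the standard `pytest -m fast` filter:

```python
# Hedged sketch: applying the 'fast' marker declared in pytest.ini.
import pytest


@pytest.mark.fast
def test_smoke() -> None:
    """A quick smoke test, selectable via `pytest -m fast`."""
    assert 1 + 1 == 2
```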
@@ -1,7 +1,8 @@
azureml-sdk==1.32.0
azureml-tensorboard==1.32.0
conda-merge==0.1.5
pandas==1.3.4
param==1.9.3
ruamel.yaml==0.16.12
tensorboard==2.6.0
azureml-sdk>=1.36.0
azureml-tensorboard>=1.36.0
conda-merge>=0.1.5
pandas>=1.3.4
param>=1.12
pysocks>=1.5.8
ruamel.yaml>=0.16.12
tensorboard>=2.6.0
@@ -27,7 +27,7 @@ from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.train.hyperdrive import HyperDriveConfig, GridParameterSampling, PrimaryMetricGoal, choice
from azureml.dataprep.fuse.daemon import MountContext

from health_azure.utils import (create_python_environment, create_run_recovery_id, _find_file,
from health_azure.utils import (create_python_environment, create_run_recovery_id, find_file_in_parent_to_pythonpath,
                                is_run_and_child_runs_completed, is_running_in_azure_ml, register_environment,
                                run_duration_string_to_seconds, to_azure_friendly_string, RUN_CONTEXT, get_workspace,
                                PathOrString, DEFAULT_ENVIRONMENT_VARIABLES)

@@ -178,7 +178,7 @@ def create_run_configuration(workspace: Workspace,


def create_crossval_hyperdrive_config(num_splits: int,
                                      cross_val_index_arg_name: str = "cross_validation_split_index",
                                      cross_val_index_arg_name: str = "crossval_index",
                                      metric_name: str = "val/loss") -> HyperDriveConfig:
    """
    Creates an Azure ML HyperDriveConfig object for running cross validation. Note: this config expects a metric
@@ -186,20 +186,21 @@ def create_crossval_hyperdrive_config(num_splits: int,
    https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters#log-metrics-for-hyperparameter-tuning))

    :param num_splits: The number of splits for k-fold cross validation
    :param cross_val_index_arg_name: The name of the argument received by each of the child runs that indicates which
        split that child represents.
    :param cross_val_index_arg_name: The name of the commandline argument that each of the child runs gets, to
        indicate which split they should work on.
    :param metric_name: The name of the metric that the HyperDriveConfig will compare runs by. Please note that it is
        your responsibility to make sure a metric with this name is logged to the Run in your training script
    :return: an Azure ML HyperDriveConfig object
    """
    logging.info(f"Creating a HyperDriveConfig. Please be aware that this expects to find the metric {metric_name}"
                 f" logged to the Run during your training script.")
    logging.info(f"Creating a HyperDriveConfig. Please note that this expects to find the specified "
                 f"metric '{metric_name}' logged to AzureML from your training script (for example, using the "
                 f"AzureMLLogger with Pytorch Lightning)")
    parameter_dict = {
        cross_val_index_arg_name: choice(list(range(num_splits))),
    }
    return HyperDriveConfig(
        run_config=ScriptRunConfig(""),
        hyperparameter_sampling=GridParameterSampling(
            {
                cross_val_index_arg_name: choice(list(range(num_splits)))
            }),
        hyperparameter_sampling=GridParameterSampling(parameter_dict),
        primary_metric_name=metric_name,
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=num_splits
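As a hedged usage sketch (not part of this commit): the config created above can be handed to `submit_to_azure_if_needed`, which then submits one child run per cross-validation fold. The cluster name below is a hypothetical placeholder, and other submission arguments are omitted.

```python
# Hedged sketch: create a 5-fold cross-validation HyperDrive config and submit it.
from pathlib import Path

from health_azure.himl import create_crossval_hyperdrive_config, submit_to_azure_if_needed

crossval_config = create_crossval_hyperdrive_config(
    num_splits=5,                               # one child run per fold
    cross_val_index_arg_name="crossval_index",  # each child run receives --crossval_index=<k>
    metric_name="val/loss",                     # must be logged to AzureML by the training script
)
run_info = submit_to_azure_if_needed(
    compute_cluster_name="my-cluster",          # hypothetical cluster name
    conda_environment_file=Path("environment.yml"),
    hyperdrive_config=crossval_config,
    submit_to_azureml=True,
)
```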
@@ -337,7 +338,8 @@ def submit_to_azure_if_needed(  # type: ignore
        submit_to_azureml: Optional[bool] = None,
        tags: Optional[Dict[str, str]] = None,
        after_submission: Optional[Callable[[Run], None]] = None,
        hyperdrive_config: Optional[HyperDriveConfig] = None
        hyperdrive_config: Optional[HyperDriveConfig] = None,
        create_output_folders: bool = True,
        ) -> AzureRunInfo:  # pragma: no cover
    """
    Submit a folder to Azure, if needed and run it.

@@ -390,6 +392,7 @@ def submit_to_azure_if_needed(  # type: ignore
        for local execution (i.e., return immediately) will be executed. If not provided (None), submission to AzureML
        will be triggered if the commandline flag '--azureml' is present in sys.argv
    :param hyperdrive_config: A configuration object for Hyperdrive (hyperparameter search).
    :param create_output_folders: If True (default), create folders "outputs" and "logs" in the current working folder.
    :return: If the script is submitted to AzureML then we terminate python as the script should be executed in AzureML,
        otherwise we return a AzureRunInfo object.
    """

@@ -448,7 +451,7 @@ def submit_to_azure_if_needed(  # type: ignore
    workspace = get_workspace(aml_workspace, workspace_config_path)

    if conda_environment_file is None:
        conda_environment_file = _find_file(CONDA_ENVIRONMENT_FILE)
        conda_environment_file = find_file_in_parent_to_pythonpath(CONDA_ENVIRONMENT_FILE)
    conda_environment_file = _str_to_path(conda_environment_file)

    logging.info(f"Loaded AzureML workspace {workspace.name}")
@@ -0,0 +1,48 @@
#  ------------------------------------------------------------------------------------------
#  Copyright (c) Microsoft Corporation. All rights reserved.
#  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
#  ------------------------------------------------------------------------------------------

import logging
from pathlib import Path

ENVIRONMENT_YAML_FILE_NAME = "environment.yml"

REPO_HIML_FOLDER = "hi-ml"
REPO_HIML_AZURE_FOLDER = "hi-ml-azure"


def is_himl_used_from_git_repo() -> bool:
    """Returns False if HI-ML was installed as a package into site-packages. Returns True if the HI-ML codebase is
    used from a clone of the full git repository.

    :return: False if HI-ML is installed as a package, True if used via source from git.
    :rtype: bool
    """
    health_ml_root = Path(__file__).parent.parent
    logging.debug(f"health_ml root: {health_ml_root}")
    if health_ml_root.parent.stem == "site-packages":
        return False
    himl_root = health_ml_root.parent.parent
    # These two folders are present in the top-level folder of the git repo
    expected_folders = [REPO_HIML_FOLDER, REPO_HIML_AZURE_FOLDER]
    all_folders_exist = all((himl_root / folder).is_dir() for folder in expected_folders)
    if all_folders_exist:
        return True
    raise ValueError(
        "Unable to determine the installation status: Code is not used from site-packages, but the "
        "expected top-level folders are not present?"
    )


def git_repo_root_folder() -> Path:
    """
    Attempts to return the path to the top-level hi-ml repo that contains the hi-ml and hi-ml-azure packages.
    This top-level repo will only be present if hi-ml has been installed as a git submodule, or the repo has
    been directly downloaded. Otherwise (e.g. if hi-ml has been installed as a pip package) this raises a ValueError.

    :return: Path to the himl root dir if it exists.
    """
    if not is_himl_used_from_git_repo():
        raise ValueError("This function can only be used if the HI-ML package is used directly from the git repo.")
    return Path(__file__).parent.parent.parent.parent
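A hedged sketch of how these helpers can be combined, for example to locate the repository-level Conda environment file when the code is used from a git clone:

```python
# Hedged usage sketch for the helpers defined above.
from health_azure.paths import (ENVIRONMENT_YAML_FILE_NAME, git_repo_root_folder,
                                is_himl_used_from_git_repo)

if is_himl_used_from_git_repo():
    env_file = git_repo_root_folder() / ENVIRONMENT_YAML_FILE_NAME
    print(f"Repository-level environment definition: {env_file}")
else:
    print("hi-ml is installed as a package; no repository root available")
```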
@@ -65,6 +65,13 @@ RUN_CONTEXT = Run.get_context()
PARENT_RUN_CONTEXT = getattr(RUN_CONTEXT, "parent", None)
WORKSPACE_CONFIG_JSON = "config.json"

# Names for sections in a Conda environment definition
CONDA_NAME = "name"
CONDA_CHANNELS = "channels"
CONDA_DEPENDENCIES = "dependencies"
CONDA_PIP = "pip"


# By default, define several environment variables that work around known issues in the software stack
DEFAULT_ENVIRONMENT_VARIABLES = {
    "AZUREML_OUTPUT_UPLOAD_TIMEOUT_SEC": "3600",
@@ -112,14 +119,15 @@ class GenericConfig(param.Parameterized):
        """
        # check if illegal arguments are passed in
        legal_params = self.get_overridable_parameters()
        illegal = [k for k, v in params.items() if (k in self.params().keys()) and (k not in legal_params)]
        current_param_names = self.param.values().keys()
        illegal = [k for k, v in params.items() if (k in current_param_names) and (k not in legal_params)]

        if illegal:
            raise ValueError(f"The following parameters cannot be overridden as they are either "
                             f"readonly, constant, or private members : {illegal}")
        if throw_if_unknown_param:
            # check if parameters not defined by the config class are passed in
            unknown = [k for k, v in params.items() if (k not in self.params().keys())]
            unknown = [k for k, v in params.items() if (k not in current_param_names)]
            if unknown:
                raise ValueError(f"The following parameters do not exist: {unknown}")
        # set known arguments
@@ -358,7 +366,7 @@ def get_overridable_parameters(config: Any) -> Dict[str, param.Parameter]:
    :return: A dictionary of parameter names and their definitions.
    """
    assert isinstance(config, param.Parameterized)
    return dict((k, v) for k, v in config.params().items()
    return dict((k, v) for k, v in config.param.params().items()
                if reason_not_overridable(v) is None)
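A hedged sketch of what this helper returns (`MyConfig` is a hypothetical class): constant or readonly parameters are filtered out, so only plain parameters are reported as overridable.

```python
# Hedged sketch: list the overridable parameters of a param-based config.
import param

from health_azure.utils import get_overridable_parameters


class MyConfig(param.Parameterized):  # hypothetical config class
    learning_rate = param.Number(1e-3, doc="Initial learning rate")
    run_id = param.String("", constant=True)  # constant -> not overridable

overridable = get_overridable_parameters(MyConfig())
print(sorted(overridable))  # 'learning_rate' should be listed, 'run_id' should not
```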
@@ -424,16 +432,17 @@ def report_on_overrides(config: Any, overrides_to_apply: Dict[str, Any], keys_to
    :param keys_to_ignore: set of dictionary keys not to report on
    """
    assert isinstance(config, param.Parameterized)
    current_params = config.param.params()
    for key, desired in overrides_to_apply.items():
        if key in keys_to_ignore:
            continue
        actual = getattr(config, key, None)
        if actual == desired:
            continue
        if key not in config.params():
        if key not in current_params:
            reason = "parameter is undefined"
        else:
            val = config.params()[key]
            val = current_params[key]
            reason = reason_not_overridable(val)  # type: ignore
        if reason is None:
            reason = "for UNKNOWN REASONS"

@@ -456,7 +465,7 @@ def create_from_matching_params(from_object: param.Parameterized, cls_: Type[T])
    c = cls_()
    if not isinstance(c, param.Parameterized):
        raise ValueError(f"The created object must be a subclass of param.Parameterized, but got {type(c)}")
    for param_name, p in c.params().items():
    for param_name, p in c.param.params().items():
        if not p.constant and not p.readonly:
            setattr(c, param_name, getattr(from_object, param_name))
    return c
@@ -648,38 +657,39 @@ def determine_run_id_type(run_or_recovery_id: str) -> str:
    return run_or_recovery_id


def _find_file(file_name: str, stop_at_pythonpath: bool = True) -> Optional[Path]:
def find_file_in_parent_folders(file_name: str, stop_at_path: List[Path]) -> Optional[Path]:
    """Searches for a file of the given name in the current working directory, or any of its parent folders.
    Searching stops if either the file is found, or no parent folder can be found, or the search has reached any
    of the given folders in stop_at_path.

    :param file_name: The name of the file to find.
    :param stop_at_path: A list of folders. If any of them is reached, search stops.
    :return: The absolute path of the file if found, or None if it was not found.
    """
    """
    Recurse up the file system, starting at the current working directory, to find a file. Optionally stop when we hit
    the PYTHONPATH root (defaults to stopping).

    :param file_name: The file name of the file to find.
    :param stop_at_pythonpath: (Defaults to True.) Whether to stop at the PYTHONPATH root.
    :return: The path to the file, or None if it cannot be found.
    """

    def return_file_or_parent(
            start_at: Path,
            file_name: str,
            stop_at_pythonpath: bool,
            pythonpaths: List[Path]) -> Optional[Path]:

        logging.info(f"Searching for file {file_name} in {start_at}")
    def return_file_or_parent(start_at: Path) -> Optional[Path]:
        logging.debug(f"Searching for file {file_name} in {start_at}")
        expected = start_at / file_name
        if expected.is_file() and expected.name == file_name:
            return expected
        if start_at.parent == start_at or start_at in pythonpaths:
        if start_at.parent == start_at or start_at in stop_at_path:
            return None
        return return_file_or_parent(start_at.parent, file_name, stop_at_pythonpath, pythonpaths)
        return return_file_or_parent(start_at.parent)

    return return_file_or_parent(start_at=Path.cwd())


def find_file_in_parent_to_pythonpath(file_name: str) -> Optional[Path]:
    """
    Recurse up the file system, starting at the current working directory, to find a file. Stop when we hit
    any of the folders in PYTHONPATH.

    :param file_name: The file name of the file to find.
    :return: The path to the file, or None if it cannot be found.
    """
    pythonpaths: List[Path] = []
    if 'PYTHONPATH' in os.environ:
        pythonpaths = [Path(path_string) for path_string in os.environ['PYTHONPATH'].split(os.pathsep)]
    return return_file_or_parent(
        start_at=Path.cwd(),
        file_name=file_name,
        stop_at_pythonpath=stop_at_pythonpath,
        pythonpaths=pythonpaths)
    return find_file_in_parent_folders(file_name=file_name, stop_at_path=pythonpaths)


def get_workspace(aml_workspace: Optional[Workspace] = None, workspace_config_path: Optional[Path] = None) -> Workspace:
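A hedged usage sketch for the new search helper (`my_settings.yml` is a hypothetical file name): the search walks upwards from the working directory and stops at any folder listed in `PYTHONPATH`.

```python
# Hedged sketch: search parent folders for a file, stopping at PYTHONPATH roots.
from health_azure.utils import find_file_in_parent_to_pythonpath

settings = find_file_in_parent_to_pythonpath("my_settings.yml")
if settings is None:
    print("File not found between the working directory and the PYTHONPATH root")
else:
    print(f"Found: {settings}")
```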
@@ -705,7 +715,7 @@ def get_workspace(aml_workspace: Optional[Workspace] = None, workspace_config_pa
        return aml_workspace

    if workspace_config_path is None:
        workspace_config_path = _find_file(WORKSPACE_CONFIG_JSON)
        workspace_config_path = find_file_in_parent_to_pythonpath(WORKSPACE_CONFIG_JSON)
    if workspace_config_path:
        logging.info(f"Using the workspace config file {str(workspace_config_path.absolute())}")
    else:

@@ -715,7 +725,9 @@ def get_workspace(aml_workspace: Optional[Workspace] = None, workspace_config_pa
        raise ValueError("Workspace config path is not a path, check your input.")
    elif workspace_config_path.is_file():
        auth = get_authentication()
        return Workspace.from_config(path=str(workspace_config_path), auth=auth)
        workspace = Workspace.from_config(path=str(workspace_config_path), auth=auth)
        logging.info(f"Logged into AzureML workspace {workspace.name}")
        return workspace

    raise ValueError("Workspace config file does not exist or cannot be read.")
@@ -902,6 +914,52 @@ def _retrieve_unique_deps(dependencies: List[str], keep_method: str = "first") -
    return unique_deps_list


def _get_pip_dependencies(parsed_yaml: Any) -> Optional[Tuple[int, List[Any]]]:
    """Gets the first pip dependencies section of a Conda yaml file. Returns the index at which the pip section
    was found, and the pip section itself. If no pip section was found, returns None.
    """
    if CONDA_DEPENDENCIES in parsed_yaml:
        for i, dep in enumerate(parsed_yaml.get(CONDA_DEPENDENCIES)):
            if isinstance(dep, dict) and CONDA_PIP in dep:
                return i, dep[CONDA_PIP]
    return None


def is_pip_include_dependency(package: str) -> bool:
    """Returns True if the given package name (as used in a Conda environment file) relies on PIP includes,
    in the format "-r requirements.txt".

    :param package: The name of the PIP dependency to check.
    :return: True if the package name is a PIP include statement.
    """
    return package.strip().startswith("-r ")


def is_conda_file_with_pip_include(conda_file: Path) -> Tuple[bool, Dict]:
    """Checks if the given Conda environment file uses the "include" syntax in the pip section, like
    `-r requirements.txt`. If it uses pip includes, the function returns True and a modified Conda yaml
    without all the pip include statements. If no pip include statements are found, it returns False and the
    unmodified Conda yaml.

    :param conda_file: The path of a Conda environment file.
    :return: True if the file uses pip includes, False if not. The second return value is the modified Conda
        environment without the PIP include statements.
    """
    conda_yaml = conda_merge.read_file(str(conda_file))
    pip_dep = _get_pip_dependencies(conda_yaml)
    if pip_dep is not None:
        pip_index, pip = pip_dep
        pip_without_include = [package for package in pip if not is_pip_include_dependency(package)]
        if len(pip) != len(pip_without_include):
            if len(pip_without_include) == 0:
                # Avoid an empty PIP dependencies section, this causes a failure in conda_merge
                conda_yaml.get(CONDA_DEPENDENCIES).pop(pip_index)
            else:
                conda_yaml.get(CONDA_DEPENDENCIES)[pip_index] = {CONDA_PIP: pip_without_include}
            return True, conda_yaml
    return False, conda_yaml
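A hedged usage sketch (`env.yml` is a hypothetical path): callers get back both the include-detection flag and a cleaned YAML structure that is safe to merge.

```python
# Hedged sketch: detect pip "-r" includes and obtain a stripped environment.
from pathlib import Path

from health_azure.utils import is_conda_file_with_pip_include

uses_include, cleaned_yaml = is_conda_file_with_pip_include(Path("env.yml"))
if uses_include:
    print("Conda file contains pip '-r' includes; they are absent from cleaned_yaml")
```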
def merge_conda_files(conda_files: List[Path], result_file: Path, pip_files: Optional[List[Path]] = None,
                      pip_clash_keep_method: str = "first") -> None:
    """
@@ -914,43 +972,42 @@ def merge_conda_files(conda_files: List[Path], result_file: Path, pip_files: Opt
    :param pip_clash_keep_method: If two or more pip packages are specified with the same name, this determines
        which one should be kept. Current options: ['first', 'last']
    """
    env_definitions = [conda_merge.read_file(str(f)) for f in conda_files]
    env_definitions: List[Any] = []
    for file in conda_files:
        _, pip_without_include = is_conda_file_with_pip_include(file)
        env_definitions.append(pip_without_include)
    unified_definition = {}
    NAME = "name"
    CHANNELS = "channels"
    DEPENDENCIES = "dependencies"

    extra_pip_deps = []
    for pip_file in pip_files or []:
        with open(pip_file, "r") as f_path:
            additional_pip_deps = [d for d in f_path.read().split("\n") if d]
            extra_pip_deps.extend(additional_pip_deps)
        additional_pip_deps = [d for d in pip_file.read_text().split("\n") if d and not is_pip_include_dependency(d)]
        extra_pip_deps.extend(additional_pip_deps)

    name = conda_merge.merge_names(env.get(NAME) for env in env_definitions)
    name = conda_merge.merge_names(env.get(CONDA_NAME) for env in env_definitions)
    if name:
        unified_definition[NAME] = name
        unified_definition[CONDA_NAME] = name

    try:
        channels = conda_merge.merge_channels(env.get(CHANNELS) for env in env_definitions)
        channels = conda_merge.merge_channels(env.get(CONDA_CHANNELS) for env in env_definitions)
    except conda_merge.MergeError:
        logging.error("Failed to merge channel priorities.")
        raise
    if channels:
        unified_definition[CHANNELS] = channels
        unified_definition[CONDA_CHANNELS] = channels

    try:
        deps_to_merge = [env.get(DEPENDENCIES) for env in env_definitions]
        deps_to_merge = [env.get(CONDA_DEPENDENCIES) for env in env_definitions]
        if len(extra_pip_deps) > 0:
            deps_to_merge.extend([[{"pip": extra_pip_deps}]])
            deps_to_merge.append([{CONDA_PIP: extra_pip_deps}])
        deps = conda_merge.merge_dependencies(deps_to_merge)

        # Remove duplicated pip packages from merged dependencies sections. Note that for a package that is
        # duplicated, the first value encountered will be retained.
        pip_deps_entries = [d for d in deps if isinstance(d, dict) and "pip" in d]  # type: ignore
        pip_deps_entries = [d for d in deps if isinstance(d, dict) and CONDA_PIP in d]  # type: ignore
        if len(pip_deps_entries) == 0:
            raise ValueError("Didn't find a dictionary with the key 'pip' in the list of dependencies")
        pip_deps_entry: Dict[str, List[str]] = pip_deps_entries[0]
        pip_deps = pip_deps_entry["pip"]
        pip_deps = pip_deps_entry[CONDA_PIP]
        # temporarily remove pip dependencies from deps to be added back after deduplication
        deps.remove(pip_deps_entry)

@@ -960,13 +1017,13 @@ def merge_conda_files(conda_files: List[Path], result_file: Path, pip_files: Opt
        unique_pip_deps = _retrieve_unique_deps(pip_deps, keep_method=pip_clash_keep_method)

        # finally add back the deduplicated list of dependencies
        unique_deps.append({"pip": unique_pip_deps})  # type: ignore
        unique_deps.append({CONDA_PIP: unique_pip_deps})  # type: ignore

    except conda_merge.MergeError:
        logging.error("Failed to merge dependencies.")
        raise
    if unique_deps:
        unified_definition[DEPENDENCIES] = unique_deps
        unified_definition[CONDA_DEPENDENCIES] = unique_deps
    else:
        raise ValueError("No dependencies found in any of the conda files.")
@@ -1439,7 +1496,7 @@ def _get_runs_from_script_config(script_config: AmlRunScriptConfig, workspace: W
    if script_config.run is None:
        if script_config.experiment is None:
            # default to latest run file
            latest_run_file = _find_file("most_recent_run.txt")
            latest_run_file = find_file_in_parent_to_pythonpath("most_recent_run.txt")
            if latest_run_file is None:
                raise ValueError("Could not find most_recent_run.txt")
            runs = [get_most_recent_run(latest_run_file, workspace)]

@@ -1678,7 +1735,7 @@ def aml_workspace_for_unittests() -> Workspace:
    is found, the workspace details are read from environment variables. Authentication information is also read
    from environment variables.
    """
    config_json = _find_file(WORKSPACE_CONFIG_JSON)
    config_json = find_file_in_parent_to_pythonpath(WORKSPACE_CONFIG_JSON)
    if config_json is not None:
        return Workspace.from_config(path=str(config_json))
    else:
@@ -1,6 +0,0 @@
name: test-env
dependencies:
  - pip=20.1.1
  - python=3.7.3
  - pip:
    - -r ../test_requirements.txt
@@ -3,13 +3,19 @@
#  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
#  ------------------------------------------------------------------------------------------
import shutil
import sys
import uuid
from pathlib import Path
from typing import Generator

import pytest

from health_azure.himl import _package_setup
full_folder = str(Path(__file__).parent.parent / "src")
if full_folder not in sys.path:
    print(f"Adding to sys.path for running hi-ml-azure: {full_folder}")
    sys.path.insert(0, str(full_folder))

from health_azure.himl import _package_setup  # noqa: E402


def outputs_for_tests() -> Path:
@@ -29,6 +29,7 @@ from azureml.core import Experiment, Run, ScriptRunConfig, Workspace
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core.environment import CondaDependencies
from azureml.data.azure_storage_datastore import AzureBlobDatastore
from health_azure import paths

import health_azure.utils as util
from health_azure.himl import AML_IGNORE_FILE, append_to_amlignore

@@ -61,10 +62,10 @@ def test_find_file(tmp_path: Path) -> None:
    start_path.mkdir(exist_ok=False)
    where_are_we_now = Path.cwd()
    os.chdir(start_path)
    found_file = util._find_file(file_name, False)
    found_file = util.find_file_in_parent_to_pythonpath(file_name)
    assert found_file
    with mock.patch.dict(os.environ, {"PYTHONPATH": str(python_root.absolute())}):
        found_file = util._find_file(file_name)
        found_file = util.find_file_in_parent_to_pythonpath(file_name)
        assert not found_file
    os.chdir(where_are_we_now)
@@ -384,6 +385,110 @@ dependencies:
        util.merge_conda_files(files, merged_file)


def test_merge_conda_pip_include(random_folder: Path) -> None:
    """
    Tests the logic to exclude PIP include statements from Conda environments.
    """
    env1 = """
channels:
  - default
dependencies:
  - conda_both=3.0
  - pip:
    - -r requirements.txt
    - foo==1.0
"""
    file1 = random_folder / "env1.yml"
    file1.write_text(env1)
    merged_file = random_folder / "merged.yml"
    util.merge_conda_files([file1], merged_file)
    merged_contents = merged_file.read_text()
    assert "-r requirements.txt" not in merged_contents

    file2 = random_folder / "requirements.txt"
    file2.write_text("package==1.0.0")
    merged_file2 = random_folder / "merged2.yml"
    util.merge_conda_files([file1], merged_file2, pip_files=[file2])
    merged_contents2 = merged_file2.read_text()
    assert merged_contents2 == """channels:
- default
dependencies:
- conda_both=3.0
- pip:
  - foo==1.0
  - package==1.0.0
"""


def test_merge_conda_pip_include2(random_folder: Path) -> None:
    """
    Tests the logic to exclude PIP include statements from Conda environments, on the root-level environment file.
    """
    if paths.is_himl_used_from_git_repo():
        root_yaml = paths.git_repo_root_folder() / paths.ENVIRONMENT_YAML_FILE_NAME
        requirements = paths.git_repo_root_folder() / "hi-ml-azure" / "run_requirements.txt"
        merged_file2 = random_folder / "merged2.yml"
        util.merge_conda_files([root_yaml], merged_file2, pip_files=[requirements])


def assert_pip_length(yaml: Any, expected_length: int) -> None:
    """Checks if the pip dependencies section of a Conda YAML file has the expected number of entries."""
    pip = util._get_pip_dependencies(yaml)
    assert pip is not None
    assert len(pip[1]) == expected_length


@pytest.mark.fast
def test_pip_include_1() -> None:
    """Test if Conda files that use PIP include are handled correctly. This uses the top-level environment.yml
    file in the repository.
    """
    if paths.is_himl_used_from_git_repo():
        root_yaml = paths.git_repo_root_folder() / paths.ENVIRONMENT_YAML_FILE_NAME
        assert root_yaml.is_file()
        original_yaml = conda_merge.read_file(root_yaml)
        # At the time of writing, the top-level environment file only had 4 include statements in the pip
        # section, they should all be filtered out.
        assert_pip_length(original_yaml, 4)
        uses_pip_include, modified_yaml = util.is_conda_file_with_pip_include(root_yaml)
        assert uses_pip_include
        pip = util._get_pip_dependencies(modified_yaml)
        # The pip section of the top-level yaml has nothing but include statements, so after filtering the
        # pip section is empty. In this case, no pip section should be present at all.
        assert pip is None


@pytest.mark.fast
def test_pip_include_2(tmp_path: Path) -> None:
    """Test if Conda files that use PIP include are recognized."""
    # Environment file without a "-r" include statement
    conda_str = """name: simple-envpip
dependencies:
  - pip:
    - azureml-sdk==1.23.0
  - more_conda
"""
    tmp_conda = tmp_path / "env.yml"
    tmp_conda.write_text(conda_str)
    uses_pip_include, modified_yaml = util.is_conda_file_with_pip_include(tmp_conda)
    assert not uses_pip_include
    assert_pip_length(modified_yaml, 1)

    # Environment file that has a "-r" include statement
    conda_str = """name: simple-env
dependencies:
  - pip:
    - -r foo.txt
    - any_package
"""
    tmp_conda.write_text(conda_str)
    uses_pip_include, modified_yaml = util.is_conda_file_with_pip_include(tmp_conda)
    assert uses_pip_include
    assert util._get_pip_dependencies(modified_yaml) == (0, ["any_package"])


@pytest.mark.parametrize(["s", "expected"],
                         [
                             ("1s", 1),
@@ -392,6 +497,7 @@ dependencies:
                             ("1.0d", 24 * 3600),
                             ("", None),
                         ])  # NOQA
@pytest.mark.fast
def test_run_duration(s: str, expected: Optional[float]) -> None:
    actual = util.run_duration_string_to_seconds(s)
    assert actual == expected

@@ -399,11 +505,13 @@ def test_run_duration(s: str, expected: Optional[float]) -> None:
        assert isinstance(actual, int)


@pytest.mark.fast
def test_run_duration_fails() -> None:
    with pytest.raises(Exception):
        util.run_duration_string_to_seconds("17b")


@pytest.mark.fast
def test_repository_root() -> None:
    root = repository_root()
    assert (root / "SECURITY.md").is_file()
@@ -17,9 +17,12 @@ pip_build:
pip_test:
	$(call call_parent,pip_test)

# pip install local package in editable mode for development and testing
# pip install all requirements for histo, read off the Conda file. This is somewhat hacky,
# we could also build a full Conda before starting the tests. Unclear about the performance
# impact of that.
call_pip_local:
	ls
	sed -e '1,/pip:/ d' environment.yml | cut -d "-" -f 2- > temp_requirements.txt
	pip install -r temp_requirements.txt

# pip upgrade and install local package in editable mode
pip_local: pip_upgrade call_pip_local

@@ -38,13 +41,6 @@ clean:
	rm -vrf ./testhisto/testhisto/test_outputs ./testhistotestSSL/test_ouputs
	rm -vf ./coverage ./coverage.txt ./coverage.xml

# build package, assuming build requirements already installed
call_build:
	ls

# pip install build requirements and build package
build: pip_build call_build

# run flake8, assuming test requirements already installed
call_flake8:
	flake8 --count --statistics --config=../.flake8 .

@@ -91,4 +87,4 @@ call_pytest_and_coverage:
pytest_and_coverage: pip_test call_pytest_and_coverage

# install test requirements and run all tests
test_all: pip_test call_flake8 call_mypy call_pytest_and_coverage
test_all: pip_test call_flake8 call_mypy call_pytest_and_coverage
@@ -0,0 +1,9 @@
# Histopathology Models and Workflows

## Getting started

- Build environment
- Download config to AzureML workspace
- Run a first workflow.

To be completed.
@@ -4,29 +4,29 @@ channels:
  - pytorch
  - conda-forge
dependencies:
  - cudatoolkit=11.1
  - cudatoolkit=11.3.1
  - pip=20.1.1
  - python=3.7.3
  - pytorch=1.10.0
  - python-blosc==1.7.0
  - torchvision=0.11.1
  - pip:
    - -r ../test_requirements.txt
    - azureml-sdk==1.36.0
    - cryptography==3.3.2
    - docker==4.3.1
    - flask==2.0.1
    - gputil==1.4.0
    - hi-ml>=0.1.12
    - joblib==0.16.0
    - jupyter==1.0.0
    - jupyter-client==6.1.5
    - lightning-bolts==0.4.0
    - mlflow==1.17.0
    - monai==0.6.0
    - more-itertools==8.10.0
    - mypy-extensions==0.4.3
    - numba==0.51.2
    - numpy==1.19.1
    - opencv-python-headless==4.5.1.48
    - pandas==1.3.4
    - pillow==9.0.0
    - psutil==5.7.2
    - pydicom==2.0.0
    - pyflakes==2.2.0
    - PyJWT==1.7.1
    - rich==5.1.1

@@ -34,8 +34,12 @@ dependencies:
    - runstats==1.8.0
    - scikit-image==0.17.2
    - scipy==1.5.2
    - simpleitk==1.2.4
    - six==1.15.0
    - stopit==1.1.2
    - tabulate==0.8.7
    - torchprof==1.3.3
    - torch>=1.10.0
    - torchvision>=0.11.1
    - torchmetrics==0.6.0
    - umap-learn==0.5.2
    - yacs==0.1.8
@@ -3,6 +3,6 @@ testpaths = testhisto testSSL
norecursedirs = docs logs outputs test_data
log_cli = true
log_cli_level = DEBUG
adopts = --strict-markers
addopts = --strict-markers
markers =
    fast: Tests that should run very fast, and can act as smoke tests to see if something goes terribly wrong.
@@ -51,6 +51,8 @@ class BaseMIL(LightningContainer):
                                                 "`none` (default), `cpu`, `gpu`")
    encoding_chunk_size: int = param.Integer(0, doc="If > 0 performs encoding in chunks, by loading "
                                                    "encoding_chunk_size tiles per chunk")
    is_finetune: bool = param.Boolean(False, doc="If True, fine-tune the encoder during training. If False, "
                                                 "keep the encoder frozen.")
    # local_dataset (used as data module root_path) is declared in DatasetParams superclass

    @property
@@ -58,8 +58,8 @@ class DeepSMILECrck(BaseMIL):
            # declared in TrainerParams:
            max_epochs=50,
            # declared in WorkflowParams:
            # number_of_cross_validation_splits=5,
            # cross_validation_split_index=0,
            # crossval_count=5,
            # crossval_index=0,
            # declared in OptimizerParams:
            l_rate=5e-4,
            weight_decay=1e-4,

@@ -122,8 +122,8 @@ class DeepSMILECrck(BaseMIL):
            cache_mode=self.cache_mode,
            precache_location=self.precache_location,
            cache_dir=self.cache_dir,
            number_of_cross_validation_splits=self.number_of_cross_validation_splits,
            cross_validation_split_index=self.cross_validation_split_index,
            crossval_count=self.crossval_count,
            crossval_index=self.crossval_index,
        )

    def get_callbacks(self) -> List[Callback]:
@@ -60,8 +60,8 @@ class DeepSMILEPanda(BaseMIL):
            # use_mixed_precision = True,

            # declared in WorkflowParams:
            number_of_cross_validation_splits=5,
            cross_validation_split_index=0,
            crossval_count=5,
            crossval_index=0,

            # declared in OptimizerParams:
            l_rate=5e-4,

@@ -125,8 +125,8 @@ class DeepSMILEPanda(BaseMIL):
            cache_mode=self.cache_mode,
            precache_location=self.precache_location,
            cache_dir=self.cache_dir,
            # number_of_cross_validation_splits=self.number_of_cross_validation_splits,
            # cross_validation_split_index=self.cross_validation_split_index,
            # crossval_count=self.crossval_count,
            # crossval_index=self.crossval_index,
        )

        # TODO: move self.class_names somewhere else since this is almost an exact copy of create_model in BaseMIL
@@ -39,8 +39,8 @@ class TilesDataModule(LightningDataModule):
                 cache_mode: CacheMode = CacheMode.NONE,
                 precache_location: CacheLocation = CacheLocation.NONE,
                 cache_dir: Optional[Path] = None,
                 number_of_cross_validation_splits: int = 0,
                 cross_validation_split_index: int = 0) -> None:
                 crossval_count: int = 0,
                 crossval_index: int = 0) -> None:
        """
        :param root_path: Root directory of the source dataset.
        :param max_bag_size: Upper bound on number of tiles in each loaded bag. If 0 (default),

@@ -67,8 +67,8 @@ class TilesDataModule(LightningDataModule):
            device it was saved from;
            If cache_mode is `DISK` precache_location `CPU` and `GPU` are equivalent.
        :param cache_dir: The directory onto which to cache data if caching is enabled.
        :param number_of_cross_validation_splits: Number of folds to perform.
        :param cross_validation_split_index: Index of the cross validation split to be performed.
        :param crossval_count: Number of folds to perform.
        :param crossval_index: Index of the cross validation split to be performed.
        """
        if precache_location is not CacheLocation.NONE and cache_mode is CacheMode.NONE:
            raise ValueError("Can only pre-cache if caching is enabled")

@@ -85,8 +85,8 @@ class TilesDataModule(LightningDataModule):
        self.precache_location = precache_location
        self.cache_dir = cache_dir
        self.batch_size = batch_size
        self.number_of_cross_validation_splits = number_of_cross_validation_splits
        self.cross_validation_split_index = cross_validation_split_index
        self.crossval_count = crossval_count
        self.crossval_index = crossval_index
        self.train_dataset, self.val_dataset, self.test_dataset = self.get_splits()
        self.class_weights = self.train_dataset.get_class_weights()
        self.seed = seed
@@ -31,9 +31,9 @@ class TcgaCrckTilesDataModule(TilesDataModule):
                                        group_column=trainval_dataset.SLIDE_ID_COLUMN,
                                        random_seed=5)

        # if self.number_of_cross_validation_splits > 1:
        # if self.crossval_count > 1:
        #     # Function get_k_fold_cross_validation_splits() will concatenate train and val splits
        #     splits = splits.get_k_fold_cross_validation_splits(self.number_of_cross_validation_splits)
        #     splits = splits.get_k_fold_cross_validation_splits(self.crossval_count)
        #     [self.cross_validation_split_index]

        return (TcgaCrck_TilesDataset(self.root_path, dataset_df=splits.train),
@@ -116,7 +116,7 @@ def test_ssl_container_cifar10_resnet_simclr() -> None:
    - checkpoint loading and ImageClassifier module creation
    - training of image classifier for one epoch.
    """
    model_namespace_simclr = "hi-ml-histopathology.SSL.configs.CIFAR10SimCLR"
    model_namespace_simclr = "SSL.configs.CIFAR10SimCLR"
    args = common_test_args + [f"--model={model_namespace_simclr}"]
    runner = default_runner()
    with check_config_json(Path.cwd()):

@@ -160,7 +160,7 @@ def test_ssl_container_cifar10_resnet_simclr() -> None:
    assert SslOnlineEvaluatorHiml.EVALUATOR_STATE_NAME in callback_state

    # Now run the actual SSL classifier off the stored checkpoint
    model_namespace_cifar = "hi-ml-histopathology.SSL.configs.SSLClassifierCIFAR"
    model_namespace_cifar = "SSL.configs.SSLClassifierCIFAR"
    args = common_test_args + [f"--model={model_namespace_cifar}",
                               f"--local_ssl_weights_path={checkpoint_path}"]
    with check_config_json(Path.cwd()):

@@ -180,7 +180,7 @@ def test_load_ssl_container_cifar10_cifar100_resnet_byol() -> None:
    Tests that the parameters fed into the BYOL model and online evaluator are
    indeed the ones we fed through our command line args
    """
    model_namespace_byol = "hi-ml-histopathology.SSL.configs.CIFAR10CIFAR100BYOL"
    model_namespace_byol = "SSL.configs.CIFAR10CIFAR100BYOL"
    args = common_test_args + [f"--model={model_namespace_byol}"]
    runner = default_runner()
    with mock.patch("sys.argv", args):

@@ -199,7 +199,7 @@ def test_ssl_container_rsna() -> None:
    runner = default_runner()
    path_to_cxr_test_dataset = TEST_OUTPUTS_PATH / "cxr_test_dataset"
    # Test training of SSL model
    model_namespace_byol = "hi-ml-histopathology.SSL.configs.NIH_RSNA_BYOL"
    model_namespace_byol = "SSL.configs.NIH_RSNA_BYOL"
    args = common_test_args + [f"--model={model_namespace_byol}",
                               f"--local_datasets={str(path_to_cxr_test_dataset)},{str(path_to_cxr_test_dataset)}",
                               "--use_balanced_binary_loss_for_linear_head=True",

@@ -249,7 +249,7 @@ def test_ssl_container_rsna() -> None:

    # Check that we are able to load the checkpoint and create classifier model
    checkpoint_path = loaded_config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    model_namespace_cxr = "hi-ml-histopathology.SSL.configs.CXRImageClassifier"
    model_namespace_cxr = "SSL.configs.CXRImageClassifier"
    args = common_test_args + [f"--model={model_namespace_cxr}",
                               f"--local_datasets={str(path_to_cxr_test_dataset)}",
                               "--use_balanced_binary_loss_for_linear_head=True",
@@ -0,0 +1,12 @@
# This environment definition only specifies python and pytorch itself. It will be picked up automatically
# for AzureML jobs that are started from within the hi-ml folder. The submission logic will automatically add the
# pip packages required by health_azure and health_ml
name: himl-basic
channels:
  - defaults
  - pytorch
dependencies:
  - pip=20.1.1
  - python=3.7.3
  - pytorch=1.10.0
  - cudatoolkit=11.3.1

@@ -1,6 +1,8 @@
[pytest]
testpaths = testhiml
norecursedirs = outputs test_data
log_cli = True
log_cli_level = DEBUG
adopts = --strict-markers
addopts = --strict-markers
markers =
    fast: Tests that should run very fast, and can act as smoke tests to see if something goes terribly wrong.
@@ -1,10 +1,10 @@
dataclasses-json==0.5.2
hi-ml-azure>=0.1.8
jinja2==3.0.2
matplotlib==3.4.3
opencv-python-headless==4.5.1.48
pandas==1.3.4
pytorch-lightning==1.5.5
rpdb==0.1.6
torchvision==0.11.1
torch>=1.8
jinja2>=3.0.2
matplotlib>=3.4.3
opencv-python-headless>=4.5.1.48
pandas>=1.3.4
pytorch-lightning>=1.5.5
rpdb>=0.1.6
torchvision>=0.11.1
torch>=1.10.0

@@ -5,7 +5,6 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import torch
from pytorch_lightning import LightningDataModule, LightningModule
from torchmetrics import MeanAbsoluteError
@@ -16,63 +15,91 @@ from torch.utils.data import DataLoader, Dataset
from health_ml.lightning_container import LightningContainer


class HelloDataset(Dataset):
def _create_1d_regression_dataset(n: int = 100, seed: int = 0) -> torch.Tensor:
    """Creates a simple 1-D dataset of a noisy linear function.

    :param n: The number of datapoints to generate, defaults to 100
    :type n: int, optional
    :param seed: Random number generator seed, defaults to 0
    :type seed: int, optional
    :return: A tensor that contains X values in [:, 0] and Y values in [:, 1]
    :rtype: torch.Tensor
    """
    A simple 1-dim regression task, read from a data file stored in the test data folder.
    torch.manual_seed(seed)
    x = torch.rand((n, 1)) * 10
    y = 0.2 * x + 0.1 * torch.randn(x.size())
    xy = torch.cat((x, y), dim=1)
    return xy


def _split_crossval(xy: torch.Tensor, crossval_count: int, crossval_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    # Creating the data file:
    # import numpy as np
    # import torch
    #
    # N = 100
    # x = torch.rand((N, 1)) * 10
    # y = 0.2 * x + 0.1 * torch.randn(x.size())
    # xy = torch.cat((x, y), dim=1)
    # np.savetxt("health_ml/configs/hellocontainer.csv", xy.numpy(), delimiter=",")
    def __init__(self, raw_data: List[List[float]]) -> None:
    Generates a split of the given dataset along the first dimension for cross-validation.

    :param xy: The data that should be split. The split will be generated across dimension 0.
    :type xy: torch.Tensor
    :param crossval_count: The number of splits in total
    :type crossval_count: int
    :param crossval_index: The index of the split that should be generated (0 <= crossval_index < crossval_count)
    :type crossval_index: int
    :return: A tuple of (training data, validation data)
    :rtype: Tuple[torch.Tensor, torch.Tensor]
    """
    n = xy.shape[0]
    split_size = n // crossval_count
    val_start = crossval_index * split_size
    val_end = (crossval_index + 1) * split_size
    # The training data is everything before and after the validation fold
    train1_start = 0
    train1_end = val_start
    train2_start = val_end if crossval_index < (crossval_count - 1) else 0
    train2_end = n if crossval_index < (crossval_count - 1) else 0
    val = xy[val_start:val_end]
    train = torch.cat([xy[train1_start:train1_end], xy[train2_start:train2_end]])
    return (train, val)
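
A quick worked example of the resulting split sizes (a sketch, not part of the committed file): with 100 rows and 5 folds, fold 1 validates on rows 20..39 and trains on the remaining 80 rows:

    xy = _create_1d_regression_dataset(n=100)
    train, val = _split_crossval(xy, crossval_count=5, crossval_index=1)
    assert val.shape[0] == 20    # rows 20..39 form the validation fold
    assert train.shape[0] == 80  # all remaining rows are used for training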


class HelloWorldDataset(Dataset):
    """
    A simple 1-dim regression task
    """

    def __init__(self, xy: torch.Tensor) -> None:
        """
        Creates the 1-dim regression dataset.

        :param raw_data: The raw data. This must be numeric data which can be converted into a tensor.
        See the static method from_path_and_indexes for an example call.
        :param xy: The raw data, x in the first column, y in the second column
        """
        super().__init__()  # type: ignore
        self.data = torch.tensor(raw_data, dtype=torch.float)
        self.xy = xy

    def __len__(self) -> int:
        return self.data.shape[0]
        return self.xy.shape[0]

    def __getitem__(self, item: int) -> Dict[str, torch.Tensor]:
        return {'x': self.data[item][0:1], 'y': self.data[item][1:2]}

    @staticmethod
    def from_path_and_indexes(
            root_folder: Path,
            start_index: int,
            end_index: int) -> 'HelloDataset':
        """
        Static method to instantiate a HelloDataset from the root folder with the start and end indexes.

        :param root_folder: The folder in which the data file lives ("hellocontainer.csv")
        :param start_index: The first row to read.
        :param end_index: The last row to read (exclusive)
        :return: A new instance based on the root folder and the start and end indexes.
        """
        raw_data = np.loadtxt(root_folder / "hellocontainer.csv", delimiter=",")[start_index:end_index]
        return HelloDataset(raw_data)
        return {"x": self.xy[item][0:1], "y": self.xy[item][1:2]}


class HelloDataModule(LightningDataModule):
class HelloWorldDataModule(LightningDataModule):
    """
    A data module that gives the training, validation and test data for a simple 1-dim regression task.
    """
    def __init__(
            self,
            root_folder: Path) -> None:

    def __init__(self, crossval_count: int, crossval_index: int) -> None:
        super().__init__()
        self.train = HelloDataset.from_path_and_indexes(root_folder, start_index=0, end_index=50)
        self.val = HelloDataset.from_path_and_indexes(root_folder, start_index=50, end_index=70)
        self.test = HelloDataset.from_path_and_indexes(root_folder, start_index=70, end_index=100)
        n_total = 200
        xy = _create_1d_regression_dataset(n=n_total)
        n_test = 40
        n_val = 50
        self.test = HelloWorldDataset(xy=xy[:n_test])
        if crossval_count <= 1:
            self.val = HelloWorldDataset(xy=xy[n_test:(n_test + n_val)])
            self.train = HelloWorldDataset(xy=xy[(n_test + n_val):])
        else:
            # This could be done via a library function like sklearn's KFold function, but we don't want to add
            # scikit-learn as a dependency just for this example.
            train, val = _split_crossval(xy[n_test:], crossval_count=crossval_count, crossval_index=crossval_index)
            self.val = HelloWorldDataset(xy=val)
            self.train = HelloWorldDataset(xy=train)

    def prepare_data(self, *args: Any, **kwargs: Any) -> None:
        pass
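
A minimal usage sketch for the new data module (illustrative only; the runner normally constructs it via the container's get_data_module):

    # Plain fixed split when cross-validation is disabled
    data_module = HelloWorldDataModule(crossval_count=1, crossval_index=0)
    # 5-fold cross-validation, this instance trains against fold 3
    data_module = HelloWorldDataModule(crossval_count=5, crossval_index=3)
    print(len(data_module.train), len(data_module.val), len(data_module.test))  # 128 32 40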

@@ -126,8 +153,9 @@ class HelloRegression(LightningModule):
        self.log("loss", loss, on_epoch=True, on_step=False)
        return loss

    def validation_step(self, batch: Dict[str, torch.Tensor], *args: Any,  # type: ignore
                        **kwargs: Any) -> torch.Tensor:
    def validation_step(  # type: ignore
        self, batch: Dict[str, torch.Tensor], *args: Any, **kwargs: Any
    ) -> torch.Tensor:
        """
        This method is part of the standard PyTorch Lightning interface. For an introduction, please see
        https://pytorch-lightning.readthedocs.io/en/stable/starter/converting.html

@@ -208,7 +236,7 @@ class HelloRegression(LightningModule):
        Path("test_mae.txt").write_text(str(self.test_mae.compute().item()))


class HelloContainer(LightningContainer):
class HelloWorld(LightningContainer):
    """
    An example container for using the hi-ml runner. This container has methods
    to generate the actual Lightning model, and read out the datamodule that will be used for training.

@@ -231,5 +259,7 @@ class HelloContainer(LightningContainer):
    # in turn contains 3 data loaders for training, validation, and test set.
    def get_data_module(self) -> LightningDataModule:
        assert self.local_dataset_dir is not None
        return HelloDataModule(
            root_folder=self.local_dataset_dir)  # type: ignore
        # If you would like to use the built-in cross-validation functionality that runs training in parallel,
        # you need to provide the crossval parameters in the LightningContainer to the datamodule. The
        # datamodule must carry out appropriate splitting of the data.
        return HelloWorldDataModule(crossval_count=self.crossval_count, crossval_index=self.crossval_index)
@@ -10,15 +10,18 @@ from pathlib import Path
from typing import List, Optional

import param
from azureml.train.hyperdrive import HyperDriveConfig
from param import Parameterized

from health_azure import create_crossval_hyperdrive_config
from health_azure.utils import RUN_CONTEXT, PathOrString, is_running_in_azure_ml

from health_ml.utils import fixed_paths
from health_ml.utils.common_utils import (CHECKPOINT_FOLDER,
                                          create_unique_timestamp_id,
                                          DEFAULT_AML_UPLOAD_DIR,
                                          DEFAULT_LOGS_DIR_NAME, is_windows, parse_model_id_and_version)
                                          DEFAULT_LOGS_DIR_NAME,
                                          parse_model_id_and_version)
from health_ml.utils.type_annotations import TupleFloat2


@@ -43,16 +46,6 @@ class LRSchedulerType(Enum):
    MultiStep = "MultiStep"


@unique
class MultiprocessingStartMethod(Enum):
    """
    Different methods for starting data loader processes.
    """
    fork = "fork"
    forkserver = "forkserver"
    spawn = "spawn"


@unique
class OptimizerType(Enum):
    """
@@ -79,7 +72,7 @@ class ExperimentFolderHandler(Parameterized):
    def create(project_root: Path,
               is_offline_run: bool,
               model_name: str,
               output_to: Path = Path()) -> ExperimentFolderHandler:
               output_to: Optional[Path] = None) -> ExperimentFolderHandler:
        """
        Creates a new object that holds output folder configurations. When running inside of AzureML, the output
        folders will be directly under the project root. If not running inside AzureML, a folder with a timestamp

@@ -96,8 +89,7 @@ class ExperimentFolderHandler(Parameterized):
        """
        if not project_root.is_absolute():
            raise ValueError(f"The project root is required to be an absolute path, but got {project_root}")
        # output_to by default will be Path(), which is not None, but Path().stem is an empty string
        if is_offline_run or output_to.stem:
        if is_offline_run or output_to:
            if output_to:
                logging.info(f"All results will be written to the specified output folder {output_to}")
                root = Path(output_to).absolute()
@@ -140,20 +132,17 @@ class WorkflowParams(param.Parameterized):
    model_id: str = param.String(default="",
                                 doc="A model id string in the form 'model name:version' "
                                     "to use a registered model for inference.")
    multiprocessing_start_method: MultiprocessingStartMethod = \
        param.ClassSelector(class_=MultiprocessingStartMethod,
                            default=(MultiprocessingStartMethod.spawn if is_windows()
                                     else MultiprocessingStartMethod.fork),
                            doc="Method to be used to start child processes in pytorch. Should be one of forkserver, "
                                "fork or spawn. If not specified, fork is used on Linux and spawn on Windows. "
                                "Set to forkserver as a possible remedy for stuck jobs.")
    regression_test_folder: Optional[Path] = \
        param.ClassSelector(class_=Path, default=None, allow_None=True,
                            doc="A path to a folder that contains a set of files. At the end of training and "
                                "model evaluation, all files given in that folder must be present in the job's output "
                                "folder, and their contents must match exactly. When running in AzureML, you need to "
                                "ensure that this folder is part of the snapshot that gets uploaded. The path should "
                                "be relative to the repository root directory.")
    crossval_count: int = param.Integer(default=1, bounds=(0, None),
                                        doc="The number of splits to use when doing cross-validation. "
                                            "Use 1 to disable cross-validation.")
    crossval_index: int = param.Integer(default=0, bounds=(0, None),
                                        doc="When doing cross-validation, this is the index of the current "
                                            "split. Valid values: 0 .. (crossval_count - 1)")
    hyperdrive: bool = param.Boolean(False, doc="If True, use the Hyperdrive configuration specified in the "
                                                "LightningContainer to run hyperparameter tuning. If False, just "
                                                "run a plain single training job.")
    CROSSVAL_INDEX_ARG_NAME = "crossval_index"
    CROSSVAL_COUNT_ARG_NAME = "crossval_count"

    def validate(self) -> None:
        if sum([bool(param) for param in [self.weights_url, self.local_weights_path, self.model_id]]) > 1:
@@ -162,6 +151,10 @@ class WorkflowParams(param.Parameterized):
        if self.model_id:
            parse_model_id_and_version(self.model_id)

        if self.crossval_count > 1:
            if not (0 <= self.crossval_index < self.crossval_count):
                raise ValueError(f"Attribute crossval_index out of bounds (crossval_count = {self.crossval_count})")
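
For example, with crossval_count=5 the valid fold indices are 0 through 4; anything outside that range is rejected (a behavioural sketch, not part of the file):

    params = WorkflowParams(crossval_count=5, crossval_index=4)
    params.validate()  # passes: 4 is the last valid fold index
    params.crossval_index = 5
    params.validate()  # raises ValueError: crossval_index out of bounds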

    @property
    def is_running_in_aml(self) -> bool:
        """

@@ -180,21 +173,37 @@ class WorkflowParams(param.Parameterized):
            seed = self.random_seed
        return seed

    @property
    def is_crossvalidation_enabled(self) -> bool:
        """
        Returns True if the present parameters indicate that cross-validation should be used.
        """
        return self.crossval_count > 1

    def get_crossval_hyperdrive_config(self) -> HyperDriveConfig:
        # For cross-validation, the name of the metric to monitor does not matter, because no early termination
        # or suchlike is specified.
        return create_crossval_hyperdrive_config(num_splits=self.crossval_count,
                                                 cross_val_index_arg_name=self.CROSSVAL_INDEX_ARG_NAME,
                                                 metric_name="val/loss")
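
Together with the runner changes further below, this is what turns a single training job into one HyperDrive child run per fold. A cross-validation run can then be requested from the command line; the cluster name is a placeholder assumption:

    python hi-ml/src/health_ml/runner.py --model=HelloWorld --crossval_count=5 --cluster=<your-cluster-name>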


class DatasetParams(param.Parameterized):
    azure_datasets: List[str] = param.List(default=[], class_=str,
                                           doc="If provided, the ID of one or more datasets to use when running in"
                                               " AzureML.This dataset must exist as a folder of the same name in the"
                                               " 'datasets' container in the datasets storage account. This dataset"
                                               " will be mounted and made available at the 'local_dataset' path"
                                               " when running in AzureML.")
                                               " AzureML. This dataset must exist as a folder of the same name "
                                               "in the 'datasets' container in the datasets storage account. This "
                                               "dataset will be mounted and made available at the 'local_dataset' "
                                               "path when running in AzureML.")
    local_datasets: List[Path] = param.List(default=[], class_=Path,
                                            doc="A list of one or more paths to the dataset to use, when training"
                                                " outside of Azure ML.")
    dataset_mountpoints: List[Path] = param.List(default=[], class_=Path,
                                                 doc="The path at which the AzureML dataset should be made available "
                                                     "via mounting or downloading. This only affects jobs running in "
                                                     "AzureML. If empty, use a random mount/download point.")
                                                 doc="The path at which the AzureML dataset should be made "
                                                     "available via mounting or downloading. This only affects "
                                                     "jobs running in AzureML. If empty, use a random "
                                                     "mount/download point.")

    def validate(self) -> None:
        if (not self.azure_datasets) and (not self.local_datasets):
@@ -207,10 +216,10 @@ class DatasetParams(param.Parameterized):


class OutputParams(param.Parameterized):
    output_to: Path = param.ClassSelector(class_=Path, default=Path(),
                                          doc="If provided, the run outputs will be written to the given folder. If "
                                              "not provided, outputs will go into a subfolder of the project root "
                                              "folder.")
    output_to: Optional[Path] = param.ClassSelector(class_=Path, default=None,
                                                    doc="If provided, the run outputs will be written to the given "
                                                        "folder. If not provided, outputs will go into a subfolder "
                                                        "of the project root folder.")
    file_system_config: ExperimentFolderHandler = param.ClassSelector(default=ExperimentFolderHandler(),
                                                                      class_=ExperimentFolderHandler,
                                                                      instantiate=False,

@@ -229,14 +238,15 @@ class OutputParams(param.Parameterized):

    def set_output_to(self, output_to: PathOrString) -> None:
        """
        Adjusts the file system settings in the present object such that all outputs are written to the given folder.
        Adjusts the file system settings in the present object such that all outputs are written to the given
        folder.

        :param output_to: The absolute path to a folder that should contain the outputs.
        """
        self.output_to = Path(output_to)
        self.create_filesystem()
        self.create_filesystem(project_root=fixed_paths.repository_root_directory())

    def create_filesystem(self, project_root: Path = fixed_paths.repository_root_directory()) -> None:
    def create_filesystem(self, project_root: Path) -> None:
        """
        Creates new file system settings (outputs folder, logs folder) based on the information stored in the
        present object. If any of the folders do not yet exist, they are created.
@@ -268,7 +278,8 @@

class OptimizerParams(param.Parameterized):
    l_rate: float = param.Number(1e-4, doc="The initial learning rate", bounds=(0, None))
    _min_l_rate: float = param.Number(0.0, doc="The minimum learning rate for the Polynomial and Cosine schedulers.",
    _min_l_rate: float = param.Number(0.0,
                                      doc="The minimum learning rate for the Polynomial and Cosine schedulers.",
                                      bounds=(0.0, None))
    l_rate_scheduler: LRSchedulerType = param.ClassSelector(default=LRSchedulerType.Polynomial,
                                                            class_=LRSchedulerType,

@@ -338,19 +349,20 @@ class TrainerParams(param.Parameterized):
    autosave_every_n_val_epochs: int = param.Integer(1, bounds=(0, None),
                                                     doc="Save epoch checkpoints every N validation epochs. "
                                                         "If pl_check_val_every_n_epoch > 1, this means that "
                                                         "checkpoints are saved every N * pl_check_val_every_n_epoch "
                                                         "training epochs.")
                                                         "checkpoints are saved every "
                                                         "N * pl_check_val_every_n_epoch training epochs.")
    detect_anomaly: bool = param.Boolean(False, doc="If true, test gradients for anomalies (NaN or Inf) during "
                                                    "training.")
    use_mixed_precision: bool = param.Boolean(False, doc="If true, mixed precision training is activated during "
                                                         "training.")
    max_num_gpus: int = param.Integer(default=-1, doc="The maximum number of GPUs to use. If set to a value < 0, use "
                                                      "all available GPUs. In distributed training, this is the "
                                                      "maximum number of GPUs per node.")
    max_num_gpus: int = param.Integer(default=-1,
                                      doc="The maximum number of GPUs to use. If set to a value < 0, use "
                                          "all available GPUs. In distributed training, this is the "
                                          "maximum number of GPUs per node.")
    pl_progress_bar_refresh_rate: Optional[int] = \
        param.Integer(default=None,
                      doc="PyTorch Lightning trainer flag 'progress_bar_refresh_rate': How often to refresh progress "
                          "bar (in steps). Value 0 disables progress bar. Value None chooses automatically.")
                      doc="PyTorch Lightning trainer flag 'progress_bar_refresh_rate': How often to refresh the "
                          "progress bar (in steps). Value 0 disables the progress bar. If None, choose automatically.")
    pl_num_sanity_val_steps: int = \
        param.Integer(default=0,
                      doc="PyTorch Lightning trainer flag 'num_sanity_val_steps': Number of validation "
@@ -358,8 +370,8 @@ class TrainerParams(param.Parameterized):
    pl_deterministic: bool = \
        param.Boolean(default=False,
                      doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If "
                          "'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, but "
                          "you may see training speed increases.")
                          "'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, "
                          "but you may see training speed increases.")
    pl_find_unused_parameters: bool = \
        param.Boolean(default=False,
                      doc="Controls the PyTorch Lightning flag 'find_unused_parameters' for the DDP plugin. "

@@ -382,9 +394,9 @@ class TrainerParams(param.Parameterized):
    monitor_loading: bool = param.Boolean(default=False,
                                          doc="If True, add the BatchTimeCallback callback to the Lightning trainer "
                                              "object. This will monitor how long individual batches take to load.")
    additional_env_files: List[str] = param.List(class_=Path, default=[],
                                                 doc="Additional conda environment (.yml) files to merge into the"
                                                     " overall environment definition")
    additional_env_files: List[Path] = param.List(class_=Path, default=[],
                                                  doc="Additional conda environment (.yml) files to merge into the"
                                                      " overall environment definition")

    @property
    def use_gpu(self) -> bool:

@@ -411,5 +423,6 @@ class TrainerParams(param.Parameterized):
            num_gpus = self.max_num_gpus
            logging.info(f"Restricting the number of GPUs to {num_gpus}")
        elif self.max_num_gpus > num_gpus:
            logging.warning(f"You requested max_num_gpus {self.max_num_gpus} but there are only {num_gpus} available.")
            logging.warning(
                f"You requested max_num_gpus {self.max_num_gpus} but there are only {num_gpus} available.")
        return num_gpus
@@ -1,11 +1,10 @@
import param
from typing import Optional


class ExperimentConfig(param.Parameterized):
    cluster: Optional[str] = param.String(default=None, allow_None=True,
                                          doc="The name of the GPU or CPU cluster inside the AzureML workspace "
                                              "that should execute the job.")
    cluster: str = param.String(default="", allow_None=False,
                                doc="The name of the GPU or CPU cluster inside the AzureML workspace "
                                    "that should execute the job.")
    num_nodes: int = param.Integer(default=1, doc="The number of virtual machines that will be allocated for this "
                                                  "job in AzureML.")
    model: str = param.String(doc="The fully qualified name of the model to train/test, e.g. "
@@ -30,6 +30,7 @@ class LightningContainer(WorkflowParams,
    should be trained is returned by the `get_model` method. The training data must be returned in the form of
    a LightningDataModule, by the `get_data_module` method.
    """

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self._model: Optional[LightningModule] = None
@@ -77,12 +78,20 @@ class LightningContainer(WorkflowParams,
        """
        return []

    def get_parameter_search_hyperdrive_config(self, _: ScriptRunConfig) -> HyperDriveConfig:  # type: ignore
    def get_parameter_tuning_config(self, run_config: ScriptRunConfig) -> HyperDriveConfig:  # type: ignore
        """
        Parameter search is not implemented. It should be implemented in a sub class if needed.
        Returns a configuration for hyperparameter tuning via AzureML's Hyperdrive capability.
        Hyperparameter tuning can be triggered on the commandline via the "--hyperdrive" flag.
        Override this method in your LightningContainer to use hyperparameter tuning.

        The HyperDriveConfig object needs to specify which parameters should be searched over, and which
        metric should be monitored.

        :param run_config: The ScriptRunConfig object that needs to be passed into the constructor of
            HyperDriveConfig.
        """
        raise NotImplementedError("Parameter search is not implemented. It should be implemented in"
                                  "a sub class if needed.")
        raise NotImplementedError("Parameter search is not implemented. Please override 'get_parameter_tuning_config' "
                                  "in your model container.")

    def update_experiment_config(self, experiment_config: ExperimentConfig) -> None:
        """
@@ -148,14 +157,17 @@ class LightningContainer(WorkflowParams,
        self._model._optimizer_params = create_from_matching_params(self, OptimizerParams)
        self._model._trainer_params = create_from_matching_params(self, TrainerParams)

    def get_hyperdrive_config(self, run_config: ScriptRunConfig) -> HyperDriveConfig:
    def get_hyperdrive_config(self) -> Optional[HyperDriveConfig]:
        """
        Returns the HyperDrive config for either parameter search
        Returns the HyperDrive config for either hyperparameter tuning or cross-validation.

        :param run_config: AzureML estimator
        :return: HyperDriveConfigs
        :return: A configuration object for HyperDrive
        """
        return self.get_parameter_search_hyperdrive_config(run_config)
        if self.is_crossvalidation_enabled:
            return self.get_crossval_hyperdrive_config()
        if self.hyperdrive:
            return self.get_parameter_tuning_config(ScriptRunConfig(source_directory=""))
        return None

    def load_model_checkpoint(self, checkpoint_path: Path) -> None:
        """
@@ -19,11 +19,10 @@ from health_azure.utils import (ENV_GLOBAL_RANK, ENV_LOCAL_RANK, ENV_NODE_RANK,
from health_ml.lightning_container import LightningContainer
from health_ml.utils import AzureMLLogger, AzureMLProgressBar
from health_ml.utils.checkpoint_utils import cleanup_checkpoints
from health_ml.utils.common_utils import AUTOSAVE_CHECKPOINT_FILE_NAME, EXPERIMENT_SUMMARY_FILE
from health_ml.utils.common_utils import (AUTOSAVE_CHECKPOINT_FILE_NAME, EXPERIMENT_SUMMARY_FILE,
                                          change_working_directory)
from health_ml.utils.lightning_loggers import StoringLogger

TEMP_PREFIX = "temp/"

T = TypeVar('T')
@@ -213,9 +212,11 @@ def model_train(checkpoint_path: Optional[Path],
    logging.info(f"Environment variables: {rank_info}. trainer.global_rank: {trainer.global_rank}")

    # get recovery checkpoint if it exists

    logging.info("Starting training")
    trainer.fit(lightning_model, datamodule=data_module)
    # Change to the outputs folder so that the model can write to the current working directory, and still everything
    # is put into the right place in AzureML (only the contents of the "outputs" folder is treated as a result file)
    with change_working_directory(container.outputs_folder):
        trainer.fit(lightning_model, datamodule=data_module)
    assert trainer.logger is not None
    trainer.logger.finalize('success')
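
change_working_directory is the context manager imported above from health_ml.utils.common_utils; a minimal sketch of the pattern it implements (illustrative, not the repository's exact code):

    import os
    from contextlib import contextmanager
    from pathlib import Path
    from typing import Generator

    @contextmanager
    def change_working_directory(folder: Path) -> Generator:
        # Switch into the target folder, and always restore the previous working directory
        old_cwd = os.getcwd()
        os.chdir(str(folder))
        try:
            yield
        finally:
            os.chdir(old_cwd)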

@@ -20,7 +20,7 @@ from health_ml.model_trainer import create_lightning_trainer, model_train
from health_ml.utils import fixed_paths
from health_ml.utils.checkpoint_utils import CheckpointHandler
from health_ml.utils.common_utils import (
    EFFECTIVE_RANDOM_SEED_KEY_NAME, logging_section,
    EFFECTIVE_RANDOM_SEED_KEY_NAME, change_working_directory, logging_section,
    RUN_RECOVERY_ID_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME)
from health_ml.utils.lightning_loggers import StoringLogger
from health_ml.utils.type_annotations import PathOrString
@@ -181,10 +181,13 @@ class MLRunner:
            trainer, _ = create_lightning_trainer(self.container, num_nodes=1)

            self.container.load_model_checkpoint(checkpoint_path=checkpoint_paths[0])
            # Change the current working directory to ensure that test files go to the right folder
            data_module = self.container.get_data_module()

            _ = trainer.test(self.container.model, datamodule=data_module)
            # Change to the outputs folder so that the model can write to the current working directory, and still
            # everything is put into the right place in AzureML (there, only the contents of the "outputs" folder
            # is retained)
            with change_working_directory(self.container.outputs_folder):
                _ = trainer.test(self.container.model, datamodule=data_module)

        else:
            logging.warning("None of the suitable test methods is overridden. Skipping inference completely.")
@@ -9,27 +9,23 @@ import param
import sys
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, Optional, Tuple

import matplotlib
from azureml.core import Workspace

# Add hi-ml packages to sys.path so that AML can find them
# Optionally add the histopathology module, if this exists
# Add hi-ml packages to sys.path so that AML can find them if we are using the runner directly from the git repo
himl_root = Path(__file__).absolute().parent.parent.parent.parent
print(f"Starting the himl runner at {himl_root}")
print(f"health_ml pkg root: {himl_root}")
health_ml_pkg = himl_root / "hi-ml" / "src"
health_azure_pkg = himl_root / "hi-ml-azure" / "src"
health_histopathology_dir = himl_root / "hi-ml-histopathology" / "src"

if health_histopathology_dir.exists():
    sys.path.insert(0, str(health_histopathology_dir))
sys.path.insert(0, str(health_azure_pkg))
sys.path.insert(0, str(health_ml_pkg))
print(f"sys path: {sys.path}")
folders_to_add = [himl_root / "hi-ml" / "src",
                  himl_root / "hi-ml-azure" / "src",
                  himl_root / "hi-ml-histopathology" / "src"]
for folder in folders_to_add:
    if folder.is_dir():
        sys.path.insert(0, str(folder))

from health_azure import AzureRunInfo, submit_to_azure_if_needed  # noqa: E402
from health_azure.datasets import create_dataset_configs  # noqa: E402
from health_azure.paths import is_himl_used_from_git_repo  # noqa: E402
from health_azure.utils import (get_workspace, is_local_rank_zero, merge_conda_files,  # noqa: E402
                                set_environment_variables_for_multi_node, create_argparser, parse_arguments,
                                ParserResult, apply_overrides)
@@ -38,12 +34,11 @@ from health_ml.experiment_config import ExperimentConfig  # noqa: E402
from health_ml.lightning_container import LightningContainer  # noqa: E402
from health_ml.run_ml import MLRunner  # noqa: E402
from health_ml.utils import fixed_paths  # noqa: E402
from health_ml.utils.common_utils import (get_all_environment_files,  # noqa: E402
from health_ml.utils.common_utils import (check_conda_environments, get_all_environment_files,  # noqa: E402
                                          get_all_pip_requirements_files,
                                          is_linux, logging_to_stdout)
from health_ml.utils.config_loader import ModelConfigLoader  # noqa: E402


DEFAULT_DOCKER_BASE_IMAGE = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04"
@@ -142,8 +137,7 @@ class Runner:
        self.experiment_config = experiment_config
        if not experiment_config.model:
            raise ValueError("Parameter 'model' needs to be set to specify which model to run.")
        print(f"Creating model loader with the following args: {parser_result.args}")
        model_config_loader: ModelConfigLoader = ModelConfigLoader(**parser_result.args)
        model_config_loader: ModelConfigLoader = ModelConfigLoader()
        # Create the model as per the "model" commandline option. This is a LightningContainer.
        container = model_config_loader.create_model_config_from_name(model_name=experiment_config.model)
@@ -161,6 +155,23 @@ class Runner:

        return parser_result_

    def validate(self) -> None:
        """
        Runs sanity checks on the whole experiment.
        """
        if not self.experiment_config.azureml:
            if self.lightning_container.hyperdrive:
                logging.info("You have turned on HyperDrive for parameter tuning. This can only be run in "
                             "AzureML, so submission to AzureML has been switched on.")
                self.experiment_config.azureml = True
            if self.lightning_container.is_crossvalidation_enabled:
                logging.info("You have turned on cross-validation. This can only be run in AzureML, "
                             "so submission to AzureML has been switched on.")
                self.experiment_config.azureml = True
            if self.experiment_config.cluster:
                logging.info("You have provided a compute cluster name, hence submission to AzureML has been "
                             "switched on.")
                self.experiment_config.azureml = True

    def run(self) -> Tuple[LightningContainer, AzureRunInfo]:
        """
        The main entry point for training and testing models from the commandline. This chooses a model to train
@@ -174,6 +185,7 @@ class Runner:
        logging_to_stdout(logging.INFO if is_local_rank_zero() else "ERROR")
        initialize_rpdb()
        self.parse_and_load_model()
        self.validate()
        azure_run_info = self.submit_to_azureml_if_needed()
        self.run_in_situ(azure_run_info)
        return self.lightning_container, azure_run_info
@@ -191,30 +203,18 @@ class Runner:
        entry_script = Path(sys.argv[0]).resolve()
        script_params = sys.argv[1:]

        additional_conda_env_files = self.lightning_container.additional_env_files
        additional_env_files: Optional[List[Path]]
        if additional_conda_env_files is not None:
            additional_env_files = [Path(f) for f in additional_conda_env_files]
        else:
            additional_env_files = None

        conda_dependencies_files = get_all_environment_files(self.project_root,
                                                             additional_files=additional_env_files)
        pip_requirements_files = get_all_pip_requirements_files()

        # Merge the project-specific dependencies with the packages and write unified definition
        # to temp file. In case of version conflicts, the package version in the outer project is given priority.
        temp_conda: Optional[Path] = None
        if len(conda_dependencies_files) > 1 or len(pip_requirements_files) > 0:
            temp_conda = root_folder / f"temp_environment-{uuid.uuid4().hex[:8]}.yml"
            merge_conda_files(conda_dependencies_files, temp_conda, pip_files=pip_requirements_files)

        # TODO: Update environment variables
        environment_variables: Dict[str, Any] = {}

        # get default datastore from provided workspace
        workspace = get_workspace()
        default_datastore = workspace.get_default_datastore().name
        # Get the default datastore from the provided workspace. Authentication can take a few seconds, hence only do
        # that if we are really submitting to AzureML.
        workspace: Optional[Workspace] = None
        if self.experiment_config.azureml:
            try:
                workspace = get_workspace()
            except ValueError:
                logging.warning("No configuration file for an AzureML workspace was found.")
        default_datastore = workspace.get_default_datastore().name if workspace is not None else ""

        local_datasets = self.lightning_container.local_datasets
        all_local_datasets = [Path(p) for p in local_datasets] if len(local_datasets) > 0 else []
@@ -223,27 +223,48 @@ class Runner:
            all_dataset_mountpoints=self.lightning_container.dataset_mountpoints,
            all_local_datasets=all_local_datasets,  # type: ignore
            datastore=default_datastore)
        if self.lightning_container.is_crossvalidation_enabled and not self.experiment_config.azureml:
            raise ValueError("Cross-validation is only supported when submitting the job to AzureML.")
        hyperdrive_config = self.lightning_container.get_hyperdrive_config()
        temp_conda: Optional[Path] = None
        try:
            if self.experiment_config.azureml:
                conda_files = get_all_environment_files(root_folder,
                                                        additional_files=self.lightning_container.additional_env_files)
                check_conda_environments(conda_files)
                # This adds all pip packages required by hi-ml and hi-ml-azure in case the code is used directly from
                # source (submodule) rather than installed as a package.
                pip_requirements_files = get_all_pip_requirements_files()

                # Merge the project-specific dependencies with the packages and write the unified definition to a
                # temp file.
                if len(conda_files) > 1 or len(pip_requirements_files) > 0:
                    temp_conda = root_folder / f"temp_environment-{uuid.uuid4().hex[:8]}.yml"
                    merge_conda_files(conda_files, temp_conda, pip_files=pip_requirements_files)

                if workspace is None:
                    raise ValueError("Unable to submit the script to AzureML because no workspace configuration file "
                                     "(config.json) was found.")
                if not self.experiment_config.cluster:
                    raise ValueError("You need to specify a cluster name via '--cluster NAME' to submit"
                    raise ValueError("You need to specify a cluster name via '--cluster NAME' to submit "
                                     "the script to run in AzureML")
            azure_run_info = submit_to_azure_if_needed(
                entry_script=entry_script,
                snapshot_root_directory=root_folder,
                script_params=script_params,
                conda_environment_file=temp_conda or conda_dependencies_files[0],
                conda_environment_file=temp_conda or conda_files[0],
                aml_workspace=workspace,
                compute_cluster_name=self.experiment_config.cluster,
                environment_variables=environment_variables,
                default_datastore=default_datastore,
                experiment_name=self.lightning_container.name,  # create_experiment_name(),
                experiment_name=self.lightning_container.model_name,  # create_experiment_name(),
                input_datasets=input_datasets,  # type: ignore
                num_nodes=self.experiment_config.num_nodes,
                wait_for_completion=False,
                ignored_folders=[],
                submit_to_azureml=self.experiment_config.azureml,
                docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,
                hyperdrive_config=hyperdrive_config,
                create_output_folders=False,
                tags=additional_run_tags(
                    commandline_args=" ".join(script_params))
            )
@@ -252,7 +273,7 @@ class Runner:
                input_datasets=input_datasets,  # type: ignore
                submit_to_azureml=False)
        finally:
            if temp_conda:
            if temp_conda and temp_conda.is_file():
                temp_conda.unlink()
        # submit_to_azure_if_needed calls sys.exit after submitting to AzureML. We only reach this when running
        # the script locally or in AzureML.
@@ -296,7 +317,7 @@ def run(project_root: Path) -> Tuple[LightningContainer, AzureRunInfo]:


def main() -> None:
    run(project_root=fixed_paths.repository_root_directory())
    run(project_root=fixed_paths.repository_root_directory() if is_himl_used_from_git_repo() else Path.cwd())


if __name__ == '__main__':
@@ -10,10 +10,11 @@ from typing import Any, Generator, Iterable, List, Optional, Union

import torch
from torch.nn import Module
from health_azure import utils
from health_azure import paths
from health_azure.paths import ENVIRONMENT_YAML_FILE_NAME, git_repo_root_folder, is_himl_used_from_git_repo

from health_azure.utils import PathOrString

from health_ml.utils import fixed_paths
from health_azure.utils import PathOrString, is_conda_file_with_pip_include


MAX_PATH_LENGTH = 260
@@ -25,15 +26,17 @@ string_to_path = lambda x: None if (x is None or len(x.strip()) == 0) else Path(
# file and directory names
CHECKPOINT_SUFFIX = ".ckpt"
AUTOSAVE_CHECKPOINT_FILE_NAME = "autosave"
AUTOSAVE_CHECKPOINT_CANDIDATES = [AUTOSAVE_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX,
                                  AUTOSAVE_CHECKPOINT_FILE_NAME + "-v1" + CHECKPOINT_SUFFIX]
AUTOSAVE_CHECKPOINT_CANDIDATES = [
    AUTOSAVE_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX,
    AUTOSAVE_CHECKPOINT_FILE_NAME + "-v1" + CHECKPOINT_SUFFIX,
]
CHECKPOINT_FOLDER = "checkpoints"
DEFAULT_AML_UPLOAD_DIR = "outputs"
DEFAULT_LOGS_DIR_NAME = "logs"
EXPERIMENT_SUMMARY_FILE = "experiment_summary.txt"

# run recovery
RUN_RECOVERY_ID_KEY = 'run_recovery_id'
RUN_RECOVERY_ID_KEY = "run_recovery_id"
RUN_RECOVERY_FROM_ID_KEY_NAME = "recovered_from"

# other
@@ -45,6 +48,7 @@ class ModelExecutionMode(Enum):
    """
    Model execution mode
    """

    TRAIN = "Train"
    TEST = "Test"
    VAL = "Val"

@@ -111,8 +115,7 @@ def _add_formatter(handler: logging.StreamHandler) -> None:
    """
    Adds a logging formatter that includes the timestamp and the logging level.
    """
    formatter = logging.Formatter(fmt="%(asctime)s %(levelname)-8s %(message)s",
                                  datefmt="%Y-%m-%dT%H:%M:%SZ")
    formatter = logging.Formatter(fmt="%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%dT%H:%M:%SZ")
    # noinspection PyTypeHints
    formatter.converter = time.gmtime  # type: ignore
    handler.setFormatter(formatter)

@@ -129,6 +132,7 @@ def logging_section(gerund: str) -> Generator:
    :param gerund: string expressing what happens in this section of the log.
    """
    from time import time

    logging.info("")
    msg = f"**** STARTING: {gerund} "
    logging.info(msg + (100 - len(msg)) * "*")

@@ -152,14 +156,14 @@ def is_windows() -> bool:
    """
    Returns True if the host operating system is Windows.
    """
    return os.name == 'nt'
    return os.name == "nt"


def is_linux() -> bool:
    """
    Returns True if the host operating system is a flavour of Linux.
    """
    return os.name == 'posix'
    return os.name == "posix"


def check_properties_are_not_none(obj: Any, ignore: Optional[List[str]] = None) -> None:
@@ -206,16 +210,31 @@ def _create_generator(seed: Optional[int] = None) -> torch.Generator:
def get_all_environment_files(project_root: Path, additional_files: Optional[List[Path]] = None) -> List[Path]:
    """
    Returns a list of all Conda environment files that should be used. This is just an
    environment.yml file that lives at the project root folder, plus any additional files provided.
    environment.yml file that lives at the project root folder, plus any additional files provided in the model.

    :param project_root: The root folder of the code that starts the present training run.
    :param additional_files: Optional list of additional environment files to merge
    :return: A list with 1 entry that is the root level repo's conda environment files.
    :return: A list of Conda environment files to use.
    """
    env_files = []
    project_yaml = project_root / fixed_paths.ENVIRONMENT_YAML_FILE_NAME
    if project_yaml.exists():
    project_yaml = project_root / paths.ENVIRONMENT_YAML_FILE_NAME
    if paths.is_himl_used_from_git_repo():
        logging.info("Searching for Conda files in the parent folders")
        git_repo_root = paths.git_repo_root_folder()
        env_file = utils.find_file_in_parent_folders(
            file_name=paths.ENVIRONMENT_YAML_FILE_NAME, stop_at_path=[git_repo_root]
        )
        assert env_file is not None, "Expected to find at least the environment definition file at the repo root"
        logging.info(f"Using Conda environment in {env_file}")
        env_files.append(env_file)
    elif project_yaml.exists():
        logging.info(f"Using Conda environment in current folder: {project_yaml}")
        env_files.append(project_yaml)

    if not env_files and not additional_files:
        raise ValueError(
            "No Conda environment files were found in the repository, and none were specified in the model itself."
        )
    if additional_files:
        for additional_file in additional_files:
            if additional_file.exists():
@@ -223,26 +242,44 @@ def get_all_environment_files(project_root: Path, additional_files: Optional[Lis
    return env_files


def check_conda_environments(env_files: List[Path]) -> None:
    """Tests if all conda environment files are valid. In particular, they must not contain "include" statements
    in the pip section.

    :param env_files: The list of Conda environment YAML files to check.
    """
    if is_himl_used_from_git_repo():
        repo_root_yaml: Optional[Path] = git_repo_root_folder() / ENVIRONMENT_YAML_FILE_NAME
    else:
        repo_root_yaml = None
    for file in env_files:
        has_pip_include, _ = is_conda_file_with_pip_include(file)
        # PIP include statements are only valid when reading from the repository root YAML file, because we
        # are manually adding the included files in get_all_pip_requirements_files
        if has_pip_include and file != repo_root_yaml:
            raise ValueError(
                f"The Conda environment definition in {file} uses '-r' to reference pip requirements "
                "files. This does not work in AzureML. Please add the pip dependencies directly."
            )
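
An example of an environment definition that this check would reject (a hypothetical file, shown only to illustrate the failure mode): the pip section pulls in a separate requirements file via '-r', which AzureML cannot resolve:

    name: my-model-env
    dependencies:
      - python=3.7.3
      - pip:
          - -r requirements.txt   # rejected: '-r' includes do not work in AzureML
          - torch>=1.10.0         # fine: pip dependencies listed directly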


def get_all_pip_requirements_files() -> List[Path]:
    """
    If the root-level hi-ml directory is available (e.g. it has been installed as a submodule or
    downloaded directly into a parent repo) then we must add its pip requirements to any environment
    definition. This function returns a list of the necessary pip requirements files. If the hi-ml
    root directory does not exist (e.g. hi-ml has been installed as a pip package), this is not necessary,
    and so this function returns None)
    and so this function returns an empty list).

    :return: A list of pip requirements files in the hi-ml and hi-ml-azure packages if relevant,
        or else an empty list
    """
    files = []
    himl_root_dir = fixed_paths.himl_root_dir()
    if himl_root_dir is not None:
        himl_yaml = himl_root_dir / "hi-ml" / "run_requirements.txt"
        himl_az_yaml = himl_root_dir / "hi-ml-azure" / "run_requirements.txt"
        files.append(himl_yaml)
        files.append(himl_az_yaml)
        return files
    return []
    if paths.is_himl_used_from_git_repo():
        git_root = paths.git_repo_root_folder()
        for folder in [Path("hi-ml") / "run_requirements.txt", Path("hi-ml-azure") / "run_requirements.txt"]:
            files.append(git_root / folder)
    return files


def create_unique_timestamp_id() -> str:
@@ -270,8 +307,7 @@ def parse_model_id_and_version(model_id_and_version: str) -> None:
    expected format
    """
    if len(model_id_and_version.split(":")) != 2:
        raise ValueError(
            f"model id should be in the form 'model_name:version', got {model_id_and_version}")
        raise ValueError(f"model id should be in the form 'model_name:version', got {model_id_and_version}")


@contextmanager
@@ -7,171 +7,157 @@ from __future__ import annotations
import importlib
import inspect
import logging
import sys
from importlib._bootstrap import ModuleSpec
from importlib.util import find_spec
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Dict, List, Optional

import param
from importlib._bootstrap import ModuleSpec

from health_azure.utils import PathOrString
from health_ml.lightning_container import LightningContainer
from health_ml.utils import fixed_paths


class ModelConfigLoader(param.Parameterized):
class ModelConfigLoader:
    """
    Helper class to manage model config loading.
    """

    def __init__(self, **params: Any):
        super().__init__(**params)
        default_module = self.get_default_search_module()
        self.module_search_specs: List[ModuleSpec] = [importlib.util.find_spec(default_module)]  # type: ignore
        self._find_module_search_specs()
    def __init__(self) -> None:
        pass

    def _find_module_search_specs(self) -> None:
        """
        Given the fully qualified model name, append the root folder to the system path (so that the config
        file can be discovered) and try to find a spec for the specified module. If found, appends the spec
        to self.module_search_specs
        """
        model_namespace_parts = self.model.split(".")
        if len(model_namespace_parts) == 1:
            # config must be in the default path. This is already in module_search_specs, so we don't need to do
            # anything
            return
        else:
            # Get the root folder of the fully qualified model name and ensure it is in the path to enable
            # discovery of the config file
            root_namespace = str(Path(model_namespace_parts[0]).absolute())
            if root_namespace not in sys.path:
                print(f"Adding {str(root_namespace)} to path")
                sys.path.insert(0, str(root_namespace))

            # Strip the root folder (now in the path) and the class name from the model namespace, leaving the
            # module name - e.g. "mymodule.configs"
            model_namespace = ".".join([str(p) for p in model_namespace_parts[1:-1]])  # type: ignore

            custom_spec = importlib.util.find_spec(model_namespace)  # type: ignore
            if custom_spec is None:
                raise ValueError(f"Search namespace {model_namespace} was not found.")
            self.module_search_specs.append(custom_spec)

    @staticmethod
    def get_default_search_module() -> str:
    def default_module_spec(self) -> ModuleSpec:
        from health_ml import configs  # type: ignore
        return configs.__name__

        default_module = configs.__name__
        return find_spec(default_module)

    def find_module_search_specs(self, model_name: str) -> ModuleSpec:
        """
        Given the model name (either only the class name or fully qualified), return the ModuleSpec that should
        be used for loading. If the model name is only the class name, the function will return the result of
        calling default_module_spec. Otherwise, this will return the module of the (fully qualified) model name.
        """
        model_namespace_parts = model_name.split(".")
        if len(model_namespace_parts) == 1:
            # config must be in the default path, nothing to be done
            return self.default_module_spec()

        module_name = ".".join(model_namespace_parts[:-1])
        logging.debug(f"Getting specification for module {module_name}")
        try:
            custom_spec: Optional[ModuleSpec] = find_spec(module_name)
        except Exception:
            custom_spec = None
        if custom_spec is None:
            raise ValueError(f"Module {module_name} was not found.")
        return custom_spec
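
The effect, sketched: a bare class name resolves against the default health_ml.configs module, while a dotted name is resolved as a module path (the return values in comments are illustrative):

    loader = ModelConfigLoader()
    loader.find_module_search_specs("HelloWorld")                 # spec for health_ml.configs
    loader.find_module_search_specs("SSL.configs.CIFAR10SimCLR")  # spec for the SSL.configs module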

+    def _get_model_config(self, module_spec: ModuleSpec, model_name: str) -> Optional[LightningContainer]:
+        """
+        Given a module specification, check to see if the module has a class member with the
+        given <model_name>, and if so, instantiate that config class. Otherwise, return None.
+
+        :param module_spec: The module in which to look for the class.
+        :param model_name: The name of the class to find.
+        :return: Instantiated model config if it was found.
+        """
+        # noinspection PyBroadException
+        try:
+            logging.debug(f"Importing {module_spec.name}")
+            target_module = importlib.import_module(module_spec.name)
+            # The "if" clause checks that obj is a class, of the desired name, that is
+            # defined in this module rather than being imported into it (and hence potentially
+            # being found twice).
+            _class = next(
+                obj
+                for name, obj in inspect.getmembers(target_module)
+                if inspect.isclass(obj) and name == model_name and inspect.getmodule(obj) == target_module
+            )
+            logging.info(f"Found class {_class} in file {module_spec.origin}")
+        # ignore the exception which will occur if the provided module cannot be loaded
+        # or the loaded module does not have the required class as a member
+        except Exception as e:
+            exception_text = str(e)
+            if exception_text != "":
+                logging.warning(f"Error when trying to import module {module_spec.name}: {exception_text}")
+            return None
+        model_config = _class()
+        return model_config
+
+    def _search_recursively_and_store(self, module_spec: ModuleSpec, model_name: str) -> Dict[str, LightningContainer]:
+        """
+        Given a root namespace (e.g., A.B.C), search recursively in all child namespaces
+        for a class member with the given <model_name>. Each class that is found is
+        instantiated and added to the returned dictionary, keyed by its namespace.
+
+        :param module_spec: The spec of the root module from which the search starts.
+        :param model_name: The name of the class to find.
+        :return: A dictionary mapping namespace to instantiated config.
+        """
+        configs: Dict[str, LightningContainer] = {}
+        root_namespace = module_spec.name
+        namespaces_to_search: List[str] = []
+        if module_spec.submodule_search_locations:
+            logging.debug(
+                f"Searching through {len(module_spec.submodule_search_locations)} folders that match namespace "
+                f"{module_spec.name}: {module_spec.submodule_search_locations}"
+            )
+            for root in module_spec.submodule_search_locations:
+                # List all python files in all the dirs under root, except for private dirs (prefixed with .)
+                all_py_files = [x for x in Path(root).rglob("*.py") if ".." not in str(x)]
+                for f in all_py_files:
+                    if f.is_file() and "__pycache__" not in str(f) and f.name != "setup.py":
+                        sub_namespace = path_to_namespace(f, root=root)
+                        namespaces_to_search.append(root_namespace + "." + sub_namespace)
+        elif module_spec.origin:
+            # The module search spec already points to a python file: Search only that.
+            namespaces_to_search.append(module_spec.name)
+        else:
+            raise ValueError(f"Unable to process module spec: {module_spec}")
+
+        for n in namespaces_to_search:  # type: ignore
+            _module_spec = None
+            # noinspection PyBroadException
+            try:
+                _module_spec = find_spec(n)  # type: ignore
+            except Exception:
+                continue
+
+            if _module_spec:
+                config = self._get_model_config(_module_spec, model_name=model_name)
+                if config:
+                    configs[n] = config  # type: ignore
+        return configs
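For a package spec, the loop above turns every Python file under the package's search locations into a candidate namespace. A rough, self-contained equivalent of just the discovery step (assuming health_ml is installed; files are printed instead of imported):

    from importlib.util import find_spec
    from pathlib import Path

    spec = find_spec("health_ml.configs")
    assert spec is not None and spec.submodule_search_locations
    for root in spec.submodule_search_locations:
        for f in Path(root).rglob("*.py"):
            if "__pycache__" not in str(f) and f.name != "setup.py":
                print(f)  # each such file becomes a namespace via path_to_namespace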

     def create_model_config_from_name(self, model_name: str) -> LightningContainer:
         """
         Returns a model configuration for a model of the given name.
-        To avoid having to import torch here, there are no references to LightningContainer.
-        Searching for a class member called <model_name> in the search modules provided recursively.

-        :param model_name: Fully qualified name of the model for which to get the configs for - i.e.
-        mymodule.configs.MyConfig
+        :param model_name: Class name (for example, "HelloWorld") if the model config is in the default search
+        namespace, or the fully qualified name of the model, like mymodule.configs.MyConfig
         """
         if not model_name:
             raise ValueError("Unable to load a model configuration because the model name is missing.")
-
-        # get the class name from the fully qualified name
-        model_name = model_name.split(".")[-1]
-
-        configs: Dict[str, LightningContainer] = {}
-
-        def _get_model_config(module_spec: ModuleSpec) -> Optional[LightningContainer]:
-            """
-            Given a module specification check to see if it has a class property with
-            the <model_name> provided, and instantiate that config class with the
-            provided <config_overrides>. Otherwise, return None.
-
-            :param module_spec:
-            :return: Instantiated model config if it was found.
-            """
-            # noinspection PyBroadException
-            try:
-                logging.debug(f"Importing {module_spec.name}")
-                target_module = importlib.import_module(module_spec.name)
-                # The "if" clause checks that obj is a class, of the desired name, that is
-                # defined in this module rather than being imported into it (and hence potentially
-                # being found twice).
-                _class = next(obj for name, obj in inspect.getmembers(target_module)
-                              if inspect.isclass(obj)
-                              and name == model_name  # noqa: W503
-                              and inspect.getmodule(obj) == target_module)  # noqa: W503
-                logging.info(f"Found class {_class} in file {module_spec.origin}")
-            # ignore the exception which will occur if the provided module cannot be loaded
-            # or the loaded module does not have the required class as a member
-            except Exception as e:
-                exception_text = str(e)
-                if exception_text != "":
-                    logging.warning(f"(from attempt to import module {module_spec.name}): {exception_text}")
-                return None
-            model_config = _class()
-            return model_config
-
-        def _search_recursively_and_store(module_search_spec: ModuleSpec) -> None:
-            """
-            Given a root namespace eg: A.B.C searches recursively in all child namespaces
-            for class property with the <model_name> provided. If found, this is
-            instantiated with the provided overrides, and added to the configs dictionary.
-
-            :param module_search_spec:
-            """
-            root_namespace = module_search_spec.name
-            namespaces_to_search: List[str] = []
-            if module_search_spec.submodule_search_locations:
-                logging.debug(f"Searching through {len(module_search_spec.submodule_search_locations)} folders that "
-                              f"match namespace {module_search_spec.name}: "
-                              f"{module_search_spec.submodule_search_locations}")
-                for root in module_search_spec.submodule_search_locations:
-                    # List all python files in all the dirs under root, except for private dirs (prefixed with .)
-                    all_py_files = [x for x in Path(root).rglob("*.py") if ".." not in str(x)]
-                    for f in all_py_files:
-                        if f.is_file() and "__pycache__" not in str(f) and f.name != "setup.py":
-                            sub_namespace = path_to_namespace(f, root=root)
-                            namespaces_to_search.append(root_namespace + "." + sub_namespace)
-            elif module_search_spec.origin:
-                # The module search spec already points to a python file: Search only that.
-                namespaces_to_search.append(module_search_spec.name)
-            else:
-                raise ValueError(f"Unable to process module spec: {module_search_spec}")
-
-            for n in namespaces_to_search:  # type: ignore
-                _module_spec = None
-                # noinspection PyBroadException
-                try:
-                    _module_spec = find_spec(n)  # type: ignore
-                except Exception:
-                    pass
-
-                if _module_spec:
-                    config = _get_model_config(_module_spec)
-                    if config:
-                        configs[n] = config  # type: ignore
-
-        for search_spec in self.module_search_specs:
-            _search_recursively_and_store(search_spec)
+        logging.info(f"Trying to locate model {model_name}")
+        name_parts = model_name.split(".")
+        class_name = name_parts[-1]
+        module_spec = self.find_module_search_specs(model_name)
+        configs = self._search_recursively_and_store(module_spec=module_spec, model_name=class_name)
         if len(configs) == 0:
-            raise ValueError(
-                f"Model name {model_name} was not found in search namespaces: "
-                f"{[s.name for s in self.module_search_specs]}.")
+            raise ValueError(f"Model '{model_name}' was not found in search namespace {module_spec.name}")
         elif len(configs) > 1:
-            raise ValueError(
-                f"Multiple instances of model name {model_name} were found in namespaces: {configs.keys()}.")
+            raise ValueError(
+                f"Multiple instances of model '{model_name}' were found in namespaces: {[*configs.keys()]}"
+            )
         else:
             return list(configs.values())[0]


-def path_to_namespace(path: Path, root: PathOrString = fixed_paths.repository_root_directory()) -> str:
+def path_to_namespace(path: Path, root: Path) -> str:
     """
-    Given a path (in form R/A/B/C) and an optional root directory R, create a namespace A.B.C.
-    If root is provided, then path must be a relative child to it.
+    Given a path (in form R/A/B/C) and a root directory R, create a namespace string A.B.C.
+    The path must be located under the root directory.

     :param path: Path to convert to namespace
-    :param root: Path prefix to remove from namespace (default is project root)
-    :return:
+    :param root: Path prefix to remove from namespace.
+    :return: A Python namespace string
     """
     return ".".join([Path(x).stem for x in path.relative_to(root).parts])
@@ -4,61 +4,21 @@
 # ------------------------------------------------------------------------------------------
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Optional

-from health_azure.utils import PathOrString
-
-ENVIRONMENT_YAML_FILE_NAME = "environment.yml"
+from health_azure.paths import git_repo_root_folder, is_himl_used_from_git_repo


-def get_environment_yaml_file() -> Path:
-    """
-    Returns the path where the environment.yml file is located, in the repository root directory.
-    The function throws an exception if the file is not found.
-
-    :return: The full path to the environment files.
-    """
-    # The environment file is copied into the package folder in setup.py.
-    root_dir = repository_root_directory()
-    env = root_dir / ENVIRONMENT_YAML_FILE_NAME
-    if not env.exists():
-        raise ValueError(f"File {ENVIRONMENT_YAML_FILE_NAME} was not found in the repository root "
-                         f"{root_dir}.")
-    return env
-
-
-def repository_root_directory(path: Optional[PathOrString] = None) -> Path:
+def repository_root_directory() -> Path:
     """
     Gets the full path to the root directory that holds the present repository.
+    This function should only be called if the repository is available (for example, in unit tests).
+    It will raise a ValueError if the repo is not available.

-    :param path: if provided, a relative path to append to the absolute path to the repository root.
     :return: The full path to the repository's root directory, with symlinks resolved if any.
     """
-    root = Path.cwd()
-    if path:
-        full_path = root / path
-        assert full_path.exists(), f"Path {full_path} doesn't exist"
-        return root / path
-    else:
-        return root
-
-
-def himl_root_dir() -> Optional[Path]:
-    """
-    Attempts to return the path to the top-level hi-ml repo that contains the hi-ml and hi-ml-azure packages.
-    This top level repo will only be present if hi-ml has been installed as a git submodule, or the repo has
-    been directly downloaded. Otherwise (e.g. if hi-ml has been installed as a pip package) returns None.
-
-    :return: Path to the himl root dir if it exists, else None
-    """
-    health_ml_root = Path(__file__).parent.parent
-    print(f"health ml root: {health_ml_root}")
-    if health_ml_root.parent.stem == "site-packages":
-        return None
-    himl_root = health_ml_root.parent.parent.parent
-    assert (himl_root / "hi-ml").is_dir(), f"no such dir {himl_root / 'hi-ml'}"
-    assert (himl_root / "hi-ml-azure").is_dir(), f"no such dir {himl_root / 'hi-ml-azure'}"
-    return himl_root
+    if is_himl_used_from_git_repo():
+        return git_repo_root_folder()
+    raise ValueError("This function should not be used if hi-ml is used as an installed package.")


 @dataclass(frozen=True)

@@ -66,6 +26,7 @@ class OutputFolderForTests:
     """
    Data class for the output directories for a given test
     """
+
     root_dir: Path

     def create_file_or_folder_path(self, file_or_folder_name: str) -> Path:
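With this change, repository_root_directory delegates to health_azure.paths and fails fast outside a checkout. A short sketch of the new contract (the import path health_ml.utils.fixed_paths is assumed from this file's location):

    from health_ml.utils.fixed_paths import repository_root_directory

    try:
        root = repository_root_directory()  # valid when running from a git checkout of hi-ml
    except ValueError:
        root = None  # hi-ml was installed as a pip package: there is no repository root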

@@ -1,10 +0,0 @@
-name: test-env
-channels:
-  - defaults
-  - pytorch
-dependencies:
-  - pip=20.1.1
-  - python=3.7.3
-  - pytorch=1.8.0
-  - pip:
-      - -r ../test_requirements.txt

@@ -0,0 +1,14 @@
+import logging
+import sys
+from pathlib import Path
+
+
+root = Path(__file__).parent.parent.parent
+for folder in ["hi-ml-azure", "hi-ml"]:
+    full_folder = str(root / folder / "src")
+    if full_folder not in sys.path:
+        print(f"Adding to sys.path for running hi-ml: {full_folder}")
+        sys.path.insert(0, full_folder)
+
+# Matplotlib is very talkative in DEBUG mode
+logging.getLogger('matplotlib').setLevel(logging.INFO)

@@ -1,139 +1,88 @@
 import shutil
 from pathlib import Path
-from typing import Any

 import pytest

-from health_azure.utils import is_running_on_azure_agent
+from health_ml.configs import hello_world as hello_config
 from health_ml.lightning_container import LightningContainer
 from health_ml.utils.config_loader import ModelConfigLoader, path_to_namespace
-from testhiml.utils.fixed_paths_for_tests import full_test_data_path, tests_root_directory


-@pytest.fixture(scope="module")
-def config_loader() -> ModelConfigLoader:
-    return ModelConfigLoader(**{"model": "HelloContainer"})
+def test_find_module_search_specs() -> None:
+    config_loader = ModelConfigLoader()
+    module_spec = config_loader.find_module_search_specs(model_name="health_ml.utils.config_loader.Foo")
+    assert module_spec.name == "health_ml.utils.config_loader"
+    module_spec = config_loader.find_module_search_specs(model_name="DoesNotExist")
+    assert module_spec.name == "health_ml.configs"


-@pytest.fixture(scope="module")
-def hello_config() -> Any:
-    from health_ml.configs import hello_container  # type: ignore
-    assert Path(hello_container.__file__).exists(), "Can't find hello_container config"
-    return hello_container
+def test_get_default_search_module() -> None:
+    config_loader = ModelConfigLoader()
+    search_module = config_loader.default_module_spec()
+    assert search_module.name == "health_ml.configs"


-def test_find_module_search_specs(config_loader: ModelConfigLoader) -> None:
-    # By default, property module_search_specs includes the default config path - health_ml.configs
-    len_search_specs_before = len(config_loader.module_search_specs)
-    assert any([m.name == "health_ml.configs" for m in config_loader.module_search_specs])
-    config_loader._find_module_search_specs()
-    # nothing should have been added to module_search_specs
-    assert len(config_loader.module_search_specs) == len_search_specs_before
-
-
-def test_find_module_search_specs_outside_default_dir() -> None:
-    if is_running_on_azure_agent():
-        return
-    model_name = "NewConfig"
-
-    dummy_config_dir = Path.cwd() / "test_configs"
-    dummy_config_dir.mkdir()
-    dummy_config_path = dummy_config_dir / "new_config.py"
-    dummy_config = f"""class {model_name}:
-    def __init__(self):
-        pass
-"""
-    dummy_config_path.touch()
-    dummy_config_path.write_text(dummy_config)
-
-    dummy_config_namespace = f"test_configs.new_config.{model_name}"
-    config_loader2 = ModelConfigLoader(**{"model": f"{dummy_config_namespace}"})
-    # The root "testhiml" should now be in the system path and the module "outputs" should be in
-    # module_search_specs. This won't be in the previous results, since the default path was used.
-    # The default search_spec (health_ml.configs) should also be in the results for the new loader.
-    assert any([m.name == "new_config" for m in config_loader2.module_search_specs])
-    assert any([m.name == "health_ml.configs" for m in config_loader2.module_search_specs])
-
-    # If the file doesn't exist but the parent module does, the module will still be appended to
-    # module_search_specs at this stage
-    config_loader3 = ModelConfigLoader(**{"model": "test_configs.new_config.idontexist"})
-    assert any([m.name == "new_config" for m in config_loader3.module_search_specs])
-
-    # If the parent module doesn't exist, an Exception should be raised
-    with pytest.raises(Exception) as e:
-        ModelConfigLoader(**{"model": "testhiml.idontexist.idontexist"})
-    assert "was not found" in str(e)
-
-    shutil.rmtree(dummy_config_dir)
-
-
-def test_get_default_search_module(config_loader: ModelConfigLoader) -> None:
-    search_module = config_loader.get_default_search_module()
-    assert search_module == "health_ml.configs"


-def test_create_model_config_from_name(config_loader: ModelConfigLoader, hello_config: Any
-                                       ) -> None:
+def test_create_model_config_from_name_errors() -> None:
+    config_loader = ModelConfigLoader()
     # if no model name is given, an exception should be raised
     with pytest.raises(Exception) as e:
         config_loader.create_model_config_from_name("")
     assert "the model name is missing" in str(e)

     # if no config is found matching the model name, an exception should be raised
     with pytest.raises(Exception) as e:
         config_loader.create_model_config_from_name("idontexist")
-    assert "was not found in search namespaces" in str(e)
+    assert "was not found in search namespace" in str(e)

-    # if > 1 config is found matching the model name, an exception should be raised
-    config_name = "HelloContainer"
-    hello_config_path = Path(hello_config.__file__)
-    duplicate_config_file = hello_config_path.parent / "hello_container_2.py"
-    duplicate_config_file.touch()
-    shutil.copyfile(str(hello_config_path), str(duplicate_config_file))
-    with pytest.raises(Exception) as e:
-        config_loader.create_model_config_from_name(config_name)
-    assert "Multiple instances of model name " in str(e)
-    duplicate_config_file.unlink()
+    with pytest.raises(Exception) as e:
+        config_loader.create_model_config_from_name("testhiml.idontexist.idontexist")
+    assert "Module testhiml.idontexist was not found" in str(e)


+def test_create_model_config_from_name_duplicates() -> None:
+    config_loader = ModelConfigLoader()
+    config_name = "HelloWorld"
     # if exactly one config is found, expect a LightningContainer to be returned
     container = config_loader.create_model_config_from_name(config_name)
     assert isinstance(container, LightningContainer)
     assert container.model_name == config_name


-def test_config_in_dif_location(tmp_path: Path, hello_config: Any) -> None:
-    himl_root = Path(hello_config.__file__).parent.parent
-    model_name = "HelloContainer"
-    new_config_path = himl_root / "hello_container_to_delete.py"
-    new_config_path.touch()
     # if > 1 config is found matching the model name, an exception should be raised
     hello_config_path = Path(hello_config.__file__)
-    shutil.copyfile(str(hello_config_path), str(new_config_path))
-    config_loader = ModelConfigLoader(model=model_name)
-
-    # Trying to find this config should now cause an exception as it should find it in both "health_ml" and
-    # in "health_ml.configs"
+    # This file must be excluded from coverage reports, check .coveragerc
+    duplicate_config_file = hello_config_path.parent / "temp_config_for_unittests.py"
+    shutil.copyfile(hello_config_path, duplicate_config_file)
     with pytest.raises(Exception) as e:
-        config_loader.create_model_config_from_name(model_name)
-    assert "Multiple instances of model name HelloContainer were found in namespaces: " \
-           "dict_keys(['health_ml.configs.hello_container', 'health_ml.hello_container_to_delete']) " in str(e)
-    new_config_path.unlink()
+        config_loader.create_model_config_from_name(config_name)
+    assert "Multiple instances of model " in str(e)
+    duplicate_config_file.unlink()


-@pytest.mark.parametrize("is_external", [True, False])
-def test_path_to_namespace(is_external: bool) -> None:
+def test_path_to_namespace() -> None:
     """
-    A test to check conversion between namespace to path for InnerEye and external namespaces
+    A test to check conversion between paths and python namespaces.
     """
-    tests_root_dir = tests_root_directory()
-    if is_external:
-        folder_name = "logs"
-        full_folder = tests_root_dir / folder_name
-        assert path_to_namespace(
-            path=full_folder,
-            root=tests_root_dir
-        ) == folder_name
-    else:
-        assert path_to_namespace(
-            path=full_test_data_path(),
-            root=tests_root_dir
-        ) == "test_data"
+    assert path_to_namespace(Path("/foo/bar/baz"), root=Path("/foo")) == "bar.baz"


+def test_config_fully_qualified() -> None:
+    """
+    Test if we can load model configs when giving a full Python namespace.
+    """
+    # This name was deliberately chosen to be outside the default search namespace
+    model_name = "health_ml.utils.config_loader.ModelConfigLoader"
+    config_loader = ModelConfigLoader()
+    model = config_loader.create_model_config_from_name(model_name=model_name)
+    assert type(model).__name__ == "ModelConfigLoader"


+def test_config_fully_qualified_invalid() -> None:
+    """
+    Test error handling when the given namespace is too long.
+    """
+    namespace = "health_ml.utils.config_loader.foo"
+    model_name = namespace + ".Foo"
+    config_loader = ModelConfigLoader()
+    with pytest.raises(ValueError) as ex:
+        config_loader.create_model_config_from_name(model_name=model_name)
+    assert f"Module {namespace} was not found" in str(ex)

@@ -60,6 +60,7 @@ def test_workflow_params_get_effective_random_seed() -> None:
     assert seed == params.random_seed


+@pytest.mark.fast
 def test_validate_dataset_params() -> None:
     # DatasetParams cannot be initialized if neither azure_datasets nor local_datasets is set
     with pytest.raises(ValueError) as ex:

@@ -78,11 +79,11 @@ def test_validate_dataset_params() -> None:
     # local datasets and dataset_mountpoints must be Paths
     with pytest.raises(Exception) as e:
         DatasetParams(local_datasets=["foo"])
-    assert "is not an instance of" in str(e)
+    assert "items must be instances of type <class 'pathlib.Path'>" in str(e)

     with pytest.raises(Exception) as e:
         DatasetParams(dataset_mountpoints=["foo"])
-    assert "is not an instance of" in str(e)
+    assert "items must be instances of type <class 'pathlib.Path'>" in str(e)

     # The following should be okay
     DatasetParams(local_datasets=[Path("foo")]).validate()

@@ -5,7 +5,7 @@ from unittest.mock import MagicMock, patch, Mock
 from pytorch_lightning import Callback, Trainer
 from pytorch_lightning.callbacks import GradientAccumulationScheduler, ModelCheckpoint, ModelSummary, TQDMProgressBar

-from health_ml.configs.hello_container import HelloContainer  # type: ignore
+from health_ml.configs.hello_world import HelloWorld  # type: ignore
 from health_ml.lightning_container import LightningContainer
 from health_ml.model_trainer import (create_lightning_trainer, write_experiment_summary_file, model_train)
 from health_ml.utils.common_utils import EXPERIMENT_SUMMARY_FILE

@@ -65,8 +65,8 @@ def test_create_lightning_trainer_with_callbacks() -> None:
         callbacks = [MyCallback()]
         return {"callbacks": callbacks}

-    model_name = "HelloContainer"
-    model_config_loader = ModelConfigLoader(model=model_name)
+    model_name = "HelloWorld"
+    model_config_loader = ModelConfigLoader()
     container = model_config_loader.create_model_config_from_name(model_name)
     container.monitor_gpu = False
     container.monitor_loading = False

@@ -88,7 +88,7 @@ def test_create_lightning_trainer_with_callbacks() -> None:


 def test_model_train() -> None:
-    container = HelloContainer()
+    container = HelloWorld()
     container.create_lightning_module_and_store()

     with patch.object(container, "get_data_module"):

@@ -5,7 +5,7 @@ import pytest
 from typing import Generator, Tuple
 from unittest.mock import patch

-from health_ml.configs.hello_container import HelloContainer
+from health_ml.configs.hello_world import HelloWorld
 from health_ml.experiment_config import ExperimentConfig
 from health_ml.lightning_container import LightningContainer
 from health_ml.run_ml import MLRunner

@@ -34,7 +34,7 @@ def ml_runner() -> Generator:
 @pytest.fixture(scope="module")
 def ml_runner_with_container() -> Generator:
     experiment_config = ExperimentConfig(model="HelloContainer")
-    container = HelloContainer()
+    container = HelloWorld()
     runner = MLRunner(experiment_config=experiment_config, container=container)
     runner.setup()
     yield runner

@@ -92,10 +92,12 @@ def test_run_inference(ml_runner_with_container: MLRunner, tmp_path: Path) -> None:
     """
     Test that run_inference gets called as expected.
     """
-    def _expected_files_exist() -> int:
+    def _expected_files_exist() -> bool:
         output_dir = ml_runner_with_container.container.outputs_folder
-        expected_files = [Path("test_mse.txt"), Path("test_mae.txt")]
-        return sum([p.exists() for p in expected_files] + [output_dir.is_dir()])
+        if not output_dir.is_dir():
+            return False
+        expected_files = ["test_mse.txt", "test_mae.txt"]
+        return all([(output_dir / p).exists() for p in expected_files])

     # create the test data
     import numpy as np

@@ -112,7 +114,7 @@ def test_run_inference(ml_runner_with_container: MLRunner, tmp_path: Path) -> None:
     assert not expected_ckpt_path.exists()
     # update the container to look for test data at this location
     ml_runner_with_container.container.local_dataset_dir = tmp_path
-    assert _expected_files_exist() == 0
+    assert not _expected_files_exist()

     actual_train_ckpt_path = ml_runner_with_container.checkpoint_handler.get_recovery_or_checkpoint_path_train()
     assert actual_train_ckpt_path is None

@@ -123,8 +125,8 @@ def test_run_inference(ml_runner_with_container: MLRunner, tmp_path: Path) -> None:
     actual_test_ckpt_path = ml_runner_with_container.checkpoint_handler.get_checkpoints_to_test()
     assert actual_test_ckpt_path == [expected_ckpt_path]
     assert actual_test_ckpt_path[0].exists()
-    # After training, the outputs directory should now exist
-    assert _expected_files_exist() == 3
+    # After training, the outputs directory should now exist and contain the 2 error files
+    assert _expected_files_exist()

     # if no checkpoint handler, no checkpoint paths will be saved and these are required for
     # inference so ValueError will be raised

@@ -132,6 +134,3 @@ def test_run_inference(ml_runner_with_container: MLRunner, tmp_path: Path) -> None:
         ml_runner_with_container.checkpoint_handler = None  # type: ignore
         ml_runner_with_container.run()
     assert "expects exactly 1 checkpoint for inference, but got 0" in str(e)
-
-    Path("test_mae.txt").unlink()
-    Path("test_mse.txt").unlink()

@@ -8,8 +8,11 @@ from typing import List, Optional
 from unittest.mock import patch, MagicMock

 import pytest
+from azureml.train.hyperdrive import HyperDriveConfig

 from health_azure import AzureRunInfo, DatasetConfig
+from health_ml.configs.hello_world import HelloWorld
+from health_ml.deep_learning_config import WorkflowParams
 from health_ml.lightning_container import LightningContainer
 from health_ml.runner import Runner

@@ -21,12 +24,12 @@ def mock_runner(tmp_path: Path) -> Runner:

 @pytest.mark.parametrize("model_name, cluster, num_nodes, should_raise_value_error", [
-    ("HelloContainer", "dummyCluster", 1, False),
+    ("HelloWorld", "dummyCluster", 1, False),
     ("", "", None, True),
-    ("HelloContainer", "", None, False),
+    ("HelloWorld", "", None, False),
     ("a", None, 0, True),
     (None, "b", 10, True),
-    ("HelloContainer", "b", 10, False)
+    ("HelloWorld", "b", 10, False)
 ])
 def test_parse_and_load_model(mock_runner: Runner, model_name: Optional[str], cluster: Optional[str],
                               num_nodes: Optional[int], should_raise_value_error: bool) -> None:

@@ -63,7 +66,7 @@ def test_parse_and_load_model(mock_runner: Runner, model_name: Optional[str], cluster: Optional[str],


 def test_run(mock_runner: Runner) -> None:
-    model_name = "HelloContainer"
+    model_name = "HelloWorld"
     arguments = ["", f"--model={model_name}"]
     with patch("health_ml.runner.Runner.run_in_situ") as mock_run_in_situ:
         with patch("health_ml.runner.get_workspace"):

@@ -113,3 +116,94 @@ def test_submit_to_azureml_if_needed(mock_get_workspace: MagicMock,
     assert run_info.input_datasets == []
     assert run_info.is_running_in_azure_ml is False
     assert run_info.output_folder is None
+
+
+def test_crossvalidation_flag() -> None:
+    """
+    Checks the basic use of the flags that trigger cross validation.
+    """
+    container = HelloWorld()
+    assert not container.is_crossvalidation_enabled
+    container.crossval_count = 2
+    assert container.is_crossvalidation_enabled
+    container.validate()
+    # Validation should fail if the cross validation index is out of bounds
+    container.crossval_index = container.crossval_count
+    with pytest.raises(ValueError):
+        container.validate()
+
+
+def test_crossval_config() -> None:
+    """
+    Check if the flags to trigger Hyperdrive runs work as expected.
+    """
+    mock_tuning_config = "foo"
+    container = HelloWorld()
+    with patch("health_ml.configs.hello_world.HelloWorld.get_parameter_tuning_config",
+               return_value=mock_tuning_config):
+        # Without any flags set, no Hyperdrive config should be returned
+        assert container.get_hyperdrive_config() is None
+        # To trigger a hyperparameter search, the commandline flag for hyperdrive must be present
+        container.hyperdrive = True
+        assert container.get_hyperdrive_config() == mock_tuning_config
+        # Triggering cross validation works by just setting crossval_count
+        container.hyperdrive = False
+        container.crossval_count = 2
+        assert container.is_crossvalidation_enabled
+        crossval_config = container.get_hyperdrive_config()
+        assert isinstance(crossval_config, HyperDriveConfig)
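For reference, a cross-validation parameter space of the kind asserted here can be written out with the AzureML SDK directly. This is a sketch, not necessarily the exact wiring inside get_hyperdrive_config (GridParameterSampling and choice are azureml.train.hyperdrive APIs; the argument name crossval_index matches WorkflowParams.CROSSVAL_INDEX_ARG_NAME used below):

    from azureml.train.hyperdrive import GridParameterSampling, choice

    crossval_count = 2
    # Grid sampling over the fold index: one child run per value 0..crossval_count-1
    sampling = GridParameterSampling({"crossval_index": choice(*range(crossval_count))})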
+
+
+def test_crossval_argument_names() -> None:
+    """
+    Cross validation uses hardcoded argument names; check that they match the field names.
+    """
+    container = HelloWorld()
+    crossval_count = 8
+    crossval_index = 5
+    container.crossval_count = crossval_count
+    container.crossval_index = crossval_index
+    assert getattr(container, container.CROSSVAL_INDEX_ARG_NAME) == crossval_index
+
+
+def test_submit_to_azure_hyperdrive(mock_runner: Runner) -> None:
+    """
+    Test if the hyperdrive configurations are passed to the submission function.
+    """
+    model_name = "HelloWorld"
+    crossval_count = 2
+    arguments = ["", f"--model={model_name}", "--cluster=foo", "--crossval_count", str(crossval_count)]
+    with patch("health_ml.runner.Runner.run_in_situ") as mock_run_in_situ:
+        with patch("health_ml.runner.get_workspace"):
+            with patch.object(sys, "argv", arguments):
+                with patch("health_ml.runner.submit_to_azure_if_needed") as mock_submit_to_aml:
+                    mock_runner.run()
+    mock_run_in_situ.assert_called_once()
+    mock_submit_to_aml.assert_called_once()
+    # call_args is a tuple of (args, kwargs)
+    call_kwargs = mock_submit_to_aml.call_args[1]
+    # Submission to AzureML should have been turned on because a cluster name was supplied
+    assert mock_runner.experiment_config.azureml
+    assert call_kwargs["submit_to_azureml"]
+    # Check details of the Hyperdrive config
+    hyperdrive_config = call_kwargs["hyperdrive_config"]
+    parameter_space = hyperdrive_config._generator_config["parameter_space"]
+    assert parameter_space[WorkflowParams.CROSSVAL_INDEX_ARG_NAME] == ["choice", [list(range(crossval_count))]]
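The expected value in the assertion above is HyperDrive's serialized form of a grid choice; the nesting is easy to misread, so spelled out (pure Python, no AzureML needed):

    crossval_count = 2
    # ["choice", [values]] where values is the single list of admissible fold indices
    assert ["choice", [list(range(crossval_count))]] == ["choice", [[0, 1]]]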
+
+
+def test_run_hello_world(mock_runner: Runner) -> None:
+    """
+    Test running a model end-to-end via the commandline runner.
+    """
+    model_name = "HelloWorld"
+    arguments = ["", f"--model={model_name}"]
+    with patch("health_ml.runner.get_workspace") as mock_get_workspace:
+        with patch.object(sys, "argv", arguments):
+            mock_runner.run()
+    # get_workspace should not be called when using the runner outside AzureML, to not go through the
+    # time-consuming auth
+    mock_get_workspace.assert_not_called()
+    # Summary.txt is written at start, the other files during inference
+    expected_files = ["experiment_summary.txt", "test_mae.txt", "test_mse.txt"]
+    for file in expected_files:
+        assert (mock_runner.lightning_container.outputs_folder / file).is_file(), f"Missing file: {file}"

@@ -14,16 +14,26 @@
     "reportPrivateImportUsage": false,
     "executionEnvironments": [
         {
-            "root": "hi-ml/src"
+            "root": "hi-ml/src",
+            "extraPaths": [
+                "hi-ml-azure/src"
+            ]
         },
         {
-            "root": "hi-ml/testhiml"
+            "root": "hi-ml/testhiml",
+            "extraPaths": [
+                "hi-ml-azure/src",
+                "hi-ml/src",
+            ]
         },
         {
             "root": "hi-ml-azure/src"
         },
         {
-            "root": "hi-ml-azure/testazure"
+            "root": "hi-ml-azure/testazure",
+            "extraPaths": [
+                "hi-ml-azure/src",
+            ]
         }
     ]
 }

@@ -1,30 +1,10 @@
-coverage==5.5
-conda-merge==0.1.5
-flake8==3.8.4
-gitpython==3.1.7
-lightning-bolts==0.4.0
-matplotlib==3.3.0
-monai==0.6.0
-more-itertools==8.10.0
-mypy==0.910
-opencv-python-headless==4.5.1.48
-pandas==1.3.4
-param==1.9.3
-pillow==9.0.0
-pydicom==2.0.0
-pylint==2.9.5
+black==22.1.0
+coverage==6.3.2
+flake8==4.0.1
+mypy==0.931
+pylint==2.12.2
 pycobertura==2.0.1
 pytest==6.2.2
 pytest-cov==2.11.1
 pytest-timeout==2.0.1
-pytorch-lightning==1.5.5
-ruamel.yaml==0.16.12
-rpdb==0.1.6
-scikit-learn==1.0
-seaborn==0.10.1
-simpleitk==1.2.4
-torch==1.10.0
-torchmetrics==0.6.0
-torchvision==0.11.1
-types-requests==2.25.6
-yacs==0.1.8
+scikit-learn  # This is needed to make pyright pass on the docs folder