Add `--pl_deterministic` to build training jobs (#605)

2021-12-06 19:28:23 +00:00 · 2021-12-06 19:28:23 +00:00 · f5b7298c57
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -60,6 +60,7 @@ gets uploaded to AzureML, by skipping all test folders.
 - ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
 - ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to new version of hi-ml package
 - ([#596](https://github.com/microsoft/InnerEye-DeepLearning/pull/596)) Add `cudatoolkit=11.1` specification to environment.yml.
+- ([#605](https://github.com/microsoft/InnerEye-DeepLearning/pull/605)) Make build jobs deterministic for regression testing.

 ### Fixed
 - ([#593](https://github.com/microsoft/InnerEye-DeepLearning/pull/593)) Bug fix for hi-ml 0.1.11 issue (#130): empty mount point is turned into ".", which fails the AML job
--- a/RegressionTestResults/PR_BasicModel2Epochs/OUTPUT/Train/epoch_metrics.csv
+++ b/RegressionTestResults/PR_BasicModel2Epochs/OUTPUT/Train/epoch_metrics.csv
@@ -1,3 +1,3 @@
 subject_count,loss,learning_rate,Dice/AverageAcrossStructures,Dice/spinalcord,Dice/lung_r,Dice/lung_l,VoxelCount/spinalcord,VoxelCount/lung_r,VoxelCount/lung_l,epoch,cross_validation_split_index
 2.000000,0.718559,0.000100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,98256.000000,0,-1
-2.000000,0.792988,0.000090,0.000000,0.000000,0.000000,0.000000,0.000000,43307.000000,13992.500000,1,-1
+2.000000,0.792989,0.000090,0.000000,0.000000,0.000000,0.000000,0.000000,43307.000000,13992.500000,1,-1
--- a/azure-pipelines/build-pr.yml
+++ b/azure-pipelines/build-pr.yml
@@ -7,7 +7,7 @@ name: PR-$(Date:yyyyMMdd)$(Rev:-r)
 variables:
  model: 'BasicModel2Epochs'
  train: 'True'
-  more_switches: '--log_level=DEBUG'
+  more_switches: '--log_level=DEBUG --pl_deterministic'
  run_recovery_id: ''
  tag: ''
  number_of_cross_validation_splits: 0
@@ -34,7 +34,7 @@ jobs:
      - name: tag
        value: 'TrainBasicModel'
      - name: more_switches
-        value: '--log_level=DEBUG --use_dataset_mount=True --regression_test_folder=RegressionTestResults/PR_BasicModel2Epochs'
+        value: '--log_level=DEBUG --pl_deterministic --use_dataset_mount=True --regression_test_folder=RegressionTestResults/PR_BasicModel2Epochs'
    pool:
      vmImage: 'ubuntu-18.04'
    steps:
@@ -98,7 +98,7 @@ jobs:
      - name: tag
        value: 'TrainEnsemble'
      - name: more_switches
-        value: '--regression_test_folder=RegressionTestResults/PR_TrainEnsemble'
+        value: '--pl_deterministic --regression_test_folder=RegressionTestResults/PR_TrainEnsemble'
    pool:
      vmImage: 'ubuntu-18.04'
    steps:
@@ -120,7 +120,7 @@ jobs:
      - name: tag
        value: 'Train2Nodes'
      - name: more_switches
-        value: '--log_level=DEBUG --num_nodes=2 --regression_test_folder=RegressionTestResults/PR_Train2Nodes'
+        value: '--log_level=DEBUG --pl_deterministic --num_nodes=2 --regression_test_folder=RegressionTestResults/PR_Train2Nodes'
    pool:
      vmImage: 'ubuntu-18.04'
    steps:
@@ -158,7 +158,7 @@ jobs:
      - name: tag
        value: 'HelloContainerPR'
      - name: more_switches
-        value: '--num_nodes=2 --max_num_gpus=2 --regression_test_folder=RegressionTestResults/PR_HelloContainer'
+        value: '--pl_deterministic --num_nodes=2 --max_num_gpus=2 --regression_test_folder=RegressionTestResults/PR_HelloContainer'
    pool:
      vmImage: 'ubuntu-18.04'
    steps:
@@ -182,7 +182,7 @@ jobs:
      - name: tag
        value: 'LungPR'
      - name: more_switches
-        value: '--num_epochs=1 --feature_channels=16 --show_patch_sampling=0 --train_batch_size=4 --inference_on_val_set=False --inference_on_test_set=False '
+        value: '--pl_deterministic --num_epochs=1 --feature_channels=16 --show_patch_sampling=0 --train_batch_size=4 --inference_on_val_set=False --inference_on_test_set=False '
    pool:
      vmImage: 'ubuntu-18.04'
    steps: