From b1f0ad70ba7901ecab7d3e4d1ea601599cd6bd2a Mon Sep 17 00:00:00 2001
From: Mark Hillebrand
Date: Wed, 4 Nov 2015 15:23:02 +0000
Subject: [PATCH] Fix two perf problems

1. On Linux, use ACML MP as default ACML library.
2. For the parallel end-to-end tests, limit concurrency based on available
   hardware threads.
---
 Makefile                                            | 2 +-
 .../NoQuantization/DoublePrecision/run-test         | 4 +++-
 .../NoQuantization/SinglePrecision/run-test         | 4 +++-
 Tests/Speech/DNN/DiscriminativePreTraining/run-test | 2 --
 Tests/Speech/DNN/Parallel1BitQuantization/run-test  | 4 +++-
 Tests/Speech/DNN/ParallelNoQuantization/run-test    | 4 +++-
 Tests/run-test-common                               | 8 ++++++++
 configure                                           | 2 +-
 8 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index a8401da98..5a411b15d 100644
--- a/Makefile
+++ b/Makefile
@@ -111,7 +111,7 @@ endif
 ifeq ("$(MATHLIB)","acml")
   INCLUDEPATH += $(ACML_PATH)/include
   LIBPATH += $(ACML_PATH)/lib
-  LIBS += -lacml -lm -lpthread
+  LIBS += -lacml_mp -liomp5 -lm -lpthread
   CPPFLAGS += -DUSE_ACML
 endif
 
diff --git a/Tests/ParallelTraining/NoQuantization/DoublePrecision/run-test b/Tests/ParallelTraining/NoQuantization/DoublePrecision/run-test
index ad6def657..8e322e875 100755
--- a/Tests/ParallelTraining/NoQuantization/DoublePrecision/run-test
+++ b/Tests/ParallelTraining/NoQuantization/DoublePrecision/run-test
@@ -4,9 +4,11 @@
 ConfigDir=$TEST_DIR/../..
 
 LogFileName=stderr
 
+Instances=4
+NumCPUThreads=$(threadsPerInstance $Instances)
 # cntkmpirun
-cntkmpirun "-n 4" SimpleMultiGPU.config "precision=double SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=64]]]]"
+cntkmpirun "-n $Instances" SimpleMultiGPU.config "numCPUThreads=$NumCPUThreads precision=double SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=64]]]]"
 ExitCode=$?
 sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank0
 sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank1
diff --git a/Tests/ParallelTraining/NoQuantization/SinglePrecision/run-test b/Tests/ParallelTraining/NoQuantization/SinglePrecision/run-test
index 8fa1ffccb..fad087b22 100755
--- a/Tests/ParallelTraining/NoQuantization/SinglePrecision/run-test
+++ b/Tests/ParallelTraining/NoQuantization/SinglePrecision/run-test
@@ -4,9 +4,11 @@
 ConfigDir=$TEST_DIR/../..
 
 LogFileName=stderr
 
+Instances=4
+NumCPUThreads=$(threadsPerInstance $Instances)
 # cntkmpirun
-cntkmpirun "-n 4" SimpleMultiGPU.config "precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]"
+cntkmpirun "-n $Instances" SimpleMultiGPU.config "numCPUThreads=$NumCPUThreads precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]"
 ExitCode=$?
 sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank0
 sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank1
diff --git a/Tests/Speech/DNN/DiscriminativePreTraining/run-test b/Tests/Speech/DNN/DiscriminativePreTraining/run-test
index 358e9ba85..5209af7fa 100755
--- a/Tests/Speech/DNN/DiscriminativePreTraining/run-test
+++ b/Tests/Speech/DNN/DiscriminativePreTraining/run-test
@@ -1,7 +1,5 @@
 #!/bin/bash
 
-#!/bin/bash
-
 . $TEST_ROOT_DIR/run-test-common
 
 # cntkrun
diff --git a/Tests/Speech/DNN/Parallel1BitQuantization/run-test b/Tests/Speech/DNN/Parallel1BitQuantization/run-test
index 223850cff..3df9a4777 100755
--- a/Tests/Speech/DNN/Parallel1BitQuantization/run-test
+++ b/Tests/Speech/DNN/Parallel1BitQuantization/run-test
@@ -4,9 +4,11 @@
 ConfigDir=$TEST_DIR/..
 
 LogFileName=stderr
 
+Instances=3
+NumCPUThreads=$(threadsPerInstance $Instances)
 # cntkmpirun
-cntkmpirun "-n 3" cntk.config "precision=double speechTrain=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=1]]]] speechTrain=[SGD=[ParallelTrain=[parallelizationStartEpoch=2]]]"
+cntkmpirun "-n $Instances" cntk.config "numCPUThreads=$NumCPUThreads precision=double speechTrain=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=1]]]] speechTrain=[SGD=[ParallelTrain=[parallelizationStartEpoch=2]]]"
 ExitCode=$?
 sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0
 sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1
diff --git a/Tests/Speech/DNN/ParallelNoQuantization/run-test b/Tests/Speech/DNN/ParallelNoQuantization/run-test
index 86f9c57d3..315112218 100755
--- a/Tests/Speech/DNN/ParallelNoQuantization/run-test
+++ b/Tests/Speech/DNN/ParallelNoQuantization/run-test
@@ -4,9 +4,11 @@
 ConfigDir=$TEST_DIR/..
 
 LogFileName=stderr
 
+Instances=3
+NumCPUThreads=$(threadsPerInstance $Instances)
 # cntkmpirun
-cntkmpirun "-n 3" cntk.config
+cntkmpirun "-n $Instances" cntk.config "numCPUThreads=$NumCPUThreads"
 ExitCode=$?
 sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0
 sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1
diff --git a/Tests/run-test-common b/Tests/run-test-common
index 32ee39352..66c581a46 100755
--- a/Tests/run-test-common
+++ b/Tests/run-test-common
@@ -88,6 +88,14 @@ cntkrun()
   return $?
 }
 
+# Given number of instances, return number of hardware threads we can use per
+# instance
+threadsPerInstance()
+{
+  local threads=$((`nproc` / $1))
+  [[ $threads -eq 0 ]] && echo 1 || echo $threads
+}
+
 # Function for launching a parallel CNTK run with MPI
 # cntkmpirun
 cntkmpirun()
diff --git a/configure b/configure
index 2a7bdbfc8..e59b53a9e 100755
--- a/configure
+++ b/configure
@@ -41,7 +41,7 @@ mathlib=
 default_path_list="/usr /usr/local /opt /opt/local"
 
 # List from best to worst choice
-default_acmls="acml5.3.1/ifort64"
+default_acmls="acml5.3.1/ifort64_mp"
 default_mkls=""
 
 # NOTE: Will get compilation errors with cuda-6.0
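
For reference, a minimal standalone sketch of how the threadsPerInstance
helper introduced in Tests/run-test-common behaves, assuming nproc reports
the number of available hardware threads: the thread count divided by the
number of MPI instances becomes the per-instance numCPUThreads cap, and the
helper falls back to 1 when instances outnumber threads.

    #!/bin/bash
    # Sketch of the helper added to Tests/run-test-common: given the number
    # of MPI instances ($1), print how many hardware threads each instance
    # may use (at least 1).
    threadsPerInstance()
    {
        local threads=$((`nproc` / $1))
        [[ $threads -eq 0 ]] && echo 1 || echo $threads
    }

    # Example: with 24 hardware threads and 3 instances this prints 8;
    # with 2 hardware threads and 3 instances it prints 1.
    threadsPerInstance 3

The other change switches the default Linux ACML link from the serial
library to the multi-threaded one (acml_mp, ifort64_mp build), presumably
the reason the Intel OpenMP runtime (-liomp5) is now linked as well.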