Fix two perf problems

1. On Linux, use the multi-threaded ACML build (ACML MP) as the default ACML library.
2. For the parallel end-to-end tests, limit the CPU threads used by each instance based on the available hardware threads.
2015-11-04 15:23:02 +00:00 · 2015-11-04 15:23:02 +00:00 · b1f0ad70ba
--- a/2
+++ b/2
@ -111,7 +111,7 @@ endif
 ifeq ("$(MATHLIB)","acml")
  INCLUDEPATH += $(ACML_PATH)/include
  LIBPATH += $(ACML_PATH)/lib
-  LIBS += -lacml -lm -lpthread
+  LIBS += -lacml_mp -liomp5 -lm -lpthread
  CPPFLAGS += -DUSE_ACML
 endif
--- a/Tests/ParallelTraining/NoQuantization/DoublePrecision/run-test
+++ b/Tests/ParallelTraining/NoQuantization/DoublePrecision/run-test
@ -4,9 +4,11 @@
 ConfigDir=$TEST_DIR/../..
 LogFileName=stderr
 Instances=4
 NumCPUThreads=$(threadsPerInstance $Instances)
 # cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
-cntkmpirun "-n 4" SimpleMultiGPU.config "precision=double SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=64]]]]"
+cntkmpirun "-n $Instances" SimpleMultiGPU.config "numCPUThreads=$NumCPUThreads precision=double SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=64]]]]"
 ExitCode=$?
 sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank0
 sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank1
--- a/Tests/ParallelTraining/NoQuantization/SinglePrecision/run-test
+++ b/Tests/ParallelTraining/NoQuantization/SinglePrecision/run-test
@ -4,9 +4,11 @@
 ConfigDir=$TEST_DIR/../..
 LogFileName=stderr
 Instances=4
 NumCPUThreads=$(threadsPerInstance $Instances)
 # cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
-cntkmpirun "-n 4" SimpleMultiGPU.config "precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]"
+cntkmpirun "-n $Instances" SimpleMultiGPU.config "numCPUThreads=$NumCPUThreads precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]"
 ExitCode=$?
 sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank0
 sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank1
--- a/Tests/Speech/DNN/DiscriminativePreTraining/run-test
+++ b/Tests/Speech/DNN/DiscriminativePreTraining/run-test
@ -1,7 +1,5 @@
 #!/bin/bash
 #!/bin/bash
 . $TEST_ROOT_DIR/run-test-common
 # cntkrun <CNTK config file name> <additional CNTK args>
--- a/Tests/Speech/DNN/Parallel1BitQuantization/run-test
+++ b/Tests/Speech/DNN/Parallel1BitQuantization/run-test
@ -4,9 +4,11 @@
 ConfigDir=$TEST_DIR/..
 LogFileName=stderr
 Instances=3
 NumCPUThreads=$(threadsPerInstance $Instances)
 # cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
-cntkmpirun "-n 3" cntk.config "precision=double speechTrain=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=1]]]] speechTrain=[SGD=[ParallelTrain=[parallelizationStartEpoch=2]]]"
+cntkmpirun "-n $Instances" cntk.config "numCPUThreads=$NumCPUThreads precision=double speechTrain=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=1]]]] speechTrain=[SGD=[ParallelTrain=[parallelizationStartEpoch=2]]]"
 ExitCode=$?
 sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0
 sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1
--- a/Tests/Speech/DNN/ParallelNoQuantization/run-test
+++ b/Tests/Speech/DNN/ParallelNoQuantization/run-test
@ -4,9 +4,11 @@
 ConfigDir=$TEST_DIR/..
 LogFileName=stderr
 Instances=3
 NumCPUThreads=$(threadsPerInstance $Instances)
 # cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
-cntkmpirun "-n 3" cntk.config
+cntkmpirun "-n $Instances" cntk.config "numCPUThreads=$NumCPUThreads"
 ExitCode=$?
 sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0
 sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1
--- a/Tests/run-test-common
+++ b/Tests/run-test-common
@ -88,6 +88,14 @@ cntkrun()
  return $?
 }
 # Given the number of instances, print the number of hardware threads each
 # instance may use: the processor count reported by nproc divided by the
 # instance count (integer division), but never less than 1.
 # Arguments: $1 - number of instances; a missing, non-numeric or zero value
 #                 is treated as 1 instead of aborting with an arithmetic error
 # Outputs:   the per-instance thread count on stdout
 threadsPerInstance()
 {
   local instances=${1:-1}
   local total threads
   # Anything but a positive decimal integer would break the division below.
   if ! [[ $instances =~ ^[0-9]+$ ]] || (( 10#$instances == 0 )); then
     instances=1
   fi
   # Fall back to a single hardware thread when nproc is missing or fails.
   total=$(nproc 2>/dev/null) || total=1
   [[ $total =~ ^[0-9]+$ ]] || total=1
   # 10# forces base 10 so a count such as "08" is not parsed as octal.
   threads=$(( total / 10#$instances ))
   if (( threads == 0 )); then
     echo 1
   else
     echo "$threads"
   fi
 }
 # Function for launching a parallel CNTK run with MPI
 # cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
 cntkmpirun()
--- a/2
+++ b/2
@ -41,7 +41,7 @@ mathlib=
 default_path_list="/usr /usr/local /opt /opt/local"
 # List from best to worst choice
-default_acmls="acml5.3.1/ifort64"
+default_acmls="acml5.3.1/ifort64_mp"
 default_mkls=""
 # NOTE: Will get compilation errors with cuda-6.0