Fixing omp
Parent: 215632bf51
Commit: eabd0e37d8
Makefile | 2 +-
@@ -449,7 +449,7 @@ $(CNTKLIBRARY_LIB): $(CNTKLIBRARY_OBJ) | $(CNTKMATH_LIB)
 	@echo $(SEPARATOR)
 	@mkdir -p $(dir $@)
 	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(PROTOBUF_PATH)/lib/libprotobuf.a
+	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(PROTOBUF_PATH)/lib/libprotobuf.a -fopenmp
 
 ########################################
 # CNTKLibrary tests
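The only change in this hunk is the trailing -fopenmp on the link line: the library's objects are compiled with OpenMP pragmas, and for GCC/Clang the same flag must also be passed at link time (it pulls in the OpenMP runtime, libgomp for GCC), otherwise the shared library fails to link with unresolved omp_*/GOMP_* symbols. A minimal standalone illustration of that dependency (not CNTK code):

    // omp_link_check.cpp -- build with: g++ -fopenmp omp_link_check.cpp
    // Dropping -fopenmp from the link step leaves omp_*/GOMP_* unresolved.
    #include <omp.h>
    #include <cstdio>

    int main()
    {
        #pragma omp parallel
        {
            // Each OpenMP thread reports its id; the count comes from the runtime.
            std::printf("thread %d of %d\n", omp_get_thread_num(), omp_get_num_threads());
        }
        return 0;
    }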
@@ -3539,8 +3539,9 @@ namespace CNTK
         Variable m_testSampleCountVar;
         LearnersPtr m_parameterLearners;
         bool m_distributed;
+        ValuePtr m_rootGradientValue;
 
         size_t m_prevMinibatchNumSamples;
         ValuePtr m_prevMinibatchAggregateTrainingLossValue;
         ValuePtr m_prevMinibatchAggregateEvalCriterionValue;
     };
@@ -243,6 +243,7 @@ namespace CNTK
         CNTK_API void ForceSynchronousCUDAKernelExecutions();
 
         CNTK_API void ForceDeterministicAlgorithms();
+        CNTK_API bool ShouldForceDeterministicAlgorithms();
 
         CNTK_API void SetFixedRandomSeed(unsigned long fixedRandomSeed);
 
@@ -259,5 +260,10 @@ namespace CNTK
         CNTK_API bool AreEqual(const ::CNTK::Value& value1, const ::CNTK::Value& value2, double relativeTolerance = 0.0, double absoluteTolerance = 0.0);
 
         class VariableResolver;
+
+        ///
+        /// Returns true if num CPU Threads was set.
+        ///
+        bool MaxNumCPUThreadsSet();
     }
 }
@@ -61,6 +61,7 @@
       <AdditionalIncludeDirectories>.\API;.\proto;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\Readers\ReaderLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude);$(ProtobufInclude)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(CNTK_ENABLE_1BitSGD)'=='true'">$(SolutionDir)Source\1BitSGD;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions Condition="'$(CNTK_ENABLE_1BitSGD)'=='true'">CNTK_PARALLEL_TRAINING_SUPPORT;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <OpenMPSupport>true</OpenMPSupport>
     </ClCompile>
     <Link>
       <AdditionalLibraryDirectories>$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\Math;$(MSMPI_LIB64);$(SolutionDir)$(Platform)\$(Configuration);$(NvmlLibPath);$(ProtobufLibPath)</AdditionalLibraryDirectories>
@@ -97,7 +98,6 @@
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <PreprocessorDefinitions>CNTKV2LIBRARYDLL;WIN32;NDEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <SDLCheck>true</SDLCheck>
-      <OpenMPSupport>false</OpenMPSupport>
       <AdditionalOptions>/d2Zi+ /bigobj %(AdditionalOptions)</AdditionalOptions>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
       <TreatWarningAsError>true</TreatWarningAsError>
@@ -187,4 +187,4 @@
   <Target Name="CheckDependencies">
     <Warning Condition="!$(HasProtobuf)" Text="CNTKv2LibraryDll requires Protocol Buffers to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#protobuf for installation instructions." />
   </Target>
-</Project>
\ No newline at end of file
+</Project>
@@ -351,6 +351,17 @@ namespace CNTK
        {
            Microsoft::MSR::CNTK::Globals::ForceDeterministicAlgorithms();
        }
+
+       bool ShouldForceDeterministicAlgorithms()
+       {
+           return Microsoft::MSR::CNTK::Globals::ShouldForceDeterministicAlgorithms();
+       }
+
+       static std::atomic<bool> s_theadsAreSet(false);
+       bool MaxNumCPUThreadsSet()
+       {
+           return s_theadsAreSet;
+       }
    }
 
    /*static*/ const NDShape NDShape::Unknown(1, SentinelDimValueForUnknownShape);
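The new s_theadsAreSet flag (identifier spelled as in the source) records whether SetMaxNumCPUThreads was ever called, so the Trainer constructor further down can apply a default without overriding an explicit user setting. A minimal sketch of this set-once pattern, with illustrative names rather than CNTK's:

    #include <atomic>

    static std::atomic<bool> s_userSetThreads(false); // hypothetical analogue of s_theadsAreSet

    void SetThreads(int n)
    {
        (void)n; // a real implementation would configure the thread runtime here
        s_userSetThreads = true;
    }

    bool UserSetThreads() { return s_userSetThreads; }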
@@ -490,21 +501,19 @@ namespace CNTK
         return s_allStaticAxes;
     }
 
-
     void Axis::RegisterAxisName(const std::wstring& axisName)
     {
         s_uniqueDynamicAxisNames.RegisterAxisName(axisName);
     }
 
-    std::atomic<size_t> s_maxNumCPUThreads(std::thread::hardware_concurrency());
     void SetMaxNumCPUThreads(size_t numCPUThreads)
     {
-        s_maxNumCPUThreads.store(numCPUThreads);
+        Internal::s_theadsAreSet = true;
         Microsoft::MSR::CNTK::CPUMatrix<float>::SetNumThreads((int)numCPUThreads);
     }
 
     size_t GetMaxNumCPUThreads()
     {
-        return s_maxNumCPUThreads.load();
+        return Microsoft::MSR::CNTK::CPUMatrix<float>::GetMaxNumThreads();
     }
 }
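GetMaxNumCPUThreads no longer reads a shadow std::atomic but asks CPUMatrix, and through it (per the CPUMatrix hunk below) the OpenMP runtime, directly. A cached copy can silently go stale whenever another component adjusts the thread count; a small standalone demonstration of that drift (assumes an OpenMP build):

    // build: g++ -fopenmp drift_demo.cpp
    #include <omp.h>
    #include <cstdio>

    int main()
    {
        int cached = omp_get_max_threads(); // snapshot, like the old shadow atomic
        omp_set_num_threads(3);             // some other component adjusts the runtime
        // The snapshot is now stale; querying the runtime reflects the change.
        std::printf("cached=%d, live=%d\n", cached, omp_get_max_threads());
        return 0;
    }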
@@ -7,7 +7,6 @@
 #include "CNTKLibrary.h"
 #include "Utils.h"
 #include "Learner.h"
-
 namespace
 {
     const std::wstring learnersPropertyName = L"Learners";
@@ -28,6 +27,10 @@ namespace CNTK
         m_prevMinibatchNumSamples(1),
         m_distributed(false)
     {
+        // By default we set the number of threads to hardware concurrency.
+        if (!Internal::MaxNumCPUThreadsSet())
+            SetMaxNumCPUThreads(std::thread::hardware_concurrency());
+
         std::vector<Variable> combinedFunctionArgs = { m_model, m_lossFunction };
         if (!m_lossFunction->Output().DynamicAxes().empty())
         {
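The constructor now defaults the CPU thread count to std::thread::hardware_concurrency() unless the user already set one. Worth noting: hardware_concurrency() is only a hint and may return 0 when the value is not computable. A hedged sketch with an explicit guard (the guard is this note's addition, not part of the commit):

    #include <thread>
    #include <cstddef>

    // Choose a default worker count, falling back to 1 when the
    // hardware concurrency is unknown.
    std::size_t DefaultThreadCount()
    {
        std::size_t n = std::thread::hardware_concurrency();
        return n != 0 ? n : 1;
    }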
@@ -214,11 +217,19 @@ namespace CNTK
                 outputsToFetch[outputToFetch.first] = outputs[outputToFetch.first];
         }
 
-        ValuePtr rootGradientValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(m_aggregatedLossFunction->Output().GetDataType(), m_prevMinibatchAggregateTrainingLossValue->Shape(), computeDevice), outputs.at(m_aggregatedLossFunction)->Mask());
+        if(!m_rootGradientValue ||
+            m_aggregatedLossFunction->Output().GetDataType() != m_rootGradientValue->GetDataType() ||
+            m_prevMinibatchAggregateTrainingLossValue->Shape() != m_rootGradientValue->Shape() ||
+            computeDevice != m_rootGradientValue->Device() ||
+            outputs.at(m_aggregatedLossFunction)->Mask() != m_rootGradientValue->Mask())
+        {
+            m_rootGradientValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(m_aggregatedLossFunction->Output().GetDataType(), m_prevMinibatchAggregateTrainingLossValue->Shape(), computeDevice), outputs.at(m_aggregatedLossFunction)->Mask());
+        }
+
         if (m_aggregatedLossFunction->Output().GetDataType() == DataType::Float)
-            rootGradientValue->Data()->SetValue(1.0f);
+            m_rootGradientValue->Data()->SetValue(1.0f);
         else
-            rootGradientValue->Data()->SetValue(1.0);
+            m_rootGradientValue->Data()->SetValue(1.0);
 
         auto modelParameters = m_combinedTrainingFunction->Parameters();
         for (const auto& parameter : modelParameters)
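The local rootGradientValue, previously allocated on every backward pass, becomes the member m_rootGradientValue and is reallocated only when the data type, shape, device, or mask actually changes, keeping per-minibatch allocations off the hot path. A minimal sketch of the same cache-and-reuse pattern, with a hypothetical Buffer type standing in for CNTK's Value/NDArrayView:

    #include <memory>
    #include <vector>
    #include <cstddef>

    struct Buffer
    {
        explicit Buffer(std::size_t n) : data(n) {}
        std::vector<float> data;
    };

    std::shared_ptr<Buffer> g_cached;

    std::shared_ptr<Buffer>& GetScratch(std::size_t n)
    {
        // Reallocate only when the cached buffer is missing or incompatible;
        // otherwise reuse it and skip the allocation entirely.
        if (!g_cached || g_cached->data.size() != n)
            g_cached = std::make_shared<Buffer>(n);
        return g_cached;
    }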
@@ -227,7 +238,7 @@ namespace CNTK
         }
 
         // TODO: Why Backward signature does not take Parameter instead of Variable for gradients?
-        m_combinedTrainingFunction->Backward(backPropSate, { { m_aggregatedLossFunction, rootGradientValue } }, parameterGradients);
+        m_combinedTrainingFunction->Backward(backPropSate, { { m_aggregatedLossFunction, m_rootGradientValue } }, parameterGradients);
         m_prevMinibatchNumSamples = GetSampleCount(m_trainingSampleCountVar, outputs[m_trainingSampleCountVar]);
     }
 
@@ -6077,6 +6077,16 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
     return numThreads;
 }
 
+template <class ElemType>
+int CPUMatrix<ElemType>::GetMaxNumThreads()
+{
+    int numThreads = (int)std::thread::hardware_concurrency();
+#ifdef _OPENMP
+    numThreads = omp_get_max_threads();
+#endif
+    return numThreads;
+}
+
 // To ensure Intel MKL calls return the same results on all Intel or Intel compatible CPUs,
 // the function set CBWR compatible mode.
 template <class ElemType>
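The new GetMaxNumThreads queries omp_get_max_threads() when built with OpenMP and falls back to std::thread::hardware_concurrency() otherwise, so the file still compiles without OpenMP support. A self-contained sketch that builds both ways:

    #include <thread>
    #ifdef _OPENMP
    #include <omp.h>
    #endif

    // Compiles with or without OpenMP: g++ f.cpp  vs.  g++ -fopenmp f.cpp
    int MaxThreads()
    {
        int n = (int)std::thread::hardware_concurrency(); // portable fallback
    #ifdef _OPENMP
        n = omp_get_max_threads(); // honors OMP_NUM_THREADS / omp_set_num_threads
    #endif
        return n;
    }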
@@ -390,6 +390,8 @@ public:
 public:
     // This functions do not depend on <ElemType>, i.e. you can call them on any <ElemType>
     static int SetNumThreads(int numThreads);
+    static int GetMaxNumThreads();
+
     static void SetCompatibleMode();
 
     // static BLAS functions
@@ -54,6 +54,7 @@
       <SDLCheck>true</SDLCheck>
       <TreatWarningAsError>true</TreatWarningAsError>
       <AdditionalIncludeDirectories>$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\Math;$(SolutionDir)Source\Readers\ReaderLib</AdditionalIncludeDirectories>
+      <OpenMPSupport>true</OpenMPSupport>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
@@ -101,4 +102,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
@@ -57,7 +57,7 @@
 %ignore CNTK::Internal::Gather;
 %ignore CNTK::Internal::Scatter;
 %ignore CNTK::Internal::Slice;
-%ignore CNTK::DistributedCommunicator::AggregateAsync;
+%ignore CNTK::Internal::MaxNumCPUThreadsSet;
 
 // These aren't exported from the CNTK C++ library
 %ignore CNTK::Internal::IsReversingTensorShapesInErrorMessagesEnabled;