### Description <!-- Describe your changes. --> See #19921. This change only addresses one review comment: https://github.com/microsoft/onnxruntime/pull/19921#discussion_r1543398640 Because this is an external branch, another pull request had to be opened for it. ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. --> --------- Co-authored-by: Sai Kishan Pampana <sai.kishan.pampana@intel.com> Co-authored-by: rachguo <rachguo@rachguos-Mini.attlocal.net> Co-authored-by: Jian Chen <cjian@microsoft.com>
This commit is contained in:
Родитель
12e2538065
Коммит
19793de1b3
|
@ -32,7 +32,7 @@ limitations under the License.
|
|||
#include "core/common/span_utils.h"
|
||||
#include "core/platform/env.h"
|
||||
#include "core/platform/scoped_resource.h"
|
||||
#if defined(_M_X64) && !defined(_M_ARM64EC) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH)
|
||||
#if defined(_M_X64) && !defined(_M_ARM64EC)
|
||||
#include "core/platform/windows/hardware_core_enumerator.h"
|
||||
#endif
|
||||
#include <unsupported/Eigen/CXX11/ThreadPool>
|
||||
|
@ -252,7 +252,7 @@ void WindowsEnv::SleepForMicroseconds(int64_t micros) const {
|
|||
}
|
||||
|
||||
// EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option.
|
||||
#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH)
|
||||
#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID)
|
||||
static constexpr std::array<int, 3> kVendorID_Intel = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI"
|
||||
#endif
|
||||
int WindowsEnv::DefaultNumCores() {
|
||||
|
@ -261,7 +261,7 @@ int WindowsEnv::DefaultNumCores() {
|
|||
|
||||
int WindowsEnv::GetNumPhysicalCpuCores() const {
|
||||
// EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option.
|
||||
#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH)
|
||||
#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID)
|
||||
// The following code is a temporary fix for a perf problem on Intel's Meteor Lake CPUs. The Intel compute platform has
|
||||
// a hybrid architecture in which some CPU cores run significantly slower than the others. If we distribute our compute work
|
||||
// evenly to all CPU cores, the slowest CPU core will drag the performance down. So, instead, we reduce the total number
|
||||
|
|
|
@ -15,7 +15,7 @@ struct LogicalProcessorInformation {
|
|||
|
||||
struct CoreCounter {
|
||||
uint32_t PhysicalCores = 0;
|
||||
uint32_t SocDieCores = 0;
|
||||
uint32_t LLCCores = 0;
|
||||
};
|
||||
|
||||
static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) {
|
||||
|
@ -42,7 +42,7 @@ uint32_t CountSetBits(DWORD input) {
|
|||
return c;
|
||||
}
|
||||
|
||||
static CoreCounter GetNumberOPhysicalAndEngineeringCores() {
|
||||
static CoreCounter GetCoreInfo() {
|
||||
auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll);
|
||||
|
||||
CoreCounter cores;
|
||||
|
@ -73,17 +73,18 @@ static CoreCounter GetNumberOPhysicalAndEngineeringCores() {
|
|||
|
||||
read += currentProcessorInfo->Size;
|
||||
}
|
||||
// Cores with L2 and LLC cache levels = # Physical Cores - # logical cores without LLC
|
||||
cores.LLCCores = cores.PhysicalCores - CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask);
|
||||
|
||||
cores.SocDieCores = CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask);
|
||||
return cores;
|
||||
}
|
||||
|
||||
uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
|
||||
// # of physical cores = # of P cores + # of E Cores + # of Soc Cores.
|
||||
// # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores.
|
||||
auto cores = GetNumberOPhysicalAndEngineeringCores();
|
||||
// We want to use the number of physical cores, but exclude soc cores
|
||||
return cores.PhysicalCores - cores.SocDieCores;
|
||||
auto cores = GetCoreInfo();
|
||||
|
||||
return cores.LLCCores;
|
||||
}
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
|
@ -14,7 +14,7 @@ struct LogicalProcessorInformation {
|
|||
|
||||
struct CoreCounter {
|
||||
uint32_t PhysicalCores = 0;
|
||||
uint32_t Num2CacheCores = 0;
|
||||
uint32_t LLCCores = 0;
|
||||
};
|
||||
|
||||
static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) {
|
||||
|
@ -42,7 +42,7 @@ uint32_t CountSetBits(DWORD input) {
|
|||
return c;
|
||||
}
|
||||
|
||||
static CoreCounter GetNumberOPhysicalAndEngineeringCores() {
|
||||
static CoreCounter GetCoreInfo() {
|
||||
auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll);
|
||||
|
||||
CoreCounter cores;
|
||||
|
@ -64,6 +64,7 @@ static CoreCounter GetNumberOPhysicalAndEngineeringCores() {
|
|||
cores.PhysicalCores++;
|
||||
break;
|
||||
case RelationCache:
|
||||
// Cache level masks count logical processors
|
||||
if (currentProcessorInfo->Cache.Level == 2) {
|
||||
dwLevel2GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask;
|
||||
} else if (currentProcessorInfo->Cache.Level == 3) {
|
||||
|
@ -75,14 +76,15 @@ static CoreCounter GetNumberOPhysicalAndEngineeringCores() {
|
|||
read += currentProcessorInfo->Size;
|
||||
}
|
||||
|
||||
cores.Num2CacheCores = CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask);
|
||||
cores.LLCCores = cores.PhysicalCores - CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask);
|
||||
|
||||
return cores;
|
||||
}
|
||||
|
||||
uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
|
||||
// # of physical cores = # of P cores + # of E Cores + # of Soc Cores.
|
||||
// # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores.
|
||||
auto cores = GetNumberOPhysicalAndEngineeringCores();
|
||||
auto cores = GetCoreInfo();
|
||||
|
||||
#if !defined(_M_ARM64EC) && !defined(_M_ARM64) && !defined(__aarch64__)
|
||||
const int kVendorID_Intel[3] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI"
|
||||
|
@ -97,9 +99,8 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
|
|||
auto isHybrid = (regs_leaf7[3] & (1 << 15));
|
||||
|
||||
if (isIntel && isHybrid) {
|
||||
// We want to use the number of physical cores, but exclude soc cores
|
||||
// On Intel Hybrid processors, numSocCores == cores.Num2CacheCores
|
||||
return cores.PhysicalCores - cores.Num2CacheCores;
|
||||
// We want to use the number of physical cores, but exclude cores without an LLC
|
||||
return cores.LLCCores;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче