Print new hardware information (CPU, GPU) in CNTK and in the tests. Also extract that information from the baseline files.
Signed-off-by: Ivan Rodriguez <t-ivrodr@microsoft.com>
This commit is contained in:
Родитель
d412ea0aaf
Коммит
404896a847
|
@ -370,6 +370,27 @@ void PrintUsageInfo()
|
|||
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
|
||||
}
|
||||
|
||||
void PrintGpuInfo()
|
||||
{
|
||||
std::vector<GpuData> gpusData = GetGpusData();
|
||||
|
||||
if (gpusData.empty())
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
|
||||
LOGPRINTF(stderr, "GPU info: \n\n");
|
||||
|
||||
for (GpuData data : gpusData)
|
||||
{
|
||||
LOGPRINTF(stderr, "\t\tDevice ID: %d\n", data.deviceId);
|
||||
LOGPRINTF(stderr, "\t\tGPU Compute Capability: %d.%d\n", data.major, data.minor);
|
||||
LOGPRINTF(stderr, "\t\tCUDA cores: %d\n\n", data.cudaCores);
|
||||
}
|
||||
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// main() for use with BrainScript
|
||||
// ---------------------------------------------------------------------------
|
||||
|
@ -499,6 +520,9 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
|
|||
// echo config info to log
|
||||
PrintBuiltInfo();
|
||||
|
||||
// echo GPU info to log
|
||||
PrintGpuInfo();
|
||||
|
||||
// execute the actions
|
||||
// std::string type = config(L"precision", "float");
|
||||
int numCPUThreads = config(L"numCPUThreads", 0);
|
||||
|
@ -599,6 +623,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
|
|||
}
|
||||
|
||||
PrintBuiltInfo(); // this one goes to log file
|
||||
PrintGpuInfo();
|
||||
|
||||
std::string timestamp = TimeDateStamp();
|
||||
|
||||
// dump config info
|
||||
|
|
|
@ -45,7 +45,6 @@ int bestGPUDummy = 42; // put something into this CPP, as to avoid a linker warn
|
|||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <algorithm>
|
||||
|
||||
#include <memory>
|
||||
#include "CrossProcessMutex.h"
|
||||
|
||||
|
@ -122,6 +121,8 @@ public:
|
|||
static const int AllDevices = -1; // can be used to specify all GPUs in GetDevices() call
|
||||
static const int RequeryDevices = -2; // Requery refreshing statistics and picking the same number as last query
|
||||
std::vector<int> GetDevices(int number = AllDevices, BestGpuFlags flags = bestGpuNormal); // get multiple devices
|
||||
std::vector<ProcessorData *> GetProcessorData();
|
||||
|
||||
private:
|
||||
bool LockDevice(int deviceId, bool trial = true);
|
||||
};
|
||||
|
@ -527,6 +528,33 @@ std::vector<int> BestGpu::GetDevices(int number, BestGpuFlags p_bestFlags)
|
|||
return best; // return the array of the best GPUs
|
||||
}
|
||||
|
||||
std::vector<GpuData> GetGpusData()
|
||||
{
|
||||
std::vector<GpuData> data;
|
||||
|
||||
auto bestGpu = make_unique<BestGpu>();
|
||||
|
||||
std::vector<ProcessorData*> processorData = bestGpu->GetProcessorData();
|
||||
if (!processorData.empty())
|
||||
{
|
||||
for (ProcessorData* pd : processorData)
|
||||
{
|
||||
GpuData gpuData;
|
||||
gpuData.major = pd->deviceProp.major;
|
||||
gpuData.minor = pd->deviceProp.minor;
|
||||
gpuData.cudaCores = pd->cores;
|
||||
gpuData.deviceId = pd->deviceId;
|
||||
data.push_back(gpuData);
|
||||
}
|
||||
}
|
||||
|
||||
return std::move(data);
|
||||
}
|
||||
|
||||
std::vector<ProcessorData*> BestGpu::GetProcessorData(){
|
||||
return m_procData;
|
||||
}
|
||||
|
||||
// QueryNvmlData - Query data from the Nvidia Management Library, and accumulate counters,
|
||||
// In case failure, this function simply backs out without filling in the data structure and without setting m_nvmlData.
|
||||
void BestGpu::QueryNvmlData()
|
||||
|
|
|
@ -8,15 +8,27 @@
|
|||
// #define CPUONLY // #define this to build without GPU support nor needing the SDK installed
|
||||
#include "CommonMatrix.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
// define IConfigRecord and ConfigParameters as incomplete types, in order to avoid having to include "ScriptableObjects.h" and "Config.h", as that confuses some .CU code
|
||||
namespace Microsoft { namespace MSR { namespace ScriptableObjects { struct IConfigRecord; }}}
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
#ifndef CPUONLY
|
||||
|
||||
// Plain summary of one GPU, as filled in by GetGpusData() and written to the log.
// Member order is part of the layout (aggregate initialization) - do not reorder.
struct GpuData
{
    int major;     // compute capability, major part (logged as "major.minor")
    int minor;     // compute capability, minor part
    int deviceId;  // device id of the GPU
    int cudaCores; // number of CUDA cores of the device
};
|
||||
|
||||
std::vector<GpuData> GetGpusData();
|
||||
|
||||
class ConfigParameters;
|
||||
DEVICEID_TYPE DeviceFromConfig(const ConfigParameters& config);
|
||||
DEVICEID_TYPE DeviceFromConfig(const ScriptableObjects::IConfigRecord& config);
|
||||
|
||||
#else
|
||||
template <class ConfigRecordType>
|
||||
static inline DEVICEID_TYPE DeviceFromConfig(const ConfigRecordType& /*config*/)
|
||||
|
|
|
@ -0,0 +1,149 @@
|
|||
#!/usr/bin/env python
|
||||
# ----------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# ---------------------------------------------------------
|
||||
# This script collects metrics (last training/test results, build hash and hardware info) from the baseline files of the end-to-end CNTK tests
|
||||
|
||||
import sys, os, traceback, subprocess, random, re, time, stat
|
||||
|
||||
try:
|
||||
import six
|
||||
except ImportError:
|
||||
print("Python package 'six' not installed. Please run 'pip install six'.")
|
||||
sys.exit(1)
|
||||
|
||||
thisDir = os.path.dirname(os.path.realpath(__file__))
|
||||
windows = os.getenv("OS")=="Windows_NT"
|
||||
|
||||
def cygpath(path, relative=False):
    """Convert a path to the forward-slash form used under Cygwin.

    On non-Windows hosts, and for paths that already start with '/', the
    path is returned unchanged. Otherwise the path is made absolute and its
    backslashes are replaced by forward slashes. Unless `relative` is true,
    a leading drive specifier such as ``C:`` is additionally rewritten to
    ``/cygdrive/C``.
    """
    if not windows or path.startswith('/'):
        return path

    absolute = os.path.abspath(path)
    if not relative and absolute[1] == ':':  # Windows drive
        absolute = '/cygdrive/' + absolute[0] + absolute[2:]

    return absolute.replace('\\', '/')
|
||||
|
||||
# This class encapsulates an instance of the example
|
||||
class Example:
    """A single example: a directory under the tests tree that contains 'testcases.yml'.

    Attributes:
      suite       -- suite name (path of the parent directory relative to the tests root)
      name        -- example name (base name of the example directory)
      fullName    -- suite + "/" + name
      testDir     -- directory that holds the example and its baseline files
      testResult  -- initialised to "" (not written anywhere else in this class)
      trainResult -- initialised to "" (not written anywhere else in this class)
    """

    # lower-cased "Suite/ExampleName" => instance of Example
    allExamplesIndexedByFullName = {}

    def __init__(self, suite, name, testDir):
        self.suite = suite
        self.name = name
        self.fullName = suite + "/" + name
        self.testDir = testDir
        self.testResult = ""
        self.trainResult = ""

    @staticmethod
    def discoverAllExamples():
        """Populate Example.allExamplesIndexedByFullName by scanning the directory tree below thisDir."""
        root = thisDir
        for currentDir, _subdirs, files in os.walk(root):
            if 'testcases.yml' not in files:
                continue

            # suite name is derived from the path components between the root and the example
            suitePath = os.path.relpath(os.path.dirname(currentDir), root).replace('\\', '/')

            #if suiteName.startswith("Examples"):
            found = Example(suitePath, os.path.basename(currentDir), currentDir)
            Example.allExamplesIndexedByFullName[found.fullName.lower()] = found

    def findBaselineFilesList(self):
        """Return the paths of all baseline files that exist in this example's directory.

        Every name of the form baseline[.$os][.$flavor][.$device].txt is probed, i.e.
          baseline.$os.$flavor.$device.txt
          baseline.$os.$flavor.txt
          baseline.$os.$device.txt
          baseline.$os.txt
          baseline.$flavor.$device.txt
          baseline.$flavor.txt
          baseline.$device.txt
          baseline.txt
        with $os in (windows, linux), $flavor in (debug, release) and $device in (cpu, gpu).
        Unlike a "first match wins" lookup, every existing file is returned; results are
        ordered by os, then device, then flavor.
        """
        osSuffixes = [".windows", ".linux", ""]
        deviceSuffixes = [".cpu", ".gpu", ""]
        flavorSuffixes = [".debug", ".release", ""]

        candidateNames = ["baseline" + osSuffix + flavorSuffix + deviceSuffix + ".txt"
                          for osSuffix in osSuffixes
                          for deviceSuffix in deviceSuffixes
                          for flavorSuffix in flavorSuffixes]
        candidatePaths = [cygpath(os.path.join(self.testDir, candidate), relative=True)
                          for candidate in candidateNames]

        return [candidatePath for candidatePath in candidatePaths if os.path.isfile(candidatePath)]
|
||||
|
||||
def getLastTestResult(line):
    """Format one "Final Results" regex match (a 3-item sequence of strings) for printing.

    The first two items are concatenated to form the heading line; in the third
    item every '; ' and every remaining ' ' is turned into a line break.
    NOTE(review): with the pattern used by runCommand() the second group is nested
    inside the first, so its text shows up twice in the heading - confirm intent.
    """
    heading = line[0] + line[1]
    details = line[2].replace('; ', '\n').replace(' ','\n')
    return heading + "\n" + details
|
||||
|
||||
def getLastTrainResult(line):
    """Format one "Finished Epoch" regex match for printing.

    `line[0]` is the epoch header and `line[1]` the '; '-separated parameters.
    The result is the header, a "[Training]" line, and then one parameter per line.
    """
    header = line[0]
    parameterLines = line[1].replace('; ', '\n')
    return header + "\n[Training]\n" + parameterLines
|
||||
|
||||
def runCommand():
    """Print a metrics report for every baseline file of every discovered example.

    For each baseline file that contains at least one "Finished Epoch ... [Training]"
    line, a report framed by separator lines is printed with:
      * suite name, example name and baseline path,
      * the last training result and, if present, the last "Final Results" test result,
      * the build hash ("Build SHA1"), if present,
      * the "Hardware info:" section written by the test scripts, if present,
      * the "GPU info:" section written by CNTK, if present (first device only).
    Baseline files without any training result are skipped silently.
    """
    # Sort into a local list. The class-level registry is left untouched so that it
    # stays a dict and this function can be called more than once.
    allExamples = sorted(Example.allExamplesIndexedByFullName.values(), key=lambda example: example.fullName)

    print ("CNTK - Metrics collector")
    six.print_("Getting examples: " + " ".join([y.fullName for y in allExamples]))

    separator = "=============================================================================="

    # Raw strings keep the regex escapes (\d, \[, \s ...) from being treated as
    # (invalid) Python string escapes.
    trainPattern = r'.*(Finished Epoch\[[ ]*\d+ of \d+\]\:) \[Training\] (.*)'
    testPattern = r'.*(Final Results: Minibatch\[1-\d+\]:(\s+\* \d+|))\s+(.*)'
    hashPattern = r'.*Build SHA1:\s([a-z0-9]{40})\s'
    # "Mame" is the spelling emitted by the test scripts; "Name" is accepted as well.
    hardwarePattern = (r".*Hardware info:\s+"
                       r"CPU Model [MN]ame:\s*(.*)\s+"
                       r"CPU cores:\s*(.*)\s+"
                       r"Hardware threads: (\d+)\s+"
                       r"Total Memory:\s*(.*)\s+"
                       r"GPU Model Name: (.*)?\s+"
                       r"GPU Memory: (.*)?")
    # CNTK logs the line as "GPU Compute Capability: x.y"; the "GPU " prefix is optional
    # here so that the previously expected form is still accepted.
    # NOTE(review): if the logger prefixes each line (e.g. with a timestamp), the \s+
    # between the fields will not match - confirm against a real baseline file.
    gpuPattern = (r".*GPU info:\s+Device ID: (\d+)\s+"
                  r"(?:GPU )?Compute Capability: (\d\.\d)\s+"
                  r"CUDA cores: (\d+)")

    for example in allExamples:
        for baseline in example.findBaselineFilesList():
            with open(baseline, "r") as f:
                baselineContent = f.read()

            trainResults = re.findall(trainPattern, baselineContent, re.MULTILINE)
            if not trainResults:
                continue
            testResults = re.findall(testPattern, baselineContent, re.MULTILINE)

            six.print_(separator)
            six.print_("Suite Name " + example.suite)
            six.print_("Example " + example.name)
            six.print_("Baseline: " + baseline + "\n")
            six.print_(getLastTrainResult(trainResults[-1]))
            six.print_("")
            if testResults:
                six.print_(getLastTestResult(testResults[-1]))

            gitHash = re.search(hashPattern, baselineContent)
            if gitHash is not None:
                six.print_("\nBuild Hash: ")
                six.print_(gitHash.group(1))

            hardwareInfo = re.search(hardwarePattern, baselineContent)
            if hardwareInfo is not None:
                # group(n) yields the n-th captured string; groups(n) would return a tuple.
                # strip() drops a trailing '\r' picked up from CRLF baseline files.
                six.print_("Hardware information: ")
                six.print_("CPU model " + hardwareInfo.group(1).strip())
                six.print_("CPU cores " + hardwareInfo.group(2).strip())
                six.print_("Hardware threads: " + hardwareInfo.group(3))
                six.print_("Total memory: " + hardwareInfo.group(4).strip())
                six.print_("GPU name: " + hardwareInfo.group(5).strip())
                six.print_("GPU memory: " + hardwareInfo.group(6).strip())

            gpuInfo = re.search(gpuPattern, baselineContent)
            if gpuInfo is not None:
                six.print_("Additional GPU information: ")
                six.print_("Device ID: " + gpuInfo.group(1))
                six.print_("Compute capability: " + gpuInfo.group(2))
                six.print_("CUDA cores: " + gpuInfo.group(3))

            six.print_(separator)
|
||||
|
||||
# ======================= Entry point =======================
|
||||
# discover all the tests
|
||||
Example.discoverAllExamples()
|
||||
|
||||
runCommand()
|
|
@ -26,6 +26,33 @@ MPIArgs=
|
|||
DeleteExistingModels=1
|
||||
DeleteModelsAfterTest=1
|
||||
|
||||
# Prints a "Hardware info:" section to stdout: CPU model name, CPU cores,
# hardware threads (nproc), total memory and - when nvidia-smi can be found -
# GPU model name and GPU memory. CPU and memory data are read from
# /proc/cpuinfo and /proc/meminfo. The variables are deliberately left global,
# as before.
printHardwareInfo()
{
  cpuName=$(grep -m 1 'model name' /proc/cpuinfo | cut -d ":" -f 2 | tr -s " ")
  cpuCores=$(grep -m 1 'cpu cores' /proc/cpuinfo | cut -d ":" -f 2 | tr -s " ")
  totalMemory=$(grep 'MemTotal' /proc/meminfo | cut -d ":" -f 2 | tr -s " ")
  nproc=$(nproc)
  nvidiaSmiPath="nvidia-smi"

  if [ "$OS" == "Windows_NT" ]; then
    nvidiaSmiPath="/cygdrive/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi"
  fi

  # Start from empty values so nothing inherited from the environment (or from
  # an earlier call) is reported when nvidia-smi is not available.
  gpuName=
  gpuMem=

  # "[ -f nvidia-smi ]" alone only looks in the current directory; "command -v"
  # additionally resolves the bare command name through PATH.
  if [ -f "$nvidiaSmiPath" ] || command -v "$nvidiaSmiPath" >/dev/null 2>&1; then
    gpuName=$("$nvidiaSmiPath" --query-gpu=gpu_name --format=csv,noheader)
    gpuMem=$("$nvidiaSmiPath" --query-gpu=memory.total --format=csv,noheader)
  fi

  # The labels below (including the "Mame" spelling) are matched verbatim by the
  # script that extracts this section from baseline files - keep them in sync.
  echo "Hardware info:
CPU Model Mame:$cpuName
CPU cores:$cpuCores
Hardware threads: $nproc
Total Memory:$totalMemory
GPU Model Name: $gpuName
GPU Memory: $gpuMem
"
}
|
||||
|
||||
# Helper function to print and run a command
|
||||
run()
|
||||
{
|
||||
|
@ -119,3 +146,5 @@ cntkmpirun()
|
|||
cntkrun "$2" "$3"
|
||||
return $?
|
||||
}
|
||||
|
||||
printHardwareInfo
|
||||
|
|
Загрузка…
Ссылка в новой задаче