Printing new information from hardware (cpu, gpu) in cntk and tests. Also, extracting info from baseline files.

Signed-off-by: Ivan Rodriguez <t-ivrodr@microsoft.com>
This commit is contained in:
Ivan Rodriguez 2016-06-01 14:04:42 +02:00
Родитель d412ea0aaf
Коммит 404896a847
5 изменённых файлов: 246 добавлений и 2 удалений

Просмотреть файл

@ -370,6 +370,27 @@ void PrintUsageInfo()
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
void PrintGpuInfo()
{
std::vector<GpuData> gpusData = GetGpusData();
if (gpusData.empty())
{
return;
}
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "GPU info: \n\n");
for (GpuData data : gpusData)
{
LOGPRINTF(stderr, "\t\tDevice ID: %d\n", data.deviceId);
LOGPRINTF(stderr, "\t\tGPU Compute Capability: %d.%d\n", data.major, data.minor);
LOGPRINTF(stderr, "\t\tCUDA cores: %d\n\n", data.cudaCores);
}
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
// ---------------------------------------------------------------------------
// main() for use with BrainScript
// ---------------------------------------------------------------------------
@ -499,6 +520,9 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
// echo config info to log
PrintBuiltInfo();
//echo gpu info to log
PrintGpuInfo();
// execute the actions
// std::string type = config(L"precision", "float");
int numCPUThreads = config(L"numCPUThreads", 0);
@ -599,6 +623,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
}
PrintBuiltInfo(); // this one goes to log file
PrintGpuInfo();
std::string timestamp = TimeDateStamp();
// dump config info

Просмотреть файл

@ -45,7 +45,6 @@ int bestGPUDummy = 42; // put something into this CPP, as to avoid a linker warn
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include <memory>
#include "CrossProcessMutex.h"
@ -122,6 +121,8 @@ public:
static const int AllDevices = -1; // can be used to specify all GPUs in GetDevices() call
static const int RequeryDevices = -2; // Requery refreshing statistics and picking the same number as last query
std::vector<int> GetDevices(int number = AllDevices, BestGpuFlags flags = bestGpuNormal); // get multiple devices
std::vector<ProcessorData *> GetProcessorData();
private:
bool LockDevice(int deviceId, bool trial = true);
};
@ -527,6 +528,33 @@ std::vector<int> BestGpu::GetDevices(int number, BestGpuFlags p_bestFlags)
return best; // return the array of the best GPUs
}
std::vector<GpuData> GetGpusData()
{
std::vector<GpuData> data;
auto bestGpu = make_unique<BestGpu>();
std::vector<ProcessorData*> processorData = bestGpu->GetProcessorData();
if (!processorData.empty())
{
for (ProcessorData* pd : processorData)
{
GpuData gpuData;
gpuData.major = pd->deviceProp.major;
gpuData.minor = pd->deviceProp.minor;
gpuData.cudaCores = pd->cores;
gpuData.deviceId = pd->deviceId;
data.push_back(gpuData);
}
}
return std::move(data);
}
// GetProcessorData - return, by value, the vector of ProcessorData pointers held in m_procData.
// The vector itself is copied; the ProcessorData objects the pointers refer to are not.
std::vector<ProcessorData*> BestGpu::GetProcessorData(){
    return m_procData;
}
// QueryNvmlData - Query data from the Nvidia Management Library, and accumulate counters,
// In case failure, this function simply backs out without filling in the data structure and without setting m_nvmlData.
void BestGpu::QueryNvmlData()

Просмотреть файл

@ -8,15 +8,27 @@
// #define CPUONLY // #define this to build without GPU support nor needing the SDK installed
#include "CommonMatrix.h"
#include <vector>
// define IConfigRecord and ConfigParameters as incomplete types, in order to avoid having to include "ScriptableObjects.h" and "Config.h", as that confuses some .CU code
namespace Microsoft { namespace MSR { namespace ScriptableObjects { struct IConfigRecord; }}}
namespace Microsoft { namespace MSR { namespace CNTK {
#ifndef CPUONLY
// GpuData - summary of a single GPU device, as filled in by GetGpusData()
struct GpuData{
    int major;     // major part of the compute capability (taken from deviceProp.major)
    int minor;     // minor part of the compute capability (taken from deviceProp.minor)
    int deviceId;  // id of the device this record describes
    int cudaCores; // number of CUDA cores reported for the device
};
std::vector<GpuData> GetGpusData();
class ConfigParameters;
DEVICEID_TYPE DeviceFromConfig(const ConfigParameters& config);
DEVICEID_TYPE DeviceFromConfig(const ScriptableObjects::IConfigRecord& config);
#else
template <class ConfigRecordType>
static inline DEVICEID_TYPE DeviceFromConfig(const ConfigRecordType& /*config*/)

Просмотреть файл

@ -0,0 +1,149 @@
#!/usr/bin/env python
# ----------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
# This script collects metrics (training/test results, build hash, hardware info) from the baseline files of end-to-end CNTK tests
import sys, os, traceback, subprocess, random, re, time, stat
try:
import six
except ImportError:
print("Python package 'six' not installed. Please run 'pip install six'.")
sys.exit(1)
# Directory containing this script (with symlinks resolved); used as the root of the example scan
thisDir = os.path.dirname(os.path.realpath(__file__))
# True when the OS environment variable equals "Windows_NT"; switches on the path conversion in cygpath()
windows = os.getenv("OS")=="Windows_NT"
# Converts a path into the form used under Cygwin.
#   path     - the path to convert
#   relative - when True, a leading drive letter is NOT rewritten to /cygdrive/<letter>
# When not running on Windows, or when the path already starts with '/', the path is returned untouched.
# Otherwise the path is made absolute, optionally gets its "X:" drive prefix replaced by
# "/cygdrive/X", and has every backslash turned into a forward slash.
def cygpath(path, relative=False):
    if not windows or path.startswith('/'):
        return path

    converted = os.path.abspath(path)
    if not relative and converted[1] == ':':  # Windows drive
        converted = '/cygdrive/' + converted[0] + converted[2:]
    return converted.replace('\\', '/')
# Describes a single example: the suite it belongs to, its name and the directory it lives in
class Example:
    # Registry of discovered examples: lower-cased "Suite/ExampleName" => instance of Example
    allExamplesIndexedByFullName = {}

    def __init__(self, suite, name, testDir):
        self.suite = suite
        self.name = name
        self.fullName = suite + "/" + name
        self.testDir = testDir
        self.testResult = ""
        self.trainResult = ""

    # Populates Example.allExamplesIndexedByFullName by walking the directory tree below thisDir.
    # Every directory that contains a 'testcases.yml' file is registered as an example; the example
    # name is the directory name and the suite name is the parent path relative to thisDir
    # (with forward slashes).
    @staticmethod
    def discoverAllExamples():
        rootDir = thisDir
        for currentDir, _subdirs, fileNames in os.walk(rootDir):
            if 'testcases.yml' not in fileNames:
                continue
            parentDir = os.path.dirname(currentDir)
            # suite name will be derived from the path components
            suite = os.path.relpath(parentDir, rootDir).replace('\\', '/')
            # (a filter on suiteName.startswith("Examples") is currently disabled)
            found = Example(suite, os.path.basename(currentDir), currentDir)
            Example.allExamplesIndexedByFullName[found.fullName.lower()] = found

    # Returns the list of all baseline files that exist in this example's directory.
    # Candidate names have the form baseline[.$os][.$flavor][.$device].txt, where each of the
    # optional parts may be absent. Candidates are probed with the OS varying slowest, then the
    # device, then the flavor; the returned paths keep that probing order.
    def findBaselineFilesList(self):
        oses = [".windows", ".linux", ""]
        devices = [".cpu", ".gpu", ""]
        flavors = [".debug", ".release", ""]

        candidateNames = [
            "baseline" + o + flavor + device + ".txt"
            for o in oses
            for device in devices
            for flavor in flavors
        ]
        candidatePaths = (
            cygpath(os.path.join(self.testDir, candidateName), relative=True)
            for candidateName in candidateNames
        )
        return [fullPath for fullPath in candidatePaths if os.path.isfile(fullPath)]
# Formats one 'Final Results' match (a tuple of regex groups) for printing.
#   line[0] - the "Final Results: Minibatch[1-N]:" header, including the optional " * <count>" suffix
#   line[1] - that optional suffix on its own; it is a capture group nested inside line[0]
#   line[2] - the metrics text that follows the header
# Returns the header on the first line followed by the metrics, split onto separate lines.
def getLastTestResult(line):
    # line[1] is intentionally not appended: it is already contained in line[0], so adding it
    # again would print the sample count twice.
    header, metrics = line[0], line[2]
    return header + "\n" + metrics.replace('; ', '\n').replace(' ','\n')
# Formats one 'Finished Epoch' match (a tuple of regex groups) for printing.
#   line[0] - the epoch header, line[1] - the '; '-separated training parameters
# Returns the header, a "[Training]" line, and then one parameter per line.
def getLastTrainResult(line):
    parametersPerLine = line[1].replace('; ', '\n')
    return "\n[Training]\n".join([line[0], parametersPerLine])
# Prints, for every discovered example and each of its baseline files, the metrics found in the
# baseline: the last 'Finished Epoch' training line, the last 'Final Results' line, the build
# hash, the "Hardware info:" block and the "GPU info:" block.
# Baselines without any 'Finished Epoch' line are skipped entirely.
# Side effect: Example.allExamplesIndexedByFullName is replaced by a list of the examples sorted
# by full name, so this function is meant to be called once after discovery.
def runCommand():
    Example.allExamplesIndexedByFullName = list(sorted(Example.allExamplesIndexedByFullName.values(), key=lambda test: test.fullName))
    allExamples = Example.allExamplesIndexedByFullName

    print ("CNTK - Metrics collector")
    six.print_("Getting examples: " + " ".join([y.fullName for y in allExamples]))

    for example in allExamples:
        for baseline in example.findBaselineFilesList():
            with open(baseline, "r") as f:
                baselineContent = f.read()

            # Patterns are raw strings so that \d, \s, \[ ... reach the regex engine verbatim.
            trainResults = re.findall(r'.*(Finished Epoch\[[ ]*\d+ of \d+\]\:) \[Training\] (.*)', baselineContent, re.MULTILINE)
            testResults = re.findall(r'.*(Final Results: Minibatch\[1-\d+\]:(\s+\* \d+|))\s+(.*)', baselineContent, re.MULTILINE)
            if not trainResults:
                continue

            six.print_("==============================================================================")
            six.print_("Suite Name " + example.suite)
            six.print_("Example " + example.name)
            six.print_("Baseline: " + baseline + "\n")
            # only the last reported epoch / final result is of interest
            six.print_(getLastTrainResult(trainResults[-1]))
            six.print_("")
            if testResults:
                six.print_(getLastTestResult(testResults[-1]))

            gitHash = re.search(r'.*Build SHA1:\s([a-z0-9]{40})\s', baselineContent)
            if gitHash is not None:
                six.print_("\nBuild Hash: ")
                six.print_(gitHash.group(1))

            # The labels (including the "Mame" spelling) must match what the test scripts echo
            # into the baseline, so they are matched exactly as written there.
            hardwareInfo = re.search(r".*Hardware info:\s+"
                                     r"CPU Model Mame:\s*(.*)\s+"
                                     r"CPU cores:\s*(.*)\s+"
                                     r"Hardware threads: (\d+)\s+"
                                     r"Total Memory:\s*(.*)\s+"
                                     r"GPU Model Name: (.*)?\s+"
                                     r"GPU Memory: (.*)?", baselineContent)
            if hardwareInfo is not None:
                # group(n) yields the n-th captured string; groups(n) would return a tuple
                six.print_("Hardware information information: ")
                six.print_("CPU model " + hardwareInfo.group(1))
                six.print_("CPU cores " + hardwareInfo.group(2))
                six.print_("Hardware threads: " + hardwareInfo.group(3))
                six.print_("Total memory: " + hardwareInfo.group(4))
                six.print_("GPU name: " + hardwareInfo.group(5))
                six.print_("GPU memory: " + hardwareInfo.group(6))

            # CNTK logs the capability line as "GPU Compute Capability: <major>.<minor>";
            # the "GPU " prefix is optional here so both spellings are accepted.
            gpuInfo = re.search(r".*GPU info:\s+Device ID: (\d+)\s+"
                                r"(?:GPU )?Compute Capability: (\d+\.\d+)\s+"
                                r"CUDA cores: (\d+)", baselineContent)
            if gpuInfo is not None:
                six.print_("Additional GPU information: ")
                six.print_("Device ID: " + gpuInfo.group(1))
                six.print_("Compute capability: " + gpuInfo.group(2))
                six.print_("CUDA cores: " + gpuInfo.group(3))
            six.print_("==============================================================================")
# ======================= Entry point =======================
# First register every example found below this script's directory,
# then print the metrics extracted from their baseline files.
Example.discoverAllExamples()
runCommand()

Просмотреть файл

@ -26,6 +26,33 @@ MPIArgs=
DeleteExistingModels=1
DeleteModelsAfterTest=1
# Prints a "Hardware info:" report to stdout: CPU model name and core count (from /proc/cpuinfo),
# number of hardware threads (nproc), total memory (from /proc/meminfo) and, when nvidia-smi is
# available, the GPU model name and GPU memory.
# The labels are reproduced exactly as consumers of this output expect them (including the
# "Mame" spelling), so do not change them without updating those consumers as well.
# Note: the variables set here are intentionally left global, as before.
printHardwareInfo()
{
  cpuName=$(grep -m 1 'model name' /proc/cpuinfo | cut -d ":" -f 2 | tr -s " ")
  cpuCores=$(grep -m 1 'cpu cores' /proc/cpuinfo | cut -d ":" -f 2 | tr -s " ")
  totalMemory=$(grep 'MemTotal' /proc/meminfo | cut -d ":" -f 2 | tr -s " ")
  nproc=$(nproc)

  # Start from empty values so the report never shows stale GPU data when nvidia-smi is missing.
  gpuName=
  gpuMem=

  nvidiaSmiPath="nvidia-smi"
  if [ "$OS" == "Windows_NT" ]; then
    nvidiaSmiPath="/cygdrive/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi"
  fi

  # A bare command name has to be resolved through PATH: '-f' alone only looks for a file with
  # that name in the current directory and would therefore never find nvidia-smi on Linux.
  if [ -f "$nvidiaSmiPath" ] || command -v "$nvidiaSmiPath" >/dev/null 2>&1; then
    gpuName=$("$nvidiaSmiPath" --query-gpu=gpu_name --format=csv,noheader)
    gpuMem=$("$nvidiaSmiPath" --query-gpu=memory.total --format=csv,noheader)
  fi

  echo "Hardware info:
CPU Model Mame:$cpuName
CPU cores:$cpuCores
Hardware threads: $nproc
Total Memory:$totalMemory
GPU Model Name: $gpuName
GPU Memory: $gpuMem
"
}
# Helper function to print and run a command
run()
{
@ -119,3 +146,5 @@ cntkmpirun()
cntkrun "$2" "$3"
return $?
}
# Emit the hardware report at top level, i.e. whenever this script is executed or sourced
printHardwareInfo