CNTK/Common/BestGpu.cpp


//
// <copyright file="BestGPU.cpp" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#include "BestGpu.h"
#include "commandArgUtil.h" // for ConfigParameters
#ifndef CPUONLY
#pragma comment (lib, "cudart.lib")
#pragma comment (lib, "nvml.lib")
#include <cuda_runtime.h>
#include <nvml.h>
#include <vector>
#endif
#include "CommonMatrix.h" // for CPUDEVICE and AUTOPLACEMATRIX
#ifndef CPUONLY // #define this to disable GPUs
// CUDA-C includes
#include <cuda.h>
#include <windows.h>
#include <delayimp.h>
#include <Shlobj.h>
#include <stdio.h>
#ifdef MPI_SUPPORT
#include "mpi.h"
#endif
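// MPI rank of this process and total process count; defined by the hosting program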
extern int myRank;
extern int numProcs;
// ---------------------------------------------------------------------------
// BestGpu class
// ---------------------------------------------------------------------------
namespace Microsoft { namespace MSR { namespace CNTK {
struct ProcessorData
{
int cores;
nvmlMemory_t memory;
nvmlUtilization_t utilization;
cudaDeviceProp deviceProp;
size_t cudaFreeMem;
size_t cudaTotalMem;
bool dbnFound;
bool cnFound;
int deviceId; // the deviceId (cuda side) for this processor
};
enum BestGpuFlags
{
bestGpuNormal = 0,
bestGpuAvoidSharing = 1, // don't share with other known machine-learning apps (cn.exe/dbn.exe)
bestGpuFavorMemory = 2, // favor memory
bestGpuFavorUtilization = 4, // favor low utilization
bestGpuFavorSpeed = 8, // favor fastest processor
bestGpuRequery = 256, // rerun the last query, updating statistics
};
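// The flag values are distinct bits, so they can be combined, e.g.:
//   (BestGpuFlags)(bestGpuAvoidSharing | bestGpuFavorMemory)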
class BestGpu
{
private:
bool m_initialized; // initialized
bool m_nvmlData; // nvml Data is valid
bool m_cudaData; // cuda Data is valid
int m_deviceCount; // how many devices are available?
int m_queryCount; // how many times have we queried the usage counters?
BestGpuFlags m_lastFlags; // flag state at last query
int m_lastCount; // count of devices (with filtering of allowed Devices)
std::vector<ProcessorData*> m_procData;
int m_allowedDevices; // bitfield of allowed devices
void GetCudaProperties();
void GetNvmlData();
void QueryNvmlData();
public:
BestGpu() : m_initialized(false), m_nvmlData(false), m_cudaData(false), m_deviceCount(0), m_queryCount(0),
m_lastFlags(bestGpuNormal), m_lastCount(0), m_allowedDevices(-1)
{
Init();
}
~BestGpu();
void Init();
void SetAllowedDevices(const std::vector<int>& devices); // only allow certain GPUs
bool DeviceAllowed(int device);
void DisallowDevice(int device) { m_allowedDevices &= ~(1 << device); }
void AllowAll(); // reset to allow all GPUs (no allowed list)
bool UseMultiple(); // using multiple GPUs?
int GetDevice(BestGpuFlags flags = bestGpuNormal); // get a single device
static const int AllDevices = -1; // can be used to specify all GPUs in GetDevices() call
static const int RequeryDevices = -2; // Requery refreshing statistics and picking the same number as last query
std::vector<int> GetDevices(int number = AllDevices, BestGpuFlags flags = bestGpuNormal); // get multiple devices
};
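// Typical usage (an illustrative sketch, not a call site from this file):
//   BestGpu bg;                                   // enumerates devices via CUDA and NVML
//   int dev = bg.GetDevice(bestGpuAvoidSharing);  // best single device; -1 means CPU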
// DeviceFromConfig - Parse the 'deviceId' config parameter to determine what type of behavior is desired
// Symbol - Meaning
//  Auto  - automatically pick a single GPU based on the "BestGpu" score
//  CPU   - use the CPU
//  0     - or any other single number: use the single GPU with that CUDA ID
//  0:2:3 - an array of IDs to use (PTask will only use the specified IDs)
//  *3    - a count of GPUs to use (PTask)
//  All   - use all the GPUs (PTask)
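// Example config lines (illustrative; matching is case-insensitive):
//   deviceId=auto    // pick one GPU by score
//   deviceId=cpu     // force the CPU
//   deviceId=1       // use CUDA device 1
//   deviceId=0:2:3   // restrict the pick to CUDA devices 0, 2, and 3
//   deviceId=*2      // use the best two GPUs
//   deviceId=all     // use every GPU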
DEVICEID_TYPE DeviceFromConfig(const ConfigParameters& config)
{
static BestGpu* g_bestGpu = NULL;
DEVICEID_TYPE deviceId = CPUDEVICE;
ConfigValue val = config("deviceId", "auto");
if (!_stricmp(val.c_str(), "CPU"))
{
return CPUDEVICE;
}
// potential GPU device, so init our class
if (g_bestGpu == NULL)
{
g_bestGpu = new BestGpu();
}
if (!_stricmp(val.c_str(), "Auto"))
{
#ifdef MPI_SUPPORT
// make sure deviceId is unique among processes on the same machine
g_bestGpu->AllowAll();
std::string MyName(getenv("COMPUTERNAME"));
for (int i = 0; i < numProcs; i++)
{
DEVICEID_TYPE yourDeviceId = deviceId;
if (myRank == i)
{
std::vector<int> devices = g_bestGpu->GetDevices(1);
deviceId = yourDeviceId = (DEVICEID_TYPE)devices[0];
}
MPI_Bcast(&yourDeviceId, 1, MPI_INT, i, MPI_COMM_WORLD);
{
INT32 YourSize = (INT32)MyName.length();
MPI_Bcast(&YourSize, 1, MPI_INT, i, MPI_COMM_WORLD);
std::vector<char> YourName(YourSize + 1);
if (myRank == i)
std::copy(MyName.begin(), MyName.end(), YourName.begin());
MPI_Bcast(YourName.data(), YourSize + 1, MPI_CHAR, i, MPI_COMM_WORLD);
if (myRank != i)
{
if (!_strcmpi(MyName.data(), YourName.data()))
{
g_bestGpu->DisallowDevice(yourDeviceId);
}
}
}
}
#else
std::vector<int> devices = g_bestGpu->GetDevices(1);
deviceId = (DEVICEID_TYPE)devices[0];
#endif
}
else if (!_stricmp(val.c_str(), "All"))
{
std::vector<int> devices = g_bestGpu->GetDevices(BestGpu::AllDevices);
deviceId = (DEVICEID_TYPE)devices[0];
}
else if (val.size() == 2 && val[0] == '*' && isdigit(val[1]))
{
int number = (int)(val[1] - '0');
std::vector<int> devices = g_bestGpu->GetDevices(number);
deviceId = (DEVICEID_TYPE)devices[0];
}
else
{
ConfigArray arr = val;
if (arr.size() == 1)
{
deviceId = arr[0];
}
else
{
argvector<int> allowed = arr;
g_bestGpu->SetAllowedDevices(allowed);
std::vector<int> devices = g_bestGpu->GetDevices();
deviceId = (DEVICEID_TYPE)devices[0];
}
}
return deviceId;
}
// !!!!This is from helper_cuda.h which comes with CUDA samples!!!! Consider if it is beneficial to just include all helper_cuda.h
// TODO: This is duplicated in GPUMatrix.cu
// Beginning of GPU Architecture definitions
inline int _ConvertSMVer2Cores(int major, int minor)
{
// Defines for GPU architecture types (using the SM version to determine the # of cores per SM)
typedef struct
{
int SM; // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
int Cores;
} sSMtoCores;
sSMtoCores nGpuArchCoresPerSM[] =
{
{ 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class
{ 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class
{ 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class
{ 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class
{ 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
{ 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
{ 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
{ 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
{ -1, -1 }
};
int index = 0;
while (nGpuArchCoresPerSM[index].SM != -1)
{
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
{
return nGpuArchCoresPerSM[index].Cores;
}
index++;
}
return nGpuArchCoresPerSM[7].Cores; // unknown SM version: fall back to the newest known architecture
}
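// Example: compute capability 3.5 maps to 192 cores per SM, so a GPU with 14
// multiprocessors is counted as 192 * 14 = 2688 cores in GetCudaProperties below.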
void BestGpu::GetCudaProperties()
{
if (m_cudaData)
return;
int dev = 0;
for (ProcessorData* pd : m_procData)
{
cudaSetDevice(dev);
pd->deviceId = dev;
cudaGetDeviceProperties(&pd->deviceProp, dev);
size_t free;
size_t total;
cudaMemGetInfo(&free, &total);
pd->cores = _ConvertSMVer2Cores(pd->deviceProp.major, pd->deviceProp.minor) * pd->deviceProp.multiProcessorCount;
pd->cudaFreeMem = free;
pd->cudaTotalMem = total;
dev++;
cudaDeviceReset();
}
m_cudaData = m_procData.size() > 0;
}
void BestGpu::Init()
{
if (m_initialized)
return;
// get the count of CUDA devices (return code ignored; m_deviceCount starts at zero)
cudaGetDeviceCount(&m_deviceCount);
ProcessorData pdEmpty = { 0 };
for (int i = 0; i < m_deviceCount; i++)
{
ProcessorData* data = new ProcessorData();
*data = pdEmpty;
m_procData.push_back(data);
}
if (m_deviceCount > 0)
{
GetCudaProperties();
GetNvmlData();
}
m_initialized = true;
}
BestGpu::~BestGpu()
{
for (ProcessorData* data : m_procData)
{
delete data;
}
m_procData.clear();
if (m_nvmlData)
{
nvmlShutdown();
}
}
// GetNvmlData - Get data from the Nvidia Management Library
void BestGpu::GetNvmlData()
{
// if we already did this, or we couldn't initialize the CUDA data, skip it
if (m_nvmlData || !m_cudaData)
return;
// First initialize NVML library
nvmlReturn_t result = nvmlInit();
if (NVML_SUCCESS != result)
{
return;
}
QueryNvmlData();
}
// GetDevice - Determine the best device ID to use
// bestFlags - flags that modify how the score is calculated
int BestGpu::GetDevice(BestGpuFlags bestFlags)
{
std::vector<int> best = GetDevices(1, bestFlags);
return best[0];
}
// SetAllowedDevices - set the allowed devices array up
// devices - vector of allowed devices
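// e.g. devices {0, 2} sets m_allowedDevices to 0x5 (bits 0 and 2)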
void BestGpu::SetAllowedDevices(const std::vector<int>& devices)
{
m_allowedDevices = 0;
for (int device : devices)
{
m_allowedDevices |= (1 << device);
}
}
// DeviceAllowed - is a particular device allowed?
// returns: true if the device is allowed, otherwise false
bool BestGpu::DeviceAllowed(int device)
{
return !!(m_allowedDevices & (1 << device));
}
// AllowAll - Reset the allowed filter to allow all GPUs
void BestGpu::AllowAll()
{
m_allowedDevices = -1; // set all bits
}
// UseMultiple - Are we using multiple GPUs?
// returns: true if more than one GPU was returned in last call
bool BestGpu::UseMultiple()
{
return m_lastCount > 1;
}
// GetDevices - Determine the best device IDs to use
// number - how many devices do we want?
// bestFlags - flags that modify how the score is calculated
std::vector<int> BestGpu::GetDevices(int number, BestGpuFlags p_bestFlags)
{
BestGpuFlags bestFlags = p_bestFlags;
// if they want all devices, give them everything we have
if (number == AllDevices)
number = max(m_deviceCount, 1);
else if (number == RequeryDevices)
{
number = m_lastCount;
}
// create the initial array, initialized to all CPU
std::vector<int> best(number, -1);
std::vector<double> scores(number, -1.0);
// if no GPUs were found, we should use the CPU
if (m_procData.size() == 0)
{
best.resize(1);
return best;
}
// get latest data
QueryNvmlData();
double utilGpuW = 0.15;
double utilMemW = 0.1;
double speedW = 0.2;
double freeMemW = 0.2;
double mlAppRunningW = 0.2;
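// Each allowed device is scored as a weighted sum (the weights above may be scaled up below):
//   score = utilGpuW * (1 - gpuUtil/75) + utilMemW * (1 - memUtil/60)
//         + speedW * cores/1000 + freeMemW * freeMem/totalMem
//         + mlAppRunningW * (1 if no known ML app is using the device, else 0)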
// if it's a requery, just use the same flags as last time
if (bestFlags & bestGpuRequery)
bestFlags = m_lastFlags;
// adjust weights if necessary
if (bestFlags & bestGpuAvoidSharing)
{
mlAppRunningW *= 3;
}
if (bestFlags & bestGpuFavorMemory) // favor memory
{
freeMemW *= 2;
}
if (bestFlags & bestGpuFavorUtilization) // favor low utilization
{
utilGpuW *= 2;
utilMemW *= 2;
}
if (bestFlags & bestGpuFavorSpeed) // favor fastest processor
{
speedW *= 2;
}
for (ProcessorData* pd : m_procData)
{
double score = 0.0;
if (!DeviceAllowed(pd->deviceId))
continue;
// GPU utilization score
score = (1.0 - pd->utilization.gpu / 75.0f) * utilGpuW;
score += (1.0 - pd->utilization.memory / 60.0f) * utilMemW;
score += pd->cores / 1000.0f * speedW;
double mem = pd->memory.free / (double)pd->memory.total;
// if it's not a TCC driver, it's a WDDM driver, and the NVML numbers will be off (Windows reserves all the memory from NVML's point of view), so use the CUDA numbers instead
if (!pd->deviceProp.tccDriver)
mem = pd->cudaFreeMem / (double)pd->cudaTotalMem;
score += mem * freeMemW;
score += ((pd->cnFound || pd->dbnFound) ? 0 : 1)*mlAppRunningW;
for (int i = 0; i < (int)best.size(); i++)
{
// look for a better score
if (score > scores[i])
{
// make room for this score in the correct location (insertion sort)
for (int j = (int)best.size() - 1; j > i; --j)
{
scores[j] = scores[j - 1];
best[j] = best[j - 1];
}
scores[i] = score;
best[i] = pd->deviceId;
break;
}
}
}
// now get rid of any extra empty slots and disallowed devices
for (int j = (int)best.size() - 1; j > 0; --j)
{
// if this trailing slot was never filled (device disallowed or never set), remove it
if (best[j] == -1)
best.pop_back();
else
break;
}
// save off the last values for future requeries
m_lastFlags = bestFlags;
m_lastCount = (int)best.size();
// if we eliminated all GPUs, use CPU
if (best.size() == 0)
{
best.push_back(-1);
}
return best; // return the array of the best GPUs
}
// QueryNvmlData - Query data from the Nvidia Management Library, and accumulate counters
void BestGpu::QueryNvmlData()
{
if (!m_cudaData)
return;
for (int i = 0; i < m_deviceCount; i++)
{
nvmlDevice_t device;
nvmlPciInfo_t pci;
nvmlMemory_t memory;
nvmlUtilization_t utilization;
// Query for device handle to perform operations on a device
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(i, &device);
if (NVML_SUCCESS != result)
{
return;
}
// pci.busId is very useful for identifying which physical device you're talking to;
// using the PCI identifier you can also match the nvmlDevice handle to a CUDA device.
result = nvmlDeviceGetPciInfo(device, &pci);
if (NVML_SUCCESS != result)
{
return;
}
ProcessorData* curPd = NULL;
for (ProcessorData* pd : m_procData)
{
if (pd->deviceProp.pciBusID == (int)pci.bus)
{
curPd = pd;
break;
}
}
if (curPd == NULL)
continue;
// Get the memory usage, will only work for TCC drivers
result = nvmlDeviceGetMemoryInfo(device, &memory);
if (NVML_SUCCESS != result)
{
return;
}
curPd->memory = memory;
// Get the utilization rates
result = nvmlDeviceGetUtilizationRates(device, &utilization);
if (NVML_SUCCESS != result)
{
return;
}
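// fold the new sample into a running average that weights the newest sample double:
//   avg' = (avg * n + sample * 2) / (n + 2), where n = m_queryCount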
if (m_queryCount)
{
// average, slightly overweighting the most recent query
curPd->utilization.gpu = (curPd->utilization.gpu*m_queryCount + utilization.gpu * 2) / (m_queryCount + 2);
curPd->utilization.memory = (curPd->utilization.memory*m_queryCount + utilization.memory * 2) / (m_queryCount + 2);
}
else
{
curPd->utilization = utilization;
}
m_queryCount++;
unsigned size = 0;
// the first call with NULL only retrieves the required buffer size into 'size'
result = nvmlDeviceGetComputeRunningProcesses(device, &size, NULL);
if (size > 0)
{
std::vector<nvmlProcessInfo_t> processInfo(size);
for (nvmlProcessInfo_t& info : processInfo) // by reference, so the zeroing takes effect
info.usedGpuMemory = 0;
result = nvmlDeviceGetComputeRunningProcesses(device, &size, &processInfo[0]);
if (NVML_SUCCESS != result)
{
return;
}
bool cnFound = false;
bool dbnFound = false;
for (nvmlProcessInfo_t info : processInfo)
{
std::string name;
name.resize(256);
unsigned len = (unsigned)name.length();
nvmlSystemGetProcessName(info.pid, (char*)name.data(), len);
name.resize(strlen(name.c_str()));
size_t pos = name.find_last_of('\\');
if (pos != std::string::npos)
name = name.substr(pos + 1);
if (GetCurrentProcessId() == info.pid || name.length() == 0)
continue;
cnFound = (cnFound || (!name.compare("cn.exe")));
dbnFound = (dbnFound || (!name.compare("dbn.exe")));
}
// set values to save
curPd->cnFound = cnFound;
curPd->dbnFound = dbnFound;
}
}
m_nvmlData = true;
return;
}
}}}
#ifdef _WIN32
// ---------------------------------------------------------------------------
// some interfacing with the Windows DLL system to ensure clean shutdown vs. delay loading of CUDA DLLs
// ---------------------------------------------------------------------------
// The "notify hook" gets called for every call to the
// delay load helper. This allows a user to hook every call and
// skip the delay load helper entirely.
//
// dliNotify == { dliStartProcessing | dliNotePreLoadLibrary | dliNotePreGetProc | dliNoteEndProcessing } on this call.
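// Note: these hooks only take effect if nvml.dll is delay-loaded, i.e. the binary is
// linked with delayimp.lib and the /DELAYLOAD:nvml.dll linker option.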
extern "C" INT_PTR WINAPI DelayLoadNofify(
unsigned dliNotify,
PDelayLoadInfo pdli
)
{
// load the library from an alternate path
if (dliNotify == dliNotePreLoadLibrary && !strcmp(pdli->szDll, "nvml.dll"))
{
WCHAR *path;
WCHAR nvmlPath[MAX_PATH] = { 0 };
HRESULT hr = SHGetKnownFolderPath(FOLDERID_ProgramFiles, 0, NULL, &path);
if (SUCCEEDED(hr)) // only touch 'path' if the query succeeded
{
lstrcpy(nvmlPath, path);
CoTaskMemFree(path);
HMODULE module = NULL;
const WCHAR* dllName = L"\\NVIDIA Corporation\\NVSMI\\nvml.dll";
lstrcat(nvmlPath, dllName);
module = LoadLibraryEx(nvmlPath, NULL, LOAD_WITH_ALTERED_SEARCH_PATH);
return (INT_PTR)module;
}
}
// check for failed GetProc, old version of the driver
if (dliNotify == dliFailGetProc && !strcmp(pdli->szDll, "nvml.dll"))
{
char name[256];
size_t len = strlen(pdli->dlp.szProcName);
strcpy_s(name, pdli->dlp.szProcName);
// if the version 2 APIs are not supported, truncate "_v2"
if (len>3 && name[len-1] == '2')
name[len-3] = 0;
FARPROC pfnRet = ::GetProcAddress(pdli->hmodCur, name);
return (INT_PTR)pfnRet;
}
return NULL;
}
ExternC
PfnDliHook __pfnDliNotifyHook2 = (PfnDliHook)DelayLoadNotify;
// This is the failure hook, dliNotify = {dliFailLoadLib|dliFailGetProc}
ExternC
PfnDliHook __pfnDliFailureHook2 = (PfnDliHook)DelayLoadNotify;
#endif // _WIN32
#endif // CPUONLY