Add more logging; fix the case where no best tree is found

This commit is contained in:
sanjeevm0 2018-08-28 10:08:32 -07:00
Родитель bee81de573
Коммит 3b76d60186
3 изменённых файлов: 39 добавлений и 23 удалений

Просмотреть файл

@ -228,7 +228,7 @@ func translateToTree(node *sctypes.SortedTreeNode, cont *types.ContainerInfo) {
}
// find total GPUs needed
func ConvertToBestGPURequests(podInfo *types.PodInfo) {
func ConvertToBestGPURequests(podInfo *types.PodInfo) bool {
numGPUs := int64(0)
for _, cont := range podInfo.RunningContainers {
numGPUs += cont.Requests[gputypes.ResourceGPU]
@ -239,19 +239,23 @@ func ConvertToBestGPURequests(podInfo *types.PodInfo) {
}
}
bestTree := findBestTreeInCache(int(numGPUs))
//fmt.Printf("Best tree\n")
//types.PrintTreeNode(bestTree)
// now translate requests to best tree
contKeys := utils.SortedStringKeys(podInfo.RunningContainers)
for _, contKey := range contKeys {
contCopy := podInfo.RunningContainers[contKey]
translateToTree(bestTree, &contCopy)
podInfo.RunningContainers[contKey] = contCopy
}
contKeys = utils.SortedStringKeys(podInfo.InitContainers)
for _, contKey := range contKeys {
contCopy := podInfo.InitContainers[contKey]
translateToTree(bestTree, &contCopy)
podInfo.InitContainers[contKey] = contCopy
if bestTree != nil {
//fmt.Printf("Best tree\n")
//types.PrintTreeNode(bestTree)
// now translate requests to best tree
contKeys := utils.SortedStringKeys(podInfo.RunningContainers)
for _, contKey := range contKeys {
contCopy := podInfo.RunningContainers[contKey]
translateToTree(bestTree, &contCopy)
podInfo.RunningContainers[contKey] = contCopy
}
contKeys = utils.SortedStringKeys(podInfo.InitContainers)
for _, contKey := range contKeys {
contCopy := podInfo.InitContainers[contKey]
translateToTree(bestTree, &contCopy)
podInfo.InitContainers[contKey] = contCopy
}
return true
}
return false
}

Просмотреть файл

@ -23,7 +23,7 @@ func TranslateGPUContainerResources(alloc types.ResourceList, cont types.Contain
return TranslateGPUResources(numGPUs, alloc, cont.DevRequests)
}
func TranslateGPUResorces(nodeInfo *types.NodeInfo, podInfo *types.PodInfo) error {
func TranslateGPUResorces(nodeInfo *types.NodeInfo, podInfo *types.PodInfo) (error, bool) {
if podInfo.Requests[GPUTopologyGeneration] == int64(0) { // zero implies no topology, or topology explictly given
for contName, contCopy := range podInfo.InitContainers {
contCopy.DevRequests = TranslateGPUContainerResources(nodeInfo.Allocatable, contCopy)
@ -33,12 +33,13 @@ func TranslateGPUResorces(nodeInfo *types.NodeInfo, podInfo *types.PodInfo) erro
contCopy.DevRequests = TranslateGPUContainerResources(nodeInfo.Allocatable, contCopy)
podInfo.RunningContainers[contName] = contCopy
}
return nil
return nil, true
} else if podInfo.Requests[GPUTopologyGeneration] == int64(1) {
ConvertToBestGPURequests(podInfo)
return nil
found := ConvertToBestGPURequests(podInfo) // found a tree
return nil, found
} else {
return fmt.Errorf("Invalid topology generation request")
glog.Errorf("Invalid topology generation request %v", podInfo.Requests[GPUTopologyGeneration])
return fmt.Errorf("Invalid topology generation request"), false
}
}
@ -51,9 +52,13 @@ func (ns *NvidiaGPUScheduler) RemoveNode(nodeName string) {
}
func (ns *NvidiaGPUScheduler) PodFitsDevice(nodeInfo *types.NodeInfo, podInfo *types.PodInfo, fillAllocateFrom bool, runGrpScheduler bool) (bool, []sctypes.PredicateFailureReason, float64) {
err := TranslateGPUResorces(nodeInfo, podInfo)
err, found := TranslateGPUResorces(nodeInfo, podInfo)
if err != nil {
panic("Unexpected error")
//panic("Unexpected error")
return false, nil, 0.0
}
if !found {
return false, nil, 0.0
}
if runGrpScheduler {
glog.V(5).Infof("Running group scheduler on device requests %+v", podInfo)
@ -63,10 +68,13 @@ func (ns *NvidiaGPUScheduler) PodFitsDevice(nodeInfo *types.NodeInfo, podInfo *t
}
func (ns *NvidiaGPUScheduler) PodAllocate(nodeInfo *types.NodeInfo, podInfo *types.PodInfo, runGrpScheduler bool) error {
err := TranslateGPUResorces(nodeInfo, podInfo)
err, found := TranslateGPUResorces(nodeInfo, podInfo)
if err != nil {
return err
}
if !found {
return fmt.Errorf("TranslateGPUResorces fails as no translation is found")
}
if runGrpScheduler {
fits, reasons, _ := grpalloc.PodFitsGroupConstraints(nodeInfo, podInfo, true)
if !fits {

Просмотреть файл

@ -130,10 +130,12 @@ func (ngm *NvidiaGPUManager) UpdateGPUInfo() error {
if err != nil {
return err
}
glog.V(5).Infof("GetGPUInfo returns %s", string(body))
var gpus gpusInfo
if err := json.Unmarshal(body, &gpus); err != nil {
return err
}
glog.V(5).Infof("GPUInfo: %+v", gpus)
// convert certain resources to correct units, such as memory and Bandwidth
for i := range gpus.Gpus {
gpus.Gpus[i].Memory.Global *= int64(1024) * int64(1024) // in units of MiB
@ -202,9 +204,11 @@ func (ngm *NvidiaGPUManager) Start() error {
func (ngm *NvidiaGPUManager) UpdateNodeInfo(nodeInfo *types.NodeInfo) error {
err := ngm.UpdateGPUInfo() // don't care about error, ignore it
if err != nil {
glog.Infof("UpdateGPUInfo encounters error %+v, setting GPUs to zero", err)
ngm.numGpus = 0
return err
}
glog.V(4).Infof("NumGPUs found = %d", ngm.numGpus)
nodeInfo.Capacity[gputypes.ResourceGPU] = int64(len(ngm.gpus))
nodeInfo.Allocatable[gputypes.ResourceGPU] = int64(len(ngm.gpus))
for _, val := range ngm.gpus {