Mirror of https://github.com/microsoft/KubeGPU.git

Commit: 3b76d60186
Parent: bee81de573

More logging, fix case of no best tree found
@@ -228,7 +228,7 @@ func translateToTree(node *sctypes.SortedTreeNode, cont *types.ContainerInfo) {
 }
 
 // find total GPUs needed
-func ConvertToBestGPURequests(podInfo *types.PodInfo) {
+func ConvertToBestGPURequests(podInfo *types.PodInfo) bool {
 	numGPUs := int64(0)
 	for _, cont := range podInfo.RunningContainers {
 		numGPUs += cont.Requests[gputypes.ResourceGPU]
@@ -239,19 +239,23 @@ func ConvertToBestGPURequests(podInfo *types.PodInfo) {
 		}
 	}
 	bestTree := findBestTreeInCache(int(numGPUs))
-	//fmt.Printf("Best tree\n")
-	//types.PrintTreeNode(bestTree)
-	// now translate requests to best tree
-	contKeys := utils.SortedStringKeys(podInfo.RunningContainers)
-	for _, contKey := range contKeys {
-		contCopy := podInfo.RunningContainers[contKey]
-		translateToTree(bestTree, &contCopy)
-		podInfo.RunningContainers[contKey] = contCopy
-	}
-	contKeys = utils.SortedStringKeys(podInfo.InitContainers)
-	for _, contKey := range contKeys {
-		contCopy := podInfo.InitContainers[contKey]
-		translateToTree(bestTree, &contCopy)
-		podInfo.InitContainers[contKey] = contCopy
-	}
+	if bestTree != nil {
+		//fmt.Printf("Best tree\n")
+		//types.PrintTreeNode(bestTree)
+		// now translate requests to best tree
+		contKeys := utils.SortedStringKeys(podInfo.RunningContainers)
+		for _, contKey := range contKeys {
+			contCopy := podInfo.RunningContainers[contKey]
+			translateToTree(bestTree, &contCopy)
+			podInfo.RunningContainers[contKey] = contCopy
+		}
+		contKeys = utils.SortedStringKeys(podInfo.InitContainers)
+		for _, contKey := range contKeys {
+			contCopy := podInfo.InitContainers[contKey]
+			translateToTree(bestTree, &contCopy)
+			podInfo.InitContainers[contKey] = contCopy
+		}
+		return true
+	}
+	return false
 }
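
The substantive fix is in this hunk: findBestTreeInCache can return nil when the cache holds no tree for the requested GPU count, and the old code passed that nil straight into translateToTree. Below is a minimal, self-contained sketch of the new contract; findTree and convert are hypothetical stand-ins for findBestTreeInCache and ConvertToBestGPURequests, and the 8-GPU cache limit is an invented assumption.

package main

import "fmt"

type treeNode struct{ gpus int }

// findTree stands in for findBestTreeInCache: nil means no match in the cache.
func findTree(numGPUs int) *treeNode {
	if numGPUs > 8 { // assumption: the cache only holds trees for up to 8 GPUs
		return nil
	}
	return &treeNode{gpus: numGPUs}
}

// convert mirrors the new shape of ConvertToBestGPURequests:
// translate only when a tree exists, and report success to the caller.
func convert(numGPUs int) bool {
	if tree := findTree(numGPUs); tree != nil {
		// ... translate each container's requests onto tree ...
		return true
	}
	return false
}

func main() {
	fmt.Println(convert(4))  // true: a tree was found and requests were translated
	fmt.Println(convert(16)) // false: requests are left untranslated
}
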
@@ -23,7 +23,7 @@ func TranslateGPUContainerResources(alloc types.ResourceList, cont types.Contain
 	return TranslateGPUResources(numGPUs, alloc, cont.DevRequests)
 }
 
-func TranslateGPUResorces(nodeInfo *types.NodeInfo, podInfo *types.PodInfo) error {
+func TranslateGPUResorces(nodeInfo *types.NodeInfo, podInfo *types.PodInfo) (error, bool) {
 	if podInfo.Requests[GPUTopologyGeneration] == int64(0) { // zero implies no topology, or topology explictly given
 		for contName, contCopy := range podInfo.InitContainers {
 			contCopy.DevRequests = TranslateGPUContainerResources(nodeInfo.Allocatable, contCopy)
@@ -33,12 +33,13 @@ func TranslateGPUResorces(nodeInfo *types.NodeInfo, podInfo *types.PodInfo) erro
 			contCopy.DevRequests = TranslateGPUContainerResources(nodeInfo.Allocatable, contCopy)
 			podInfo.RunningContainers[contName] = contCopy
 		}
-		return nil
+		return nil, true
 	} else if podInfo.Requests[GPUTopologyGeneration] == int64(1) {
-		ConvertToBestGPURequests(podInfo)
-		return nil
+		found := ConvertToBestGPURequests(podInfo) // found a tree
+		return nil, found
 	} else {
-		return fmt.Errorf("Invalid topology generation request")
+		glog.Errorf("Invalid topology generation request %v", podInfo.Requests[GPUTopologyGeneration])
+		return fmt.Errorf("Invalid topology generation request"), false
 	}
 }
 
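
For reference, a self-contained sketch of the two-value result TranslateGPUResorces now produces: the error flags an invalid request, while the bool reports whether a translation was found. The names and hard-coded branch outcomes below are illustrative only, not the repository's code; note that idiomatic Go would usually order the results as (bool, error).

package main

import (
	"errors"
	"fmt"
)

// translate sketches the (error, bool) contract of TranslateGPUResorces.
func translate(topologyGen int64) (error, bool) {
	switch topologyGen {
	case 0:
		return nil, true // no topology generation requested; trivially translated
	case 1:
		return nil, false // topology requested; shown here as "no best tree found"
	default:
		return errors.New("invalid topology generation request"), false
	}
}

func main() {
	for _, gen := range []int64{0, 1, 2} {
		err, found := translate(gen)
		fmt.Printf("gen=%d err=%v found=%v\n", gen, err, found)
	}
}
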
@@ -51,9 +52,13 @@ func (ns *NvidiaGPUScheduler) RemoveNode(nodeName string) {
 }
 
 func (ns *NvidiaGPUScheduler) PodFitsDevice(nodeInfo *types.NodeInfo, podInfo *types.PodInfo, fillAllocateFrom bool, runGrpScheduler bool) (bool, []sctypes.PredicateFailureReason, float64) {
-	err := TranslateGPUResorces(nodeInfo, podInfo)
+	err, found := TranslateGPUResorces(nodeInfo, podInfo)
 	if err != nil {
-		panic("Unexpected error")
+		//panic("Unexpected error")
+		return false, nil, 0.0
+	}
+	if !found {
+		return false, nil, 0.0
 	}
 	if runGrpScheduler {
 		glog.V(5).Infof("Running group scheduler on device requests %+v", podInfo)
@@ -63,10 +68,13 @@ func (ns *NvidiaGPUScheduler) PodFitsDevice(nodeInfo *types.NodeInfo, podInfo *t
 }
 
 func (ns *NvidiaGPUScheduler) PodAllocate(nodeInfo *types.NodeInfo, podInfo *types.PodInfo, runGrpScheduler bool) error {
-	err := TranslateGPUResorces(nodeInfo, podInfo)
+	err, found := TranslateGPUResorces(nodeInfo, podInfo)
 	if err != nil {
 		return err
 	}
+	if !found {
+		return fmt.Errorf("TranslateGPUResorces fails as no translation is found")
+	}
 	if runGrpScheduler {
 		fits, reasons, _ := grpalloc.PodFitsGroupConstraints(nodeInfo, podInfo, true)
 		if !fits {
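
The two call sites treat a missing translation differently, which fits their roles: the fit predicate simply reports that the pod does not fit (instead of panicking), while allocation, which only runs after a successful fit, surfaces an explicit error. A compact, self-contained sketch of that split, with hypothetical names rather than the repository's API:

package main

import (
	"errors"
	"fmt"
)

// translate is a stub for TranslateGPUResorces' new (error, bool) result.
func translate(found bool) (error, bool) { return nil, found }

// podFits mirrors PodFitsDevice: an error or a missing translation now
// means "does not fit on this node" rather than a panic.
func podFits(nodeOK bool) bool {
	err, found := translate(nodeOK)
	return err == nil && found
}

// podAllocate mirrors PodAllocate: allocation runs only after a
// successful fit, so a missing translation is surfaced as an error.
func podAllocate(nodeOK bool) error {
	err, found := translate(nodeOK)
	if err != nil {
		return err
	}
	if !found {
		return errors.New("no translation found")
	}
	return nil
}

func main() {
	fmt.Println(podFits(false))     // false
	fmt.Println(podAllocate(false)) // no translation found
}
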
@@ -130,10 +130,12 @@ func (ngm *NvidiaGPUManager) UpdateGPUInfo() error {
 	if err != nil {
 		return err
 	}
+	glog.V(5).Infof("GetGPUInfo returns %s", string(body))
 	var gpus gpusInfo
 	if err := json.Unmarshal(body, &gpus); err != nil {
 		return err
 	}
+	glog.V(5).Infof("GPUInfo: %+v", gpus)
 	// convert certain resources to correct units, such as memory and Bandwidth
 	for i := range gpus.Gpus {
 		gpus.Gpus[i].Memory.Global *= int64(1024) * int64(1024) // in units of MiB
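
The added log lines rely on glog's leveled verbosity: glog.V(5) calls are always compiled in but emit only when the process runs with verbosity 5 or higher. A minimal sketch of wiring that up (the binary name in the comment is hypothetical):

package main

import (
	"flag"

	"github.com/golang/glog"
)

func main() {
	// glog registers its flags (-v, -logtostderr, ...) in its init,
	// so flag.Parse makes them available on the command line.
	flag.Parse()
	// Emitted only when started with e.g.: ./kubegpu-plugin -logtostderr -v=5
	glog.V(5).Infof("GPUInfo: %+v", struct{ Name string }{"some-gpu"})
	glog.Flush()
}
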
@@ -202,9 +204,11 @@ func (ngm *NvidiaGPUManager) Start() error {
 func (ngm *NvidiaGPUManager) UpdateNodeInfo(nodeInfo *types.NodeInfo) error {
 	err := ngm.UpdateGPUInfo() // don't care about error, ignore it
 	if err != nil {
+		glog.Infof("UpdateGPUInfo encounters error %+v, setting GPUs to zero", err)
 		ngm.numGpus = 0
 		return err
 	}
+	glog.V(4).Infof("NumGPUs found = %d", ngm.numGpus)
 	nodeInfo.Capacity[gputypes.ResourceGPU] = int64(len(ngm.gpus))
 	nodeInfo.Allocatable[gputypes.ResourceGPU] = int64(len(ngm.gpus))
 	for _, val := range ngm.gpus {