[Hived]: Expose and Refine Pod Waiting Reason (#3931)

Yuqi Wang 2019-12-04 11:48:20 +08:00 committed by GitHub
Parent 1dcb1198d9
Commit f22085b159
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 51 additions and 24 deletions

View file

@@ -71,6 +71,7 @@ func NewHivedAlgorithm(sConfig *api.Config) *HivedAlgorithm {
reservedCells: reservedPc,
}
for vc := range nonReservedVcl {
// TODO: Support per-VC configurable intra VC scheduling algo.
h.vcSchedulers[vc] = newDefaultIntraVCScheduler(nonReservedVcl[vc], reservedVcl[vc], gpuNums)
}
for chain, ccl := range h.fullCellList {
@@ -407,7 +408,8 @@ func (h *HivedAlgorithm) scheduleAffinityGroupForGpuType(
if gpuType != "" {
if chains := h.chains[gpuType]; chains == nil {
panic(internal.NewBadRequestError(fmt.Sprintf(
"[%v]: pod requesting an invalid GPU type: %v", internal.Key(pod), gpuType)))
"[%v]: pod requesting GPU type %v which the whole cluster does not have",
internal.Key(pod), gpuType)))
} else {
vcHasType := false
for _, chain := range chains {
@@ -512,7 +514,7 @@ func (h *HivedAlgorithm) scheduleGuaranteedAffinityGroup(
c := buddyAlloc(h.getTmpFreeCellList(sr.chain), pac.GetLevel(), suggestedNodeSet)
if c == nil {
panic(fmt.Sprintf(
"Cannot find physical cell for a VC cell: %v", pac.GetName()))
"VC Safety Broken: Cannot find physical cell for a VC cell: %v", pac.GetName()))
} else {
preassignedPhysical = c
// create binding (which is temporary and will be cleared after the scheduling,
@@ -746,7 +748,8 @@ func generatePodScheduleResult(
if affinityGroupBindInfo == nil {
return internal.PodScheduleResult{
PodWaitInfo: &internal.PodWaitInfo{
FailedNodeReasons: map[string]string{},
// TODO: Enrich the Pod Waiting Reason.
Reason: "",
},
}
}
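Editor's note: the waiting path now surfaces a single free-text Reason rather than the old per-node FailedNodeReasons map. A minimal sketch of how a caller might fill it in once the TODO above is addressed; the helper and message text are hypothetical, only PodScheduleResult and PodWaitInfo come from this diff:

// Hypothetical example: return a waiting result with an enriched reason.
func newWaitResult(chain CellChain, gpuNum int32) internal.PodScheduleResult {
	return internal.PodScheduleResult{
		PodWaitInfo: &internal.PodWaitInfo{
			Reason: fmt.Sprintf(
				"no free or preemptable cell for %v GPU(s) in chain %v", gpuNum, chain),
		},
	}
}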
@@ -795,6 +798,7 @@ func generatePodScheduleResult(
} else {
// we check suggested nodes after the preemption is done, otherwise the preemption victims
// may cause the selected node to be excluded from the suggested nodes
// TODO: Keep selectedNode within suggestedNodeSet, so this branch should be Unreachable
if !suggestedNodeSet.Contains(selectedNode) && newGroup {
panic(fmt.Sprintf("[%v]: node %v picked by algorithm but not in K8S candidates",
internal.Key(pod), selectedNode))
@@ -840,8 +844,9 @@ func generateAffinityGroupBindInfo(
for gpuIndex := 0; gpuIndex < len(podPhysicalPlacements[podIndex]); gpuIndex++ {
pGpu := podPhysicalPlacements[podIndex][gpuIndex]
if pGpu == nil {
klog.Warningf("Resources previously allocated has been invalid; pod should wait")
return nil, "", nil
// TODO: Should insist on the binding and continue to force bind, so the Pod may
// run or retry instead of being stuck in PodBinding state forever here.
panic("Resources previously allocated has been invalid; pod should wait")
}
nodes, gpuIndices := pGpu.(*PhysicalCell).GetPhysicalPlacement()
// here each cell (i.e., pGpu) is only one GPU, hence we take the first element
@@ -932,7 +937,7 @@ func mapNonPreassignedCellToPhysical(c *VirtualCell, suggestedNodeSet common.Set
parentPhysical := mapNonPreassignedCellToPhysical(c.GetParent().(*VirtualCell), suggestedNodeSet)
pc := getFewestOpporPhysicalCell(parentPhysical.GetChildren(), suggestedNodeSet)
if pc == nil || pc.GetPriority() > opportunisticPriority {
panic(fmt.Sprintf("Cannot find physical cell for %v", c.GetName()))
panic(fmt.Sprintf("VC Safety Broken: Cannot find physical cell for %v", c.GetName()))
}
c.SetPreBoundPhysicalCell(pc)
pc.SetPreBoundVirtualCell(c)
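Editor's note: both "VC Safety Broken" panics above guard the invariant that any virtual cell can be mapped to a physical cell. buddyAlloc, used in scheduleGuaranteedAffinityGroup, looks for a free physical cell at the requested level and otherwise splits a larger free cell, in the style of a buddy allocator. A simplified, self-contained sketch of that idea; the types and helper below are illustrative only, not the repo's CellList API:

// Toy buddy-style allocation: freeCells[level] holds free cells at that level;
// splitting a higher-level cell releases its sibling children back to the free list.
type toyCell struct {
	level    int
	children []*toyCell
}

func toyBuddyAlloc(freeCells map[int][]*toyCell, level, topLevel int) *toyCell {
	if cells := freeCells[level]; len(cells) > 0 {
		freeCells[level] = cells[1:]
		return cells[0]
	}
	if level >= topLevel {
		return nil // nothing left to split; the caller treats this as "VC Safety Broken"
	}
	parent := toyBuddyAlloc(freeCells, level+1, topLevel)
	if parent == nil || len(parent.children) == 0 {
		return nil
	}
	// keep one child for the request, return its buddies to the lower-level free list
	freeCells[level] = append(freeCells[level], parent.children[1:]...)
	return parent.children[0]
}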

View file

@@ -188,7 +188,7 @@ var pss = map[types.UID]api.PodSchedulingSpec{
GpuType: "",
GpuNumber: 5,
AffinityGroup: group9,
}, "pod10": { // use a GPU type that the VC does not have; should panic BadRequest
}, "pod10": { // use a GPU type that the VC does not have; should User Error Panic
VirtualCluster: "VC2",
Priority: 1,
LazyPreemptionEnable: true,
@@ -445,13 +445,15 @@ func testCasesThatShouldSucceed(t *testing.T, h *HivedAlgorithm) {
func testOneCaseThatShouldFail(t *testing.T, h *HivedAlgorithm, podNames []string) {
defer func() {
if r := recover(); r != nil {
if err, ok := r.(*api.WebServerError); ok && err.Code == http.StatusBadRequest {
t.Logf("Got BadRequest as expected: %v", err)
if err, ok := r.(*api.WebServerError); ok &&
err.Code >= http.StatusBadRequest &&
err.Code < http.StatusInternalServerError {
t.Logf("Got User Error Panic as expected: %v", err)
} else {
t.Errorf("Expected BadRequest error, but got %v", r)
t.Errorf("Expected User Error Panic, but got %v", r)
}
} else {
t.Errorf("Expected BadRequest error, but got none")
t.Errorf("Expected User Error Panic, but got none")
}
}()
var psr internal.PodScheduleResult
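Editor's note: the new pod10 case above would presumably be exercised through this helper; a hypothetical invocation (the exact call site is not shown in this diff):

// pod10 requests a GPU type its VC does not have, so the scheduler is expected
// to panic with a 4xx WebServerError (a User Error Panic).
testOneCaseThatShouldFail(t, h, []string{"pod10"})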

View file

@@ -46,6 +46,7 @@ type defaultIntraVCScheduler struct {
// currently we create a topologyAwareScheduler for each cluster view (each chain, each reservation).
// we plan to support multiple cluster views in one scheduler, and to support scheduling pods
// across different cluster views.
// TODO: Support relaxing an affinity group so that it can be allocated across multiple chains.
nonReservedSchedulers map[CellChain]*topologyAwareScheduler
reservedSchedulers map[api.ReservationId]*topologyAwareScheduler
}
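Editor's note: the struct keeps one topologyAwareScheduler per cluster view. A hedged sketch of how a lookup could dispatch between the two maps; the method is hypothetical, and comparing the ReservationId against "" assumes it is a string type:

// Hypothetical dispatch: prefer the reservation-specific scheduler when a
// reservation is given, otherwise fall back to the per-chain scheduler.
func (s *defaultIntraVCScheduler) schedulerFor(
	chain CellChain, reservation api.ReservationId) *topologyAwareScheduler {
	if reservation != "" {
		return s.reservedSchedulers[reservation]
	}
	return s.nonReservedSchedulers[chain]
}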

View file

@@ -112,6 +112,8 @@ func (t *topologyAwareScheduler) Schedule(
for podIndex := 0; podIndex < len(sortedPodGpuNumbers); podIndex++ {
gpuNumber := sortedPodGpuNumbers[podIndex]
n := selectedNodes[podIndex]
// TODO: Optimize findNodesForPods and findGpusInNode together to get a better placement,
// such as also being aware of intra-node topology in findNodesForPods.
selectedGpus, nodeAvailableGpus[n] = findGpusInNode(n, gpuNumber, priority, nodeAvailableGpus[n], t.levelGpuNum)
if podPlacements[gpuNumber] == nil {
podPlacements[gpuNumber] = []CellList{}
@@ -227,6 +229,12 @@ func (t *topologyAwareScheduler) updateClusterView(p CellPriority, suggestedNode
func findNodesForPods(cv clusterView, gpuNums []int32, p CellPriority) []int32 {
// sort the nodes according to gpu numbers in each node.
// this is achieved through the Less method defined in type CellList.
// TODO: Ensure Opportunistic Pods can also always find a solution, regardless of
// the iteration order.
// For example:
// 1. clusterView = 2GPU Node, 1GPU Node
// 2. gpuNums = 1GPU Pod, 2GPU Pod
// The 1GPU Pod may be allocated to the 2GPU Node first, so the 2GPU Pod can no longer fit.
sort.Stable(cv)
currentNodeIndices := make([]int32, len(gpuNums)) // indices of the currently picked nodes
podIndex := 0
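Editor's note: the TODO above describes the ordering pitfall concretely: with free capacities of 2 and 1 GPUs and pods requesting 1 and 2 GPUs, greedily placing the 1-GPU pod on the 2-GPU node leaves the 2-GPU pod unplaceable even though a feasible assignment exists. A toy, self-contained illustration (not the repo's clusterView or CellList types):

// Toy greedy placement: pick the first node with enough free GPUs, in the given order.
// Returns false if some pod cannot be placed.
func greedyPlace(freeGpus []int32, podGpus []int32) bool {
	for _, need := range podGpus {
		placed := false
		for i, free := range freeGpus {
			if free >= need {
				freeGpus[i] -= need
				placed = true
				break
			}
		}
		if !placed {
			return false
		}
	}
	return true
}

// greedyPlace([]int32{2, 1}, []int32{1, 2}) == false: the 1GPU Pod lands on the
// 2GPU Node, so the 2GPU Pod no longer fits.
// greedyPlace([]int32{2, 1}, []int32{2, 1}) == true: placing the larger pod first succeeds.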
@@ -320,7 +328,8 @@ func findGpusInNode(
searchGpuIndex--
if searchGpuIndex < 0 {
if bestAffinity == highestLevel {
panic(fmt.Sprintf("failed to allocate %v GPUs in picked node %v", gpuNum, n.GetName()))
// Unreachable
panic(fmt.Sprintf("Assert Failure: failed to allocate %v GPUs in picked node %v", gpuNum, n.GetName()))
}
availableGpus = removePickedGpus(availableGpus, bestAffinityGpuIndices)
return bestAffinityGpus, availableGpus
@@ -336,7 +345,9 @@ func getOptimalAffinity(gpuNum int32, levelGpuNum map[CellLevel]int32) CellLevel
return l
}
}
panic(fmt.Sprintf("pod allocated a node but exceeds the capacity of the current chain"))
// Unreachable
panic(fmt.Sprintf("Assert Failure: pod allocated a node but exceeds the capacity of the current chain"))
}
// checkCurrentGpus checks if the currently picked GPUs have the lowest LCA. It also checks if the solution

View file

@@ -36,7 +36,7 @@ import (
// WebServer Callbacks with K8S Default Scheduler
// Notes:
// 1. Error should be passed by panic
// 1. Error should be delivered by panic
// 2. Should not assume previous succeeded operation also has been successfully
// executed by K8S Default Scheduler.
type ExtenderHandlers struct {
@@ -53,7 +53,11 @@ type InspectHandlers struct {
// SchedulerAlgorithm is used to make the pod schedule decision based on its whole
// cluster scheduling view constructed from its Add/Update/Delete callbacks.
// Notes:
// 1. Error should be passed by panic
// 1. Error should be delivered by panic and it will not change any state.
// For WebServer Callbacks, all Panics will be recovered as error responses,
// see HandleWebServerPanic.
// For Informer Callbacks, only User Error Panics will be recovered as error
// logs, other Panics will crash the whole process, see HandleInformerPanic.
// 2. Should take all the input parameters as readonly and return pod schedule
// decision by PodScheduleResult.
// 3. {Schedule, AddAllocatedPod, DeleteAllocatedPod} will never be executed
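Editor's note: the notes above hinge on a recovery wrapper at each callback boundary. A minimal sketch of the WebServer side, assuming only that api.WebServerError carries an HTTP status in Code (as the tests above use); the helper name and response formatting are hypothetical, not the repo's HandleWebServerPanic:

// Hypothetical recovery wrapper: turn panics inside a WebServer handler into
// HTTP error responses instead of crashing the process.
func recoverToResponse(w http.ResponseWriter) {
	if r := recover(); r != nil {
		if err, ok := r.(*api.WebServerError); ok {
			// Error raised intentionally via panic: reply with its own status code.
			http.Error(w, fmt.Sprintf("%v", err), int(err.Code))
		} else {
			// Unexpected panic: report as an internal server error.
			http.Error(w, fmt.Sprintf("%v", r), http.StatusInternalServerError)
		}
	}
}

// Usage inside a handler: defer recoverToResponse(w) at the top, then any panic
// below is converted into an error response.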
@@ -170,8 +174,8 @@ func IsAllocated(state PodState) bool {
// No need to use it to recover scheduler waiting resource
type PodWaitInfo struct {
// NodeName -> The reason why the node cannot be used for the Pod to allocate.
FailedNodeReasons map[string]string
// The reason why there is no preemptable or free resource to allocate the Pod now.
Reason string
}
// No need to use it to recover scheduler preempting resource

View file

@@ -527,19 +527,19 @@ func (s *HivedScheduler) filterRoutine(args ei.ExtenderArgs) *ei.ExtenderFilterR
}
// Return FailedNodes to tell K8S Default Scheduler that preemption may help.
failedNodeReasons := map[string]string{}
failedNodes := map[string]string{}
for _, victim := range result.PodPreemptInfo.VictimPods {
node := victim.Spec.NodeName
if _, ok := failedNodeReasons[node]; !ok {
failedNodeReasons[node] = fmt.Sprintf(
if _, ok := failedNodes[node]; !ok {
failedNodes[node] = fmt.Sprintf(
"node(%v) is waiting for victim Pod(s) to be preempted: %v",
node, internal.Key(victim))
} else {
failedNodeReasons[node] += ", " + internal.Key(victim)
failedNodes[node] += ", " + internal.Key(victim)
}
}
return &ei.ExtenderFilterResult{
FailedNodes: failedNodeReasons,
FailedNodes: failedNodes,
}
} else {
s.podScheduleStatuses[pod.UID] = &internal.PodScheduleStatus{
@@ -547,9 +547,13 @@ func (s *HivedScheduler) filterRoutine(args ei.ExtenderArgs) *ei.ExtenderFilterR
PodState: internal.PodWaiting,
PodScheduleResult: &result,
}
// Return Error to tell K8S Default Scheduler that preemption must not help.
if result.PodWaitInfo != nil {
return &ei.ExtenderFilterResult{
FailedNodes: result.PodWaitInfo.FailedNodeReasons,
Error: fmt.Sprintf(
"Pod is waiting for preemptable or free resource to appear: %v",
result.PodWaitInfo.Reason),
}
} else {
return &ei.ExtenderFilterResult{}
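Editor's note: the two branches above hand differently shaped results back to the K8S Default Scheduler; a sketch of each, with all field values illustrative:

// Hypothetical illustration of the two response shapes.
func exampleFilterResults() (*ei.ExtenderFilterResult, *ei.ExtenderFilterResult) {
	// Preemption may help: per-node reasons naming the victim Pod(s).
	preempting := &ei.ExtenderFilterResult{
		FailedNodes: map[string]string{
			"node1": "node(node1) is waiting for victim Pod(s) to be preempted: default/pod2",
		},
	}
	// Pod must wait: a single Error string, which per the comment above tells the
	// default scheduler that preemption must not help.
	waiting := &ei.ExtenderFilterResult{
		Error: "Pod is waiting for preemptable or free resource to appear: ...",
	}
	return preempting, waiting
}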

View file

@@ -133,7 +133,7 @@ func (ln tcpKeepAliveListener) Accept() (net.Conn, error) {
return tc, nil
}
// Error should be passed by panic
// Error should be delivered by panic
type servePathHandler func(w http.ResponseWriter, r *http.Request)
func (ws *WebServer) serve(handler servePathHandler) servePathHandler {