Mirror of https://github.com/microsoft/pai.git
[Hived]: Expose and Refine Pod Waiting Reason (#3931)
Parent: 1dcb1198d9
Commit: f22085b159
@@ -71,6 +71,7 @@ func NewHivedAlgorithm(sConfig *api.Config) *HivedAlgorithm {
         reservedCells: reservedPc,
     }
     for vc := range nonReservedVcl {
+        // TODO: Support per-VC configurable intra VC scheduling algo.
         h.vcSchedulers[vc] = newDefaultIntraVCScheduler(nonReservedVcl[vc], reservedVcl[vc], gpuNums)
     }
     for chain, ccl := range h.fullCellList {
@@ -407,7 +408,8 @@ func (h *HivedAlgorithm) scheduleAffinityGroupForGpuType(
     if gpuType != "" {
         if chains := h.chains[gpuType]; chains == nil {
             panic(internal.NewBadRequestError(fmt.Sprintf(
-                "[%v]: pod requesting an invalid GPU type: %v", internal.Key(pod), gpuType)))
+                "[%v]: pod requesting GPU type %v which the whole cluster does not have",
+                internal.Key(pod), gpuType)))
         } else {
             vcHasType := false
             for _, chain := range chains {
@@ -512,7 +514,7 @@ func (h *HivedAlgorithm) scheduleGuaranteedAffinityGroup(
         c := buddyAlloc(h.getTmpFreeCellList(sr.chain), pac.GetLevel(), suggestedNodeSet)
         if c == nil {
             panic(fmt.Sprintf(
-                "Cannot find physical cell for a VC cell: %v", pac.GetName()))
+                "VC Safety Broken: Cannot find physical cell for a VC cell: %v", pac.GetName()))
         } else {
             preassignedPhysical = c
             // create binding (which is temporary and will be cleared after the scheduling,
@@ -746,7 +748,8 @@ func generatePodScheduleResult(
     if affinityGroupBindInfo == nil {
         return internal.PodScheduleResult{
             PodWaitInfo: &internal.PodWaitInfo{
-                FailedNodeReasons: map[string]string{},
+                // TODO: Enrich the Pod Waiting Reason.
+                Reason: "",
             },
         }
     }
@@ -795,6 +798,7 @@ func generatePodScheduleResult(
     } else {
         // we check suggested nodes after the preemption is done, otherwise the preemption victims
         // may cause the selected node to be excluded from the suggested nodes
+        // TODO: Keep selectedNode within suggestedNodeSet, so here should be Unreachable
         if !suggestedNodeSet.Contains(selectedNode) && newGroup {
             panic(fmt.Sprintf("[%v]: node %v picked by algorithm but not in K8S candidates",
                 internal.Key(pod), selectedNode))
@@ -840,8 +844,9 @@ func generateAffinityGroupBindInfo(
         for gpuIndex := 0; gpuIndex < len(podPhysicalPlacements[podIndex]); gpuIndex++ {
             pGpu := podPhysicalPlacements[podIndex][gpuIndex]
             if pGpu == nil {
-                klog.Warningf("Resources previously allocated has been invalid; pod should wait")
-                return nil, "", nil
+                // TODO: Should insist binding and continue to force bind, then the Pod may
+                // run or retry, instead of stuck in PodBinding state forever here.
+                panic("Resources previously allocated has been invalid; pod should wait")
             }
             nodes, gpuIndices := pGpu.(*PhysicalCell).GetPhysicalPlacement()
             // here each cell (i.e., pGpu) is only one GPU, hence we takes the first element
@@ -932,7 +937,7 @@ func mapNonPreassignedCellToPhysical(c *VirtualCell, suggestedNodeSet common.Set
     parentPhysical := mapNonPreassignedCellToPhysical(c.GetParent().(*VirtualCell), suggestedNodeSet)
     pc := getFewestOpporPhysicalCell(parentPhysical.GetChildren(), suggestedNodeSet)
     if pc == nil || pc.GetPriority() > opportunisticPriority {
-        panic(fmt.Sprintf("Cannot find physical cell for %v", c.GetName()))
+        panic(fmt.Sprintf("VC Safety Broken: Cannot find physical cell for %v", c.GetName()))
     }
     c.SetPreBoundPhysicalCell(pc)
     pc.SetPreBoundVirtualCell(c)
@@ -188,7 +188,7 @@ var pss = map[types.UID]api.PodSchedulingSpec{
         GpuType:       "",
         GpuNumber:     5,
         AffinityGroup: group9,
-    }, "pod10": { // use a GPU type that the VC does not have; should panic BadRequest
+    }, "pod10": { // use a GPU type that the VC does not have; should User Error Panic
         VirtualCluster:       "VC2",
         Priority:             1,
         LazyPreemptionEnable: true,
@@ -445,13 +445,15 @@ func testCasesThatShouldSucceed(t *testing.T, h *HivedAlgorithm) {
 func testOneCaseThatShouldFail(t *testing.T, h *HivedAlgorithm, podNames []string) {
     defer func() {
         if r := recover(); r != nil {
-            if err, ok := r.(*api.WebServerError); ok && err.Code == http.StatusBadRequest {
-                t.Logf("Got BadRequest as expected: %v", err)
+            if err, ok := r.(*api.WebServerError); ok &&
+                err.Code >= http.StatusBadRequest &&
+                err.Code < http.StatusInternalServerError {
+                t.Logf("Got User Error Panic as expected: %v", err)
             } else {
-                t.Errorf("Expected BadRequest error, but got %v", r)
+                t.Errorf("Expected User Error Panic, but got %v", r)
             }
         } else {
-            t.Errorf("Expected BadRequest error, but got none")
+            t.Errorf("Expected User Error Panic, but got none")
         }
     }()
     var psr internal.PodScheduleResult
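The rewritten assertion above treats any api.WebServerError whose HTTP code falls in the 4xx range as a "User Error Panic". As a side note, here is a minimal standalone sketch of that range check, using only net/http status constants; it is an illustration, not code taken from the repository:

package main

import (
    "fmt"
    "net/http"
)

// isUserError mirrors the range check the test now uses: any 4xx status code
// (>= 400 and < 500) counts as a user error; 5xx and others do not.
func isUserError(code int) bool {
    return code >= http.StatusBadRequest && code < http.StatusInternalServerError
}

func main() {
    fmt.Println(isUserError(http.StatusBadRequest))          // true  (400)
    fmt.Println(isUserError(http.StatusNotFound))            // true  (404)
    fmt.Println(isUserError(http.StatusInternalServerError)) // false (500)
}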
@@ -46,6 +46,7 @@ type defaultIntraVCScheduler struct {
     // currently we create a topologyAwareScheduler for each cluster view (each chain, each reservation).
     // we plan to support multiple cluster views in one scheduler, and to support schedule pods
     // across different cluster views.
+    // TODO: Support an affinity group can relax to be allocated across multiple chains.
     nonReservedSchedulers map[CellChain]*topologyAwareScheduler
     reservedSchedulers    map[api.ReservationId]*topologyAwareScheduler
 }
@@ -112,6 +112,8 @@ func (t *topologyAwareScheduler) Schedule(
     for podIndex := 0; podIndex < len(sortedPodGpuNumbers); podIndex++ {
         gpuNumber := sortedPodGpuNumbers[podIndex]
         n := selectedNodes[podIndex]
+        // TODO: Optimize findNodesForPods and findGpusInNode together to get a better placement,
+        // such as also aware intra node topology when findNodesForPods.
         selectedGpus, nodeAvailableGpus[n] = findGpusInNode(n, gpuNumber, priority, nodeAvailableGpus[n], t.levelGpuNum)
         if podPlacements[gpuNumber] == nil {
             podPlacements[gpuNumber] = []CellList{}
@@ -227,6 +229,12 @@ func (t *topologyAwareScheduler) updateClusterView(p CellPriority, suggestedNode
 func findNodesForPods(cv clusterView, gpuNums []int32, p CellPriority) []int32 {
     // sort the nodes according to gpu numbers in each node.
     // this is achieved through the Less method defined in type CellList.
+    // TODO: Ensure Opportunistic Pods also can always can find the solution, regardless of
+    // the iteration order.
+    // For example:
+    // 1. clusterView = 2GPU Node, 1GPU Node
+    // 2. gpuNums = 1GPU Pod, 2GPU Pod
+    // First 1GPU Pod may allocate to 2GPU Node, but the latter pod cannot be fitted anymore.
     sort.Stable(cv)
     currentNodeIndices := make([]int32, len(gpuNums)) // indices of the currently picked nodes
     podIndex := 0
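The TODO added above describes an iteration-order pitfall for Opportunistic pods. The standalone sketch below reproduces that exact example with a naive greedy assignment; the function and its types are hypothetical simplifications, not the scheduler's actual findNodesForPods:

package main

import "fmt"

// greedyAssign places each pod on the first node with enough free GPUs, in the
// given order, and reports pods that could not be placed at all.
func greedyAssign(nodeFreeGpus []int32, podGpuNums []int32) (placed map[int]int, unplaced []int) {
    free := append([]int32(nil), nodeFreeGpus...)
    placed = map[int]int{}
    for podIndex, want := range podGpuNums {
        assigned := false
        for nodeIndex, avail := range free {
            if avail >= want {
                placed[podIndex] = nodeIndex
                free[nodeIndex] -= want
                assigned = true
                break
            }
        }
        if !assigned {
            unplaced = append(unplaced, podIndex)
        }
    }
    return placed, unplaced
}

func main() {
    // clusterView: a 2-GPU node and a 1-GPU node; pods: a 1-GPU pod then a 2-GPU pod.
    placed, unplaced := greedyAssign([]int32{2, 1}, []int32{1, 2})
    // The 1-GPU pod lands on the 2-GPU node, so the 2-GPU pod can no longer fit.
    fmt.Println(placed, unplaced)
}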
@@ -320,7 +328,8 @@ func findGpusInNode(
         searchGpuIndex--
         if searchGpuIndex < 0 {
             if bestAffinity == highestLevel {
-                panic(fmt.Sprintf("failed to allocate %v GPUs in picked node %v", gpuNum, n.GetName()))
+                // Unreachable
+                panic(fmt.Sprintf("Assert Failure: failed to allocate %v GPUs in picked node %v", gpuNum, n.GetName()))
             }
             availableGpus = removePickedGpus(availableGpus, bestAffinityGpuIndices)
             return bestAffinityGpus, availableGpus
@@ -336,7 +345,9 @@ func getOptimalAffinity(gpuNum int32, levelGpuNum map[CellLevel]int32) CellLevel
             return l
         }
     }
-    panic(fmt.Sprintf("pod allocated a node but exceeds the capacity of the current chain"))
+
+    // Unreachable
+    panic(fmt.Sprintf("Assert Failure: pod allocated a node but exceeds the capacity of the current chain"))
 }
 
 // checkCurrentGpus checks if the currently picked GPUs have the lowest LCA. It also checks if the solution
@@ -36,7 +36,7 @@ import (
 
 // WebServer Callbacks with K8S Default Scheduler
 // Notes:
-// 1. Error should be passed by panic
+// 1. Error should be delivered by panic
 // 2. Should not assume previous succeeded operation also has been successfully
 //    executed by K8S Default Scheduler.
 type ExtenderHandlers struct {
@@ -53,7 +53,11 @@ type InspectHandlers struct {
 // SchedulerAlgorithm is used to make the pod schedule decision based on its whole
 // cluster scheduling view constructed from its Add/Update/Delete callbacks.
 // Notes:
-// 1. Error should be passed by panic
+// 1. Error should be delivered by panic and it will not change any state.
+//    For WebServer Callbacks, all Panics will be recovered as error responses,
+//    see HandleInformerPanic.
+//    For Informer Callbacks, only User Error Panics will be recovered as error
+//    logs, other Panics will crash the whole process, see HandleWebServerPanic.
 // 2. Should take all the input parameters as readonly and return pod schedule
 //    decision by PodScheduleResult.
 // 3. {Schedule, AddAllocatedPod, DeleteAllocatedPod} will never be executed
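To make the contract stated in the note above concrete (errors delivered by panic; user-error panics recovered, other panics allowed to crash), here is a minimal sketch. WebServerError and recoverInformerPanic are simplified stand-ins, not the scheduler's actual api.WebServerError or its HandleInformerPanic/HandleWebServerPanic helpers:

package main

import (
    "fmt"
    "log"
    "net/http"
)

// WebServerError is a reduced stand-in for the scheduler's api.WebServerError.
type WebServerError struct {
    Code    int
    Message string
}

func (e *WebServerError) Error() string { return e.Message }

// recoverInformerPanic sketches the informer-side behavior: a 4xx "User Error"
// panic is only logged, while any other panic is re-raised and crashes the process.
func recoverInformerPanic() {
    if r := recover(); r != nil {
        if err, ok := r.(*WebServerError); ok &&
            err.Code >= http.StatusBadRequest && err.Code < http.StatusInternalServerError {
            log.Printf("user error recovered as error log: %v", err)
            return
        }
        panic(r) // non-user error: let the process crash
    }
}

// schedule is a hypothetical algorithm call that reports errors by panicking.
func schedule(gpuType string) {
    if gpuType == "unknown" {
        panic(&WebServerError{Code: http.StatusBadRequest, Message: "invalid GPU type"})
    }
    fmt.Println("scheduled on", gpuType)
}

func main() {
    func() {
        defer recoverInformerPanic()
        schedule("unknown") // recovered and logged, process keeps running
    }()
    func() {
        defer recoverInformerPanic()
        schedule("K80") // succeeds
    }()
}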
@@ -170,8 +174,8 @@ func IsAllocated(state PodState) bool {
 
 // No need to use it recover scheduler waiting resource
 type PodWaitInfo struct {
-    // NodeName -> The reason why the node cannot be used for the Pod to allocate.
-    FailedNodeReasons map[string]string
+    // The reason why no preemptable or free resource to allocate the Pod now.
+    Reason string
 }
 
 // No need to use it recover scheduler preempting resource
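After this change PodWaitInfo carries a single aggregate Reason instead of the removed per-node FailedNodeReasons map. A reduced stand-in showing how a waiting result would carry it; the types here are trimmed copies and the example reason string is made up for illustration:

package main

import "fmt"

// PodWaitInfo mirrors the shape after this change: one aggregate Reason.
type PodWaitInfo struct {
    // The reason why no preemptable or free resource to allocate the Pod now.
    Reason string
}

// PodScheduleResult is a reduced stand-in for internal.PodScheduleResult.
type PodScheduleResult struct {
    PodWaitInfo *PodWaitInfo
}

func main() {
    // When no placement exists, the algorithm returns a waiting result whose
    // Reason is later surfaced to the K8S Default Scheduler in filterRoutine.
    result := PodScheduleResult{PodWaitInfo: &PodWaitInfo{Reason: "no free cell for the affinity group"}}
    fmt.Println("waiting:", result.PodWaitInfo.Reason)
}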
@@ -527,19 +527,19 @@ func (s *HivedScheduler) filterRoutine(args ei.ExtenderArgs) *ei.ExtenderFilterR
         }
 
         // Return FailedNodes to tell K8S Default Scheduler that preemption may help.
-        failedNodeReasons := map[string]string{}
+        failedNodes := map[string]string{}
         for _, victim := range result.PodPreemptInfo.VictimPods {
             node := victim.Spec.NodeName
-            if _, ok := failedNodeReasons[node]; !ok {
-                failedNodeReasons[node] = fmt.Sprintf(
+            if _, ok := failedNodes[node]; !ok {
+                failedNodes[node] = fmt.Sprintf(
                     "node(%v) is waiting for victim Pod(s) to be preempted: %v",
                     node, internal.Key(victim))
             } else {
-                failedNodeReasons[node] += ", " + internal.Key(victim)
+                failedNodes[node] += ", " + internal.Key(victim)
             }
         }
         return &ei.ExtenderFilterResult{
-            FailedNodes: failedNodeReasons,
+            FailedNodes: failedNodes,
         }
     } else {
         s.podScheduleStatuses[pod.UID] = &internal.PodScheduleStatus{
@@ -547,9 +547,13 @@ func (s *HivedScheduler) filterRoutine(args ei.ExtenderArgs) *ei.ExtenderFilterR
             PodState:          internal.PodWaiting,
             PodScheduleResult: &result,
         }
+
+        // Return Error to tell K8S Default Scheduler that preemption must not help.
         if result.PodWaitInfo != nil {
             return &ei.ExtenderFilterResult{
-                FailedNodes: result.PodWaitInfo.FailedNodeReasons,
+                Error: fmt.Sprintf(
+                    "Pod is waiting for preemptable or free resource to appear: %v",
+                    result.PodWaitInfo.Reason),
             }
         } else {
             return &ei.ExtenderFilterResult{}
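Taken together, the two filterRoutine branches above report a preempting pod through per-node FailedNodes (so the K8S Default Scheduler knows preemption may help) and a waiting pod through a single Error built from PodWaitInfo.Reason (preemption must not help). A compact sketch of those two response shapes, with a local stand-in for ei.ExtenderFilterResult; the victim keys and reason strings are made up for illustration:

package main

import "fmt"

// extenderFilterResult is a reduced stand-in for the K8S scheduler extender's
// filter response used in filterRoutine (ei.ExtenderFilterResult).
type extenderFilterResult struct {
    FailedNodes map[string]string // per-node reasons: preemption may help
    Error       string            // global error: preemption must not help
}

// waitingResult mirrors the waiting branch: the aggregate reason goes into Error.
func waitingResult(reason string) *extenderFilterResult {
    return &extenderFilterResult{
        Error: fmt.Sprintf("Pod is waiting for preemptable or free resource to appear: %v", reason),
    }
}

// preemptingResult mirrors the preempting branch: one accumulated reason per victim node.
func preemptingResult(victimPodKeysByNode map[string][]string) *extenderFilterResult {
    failedNodes := map[string]string{}
    for node, victims := range victimPodKeysByNode {
        for _, victim := range victims {
            if _, ok := failedNodes[node]; !ok {
                failedNodes[node] = fmt.Sprintf("node(%v) is waiting for victim Pod(s) to be preempted: %v", node, victim)
            } else {
                failedNodes[node] += ", " + victim
            }
        }
    }
    return &extenderFilterResult{FailedNodes: failedNodes}
}

func main() {
    fmt.Println(waitingResult("no free cell for the affinity group").Error)
    fmt.Println(preemptingResult(map[string][]string{"node1": {"default/pod-a", "default/pod-b"}}).FailedNodes)
}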
@@ -133,7 +133,7 @@ func (ln tcpKeepAliveListener) Accept() (net.Conn, error) {
     return tc, nil
 }
 
-// Error should be passed by panic
+// Error should be delivered by panic
 type servePathHandler func(w http.ResponseWriter, r *http.Request)
 
 func (ws *WebServer) serve(handler servePathHandler) servePathHandler {