Mirror of https://github.com/microsoft/pai.git
[Hived]: Expose and Refine Pod Waiting Reason (#3931)
Parent: 1dcb1198d9
Commit: f22085b159
@@ -71,6 +71,7 @@ func NewHivedAlgorithm(sConfig *api.Config) *HivedAlgorithm {
         reservedCells: reservedPc,
     }
     for vc := range nonReservedVcl {
+        // TODO: Support per-VC configurable intra VC scheduling algo.
         h.vcSchedulers[vc] = newDefaultIntraVCScheduler(nonReservedVcl[vc], reservedVcl[vc], gpuNums)
     }
     for chain, ccl := range h.fullCellList {
@@ -407,7 +408,8 @@ func (h *HivedAlgorithm) scheduleAffinityGroupForGpuType(
     if gpuType != "" {
         if chains := h.chains[gpuType]; chains == nil {
             panic(internal.NewBadRequestError(fmt.Sprintf(
-                "[%v]: pod requesting an invalid GPU type: %v", internal.Key(pod), gpuType)))
+                "[%v]: pod requesting GPU type %v which the whole cluster does not have",
+                internal.Key(pod), gpuType)))
         } else {
             vcHasType := false
             for _, chain := range chains {
@@ -512,7 +514,7 @@ func (h *HivedAlgorithm) scheduleGuaranteedAffinityGroup(
         c := buddyAlloc(h.getTmpFreeCellList(sr.chain), pac.GetLevel(), suggestedNodeSet)
         if c == nil {
             panic(fmt.Sprintf(
-                "Cannot find physical cell for a VC cell: %v", pac.GetName()))
+                "VC Safety Broken: Cannot find physical cell for a VC cell: %v", pac.GetName()))
         } else {
             preassignedPhysical = c
             // create binding (which is temporary and will be cleared after the scheduling,
@@ -746,7 +748,8 @@ func generatePodScheduleResult(
     if affinityGroupBindInfo == nil {
         return internal.PodScheduleResult{
             PodWaitInfo: &internal.PodWaitInfo{
-                FailedNodeReasons: map[string]string{},
+                // TODO: Enrich the Pod Waiting Reason.
+                Reason: "",
             },
         }
     }
@@ -795,6 +798,7 @@ func generatePodScheduleResult(
     } else {
         // we check suggested nodes after the preemption is done, otherwise the preemption victims
         // may cause the selected node to be excluded from the suggested nodes
+        // TODO: Keep selectedNode within suggestedNodeSet, so here should be Unreachable
         if !suggestedNodeSet.Contains(selectedNode) && newGroup {
             panic(fmt.Sprintf("[%v]: node %v picked by algorithm but not in K8S candidates",
                 internal.Key(pod), selectedNode))
@@ -840,8 +844,9 @@ func generateAffinityGroupBindInfo(
         for gpuIndex := 0; gpuIndex < len(podPhysicalPlacements[podIndex]); gpuIndex++ {
             pGpu := podPhysicalPlacements[podIndex][gpuIndex]
             if pGpu == nil {
-                klog.Warningf("Resources previously allocated has been invalid; pod should wait")
-                return nil, "", nil
+                // TODO: Should insist binding and continue to force bind, then the Pod may
+                // run or retry, instead of stuck in PodBinding state forever here.
+                panic("Resources previously allocated has been invalid; pod should wait")
             }
             nodes, gpuIndices := pGpu.(*PhysicalCell).GetPhysicalPlacement()
             // here each cell (i.e., pGpu) is only one GPU, hence we takes the first element
@@ -932,7 +937,7 @@ func mapNonPreassignedCellToPhysical(c *VirtualCell, suggestedNodeSet common.Set
     parentPhysical := mapNonPreassignedCellToPhysical(c.GetParent().(*VirtualCell), suggestedNodeSet)
     pc := getFewestOpporPhysicalCell(parentPhysical.GetChildren(), suggestedNodeSet)
     if pc == nil || pc.GetPriority() > opportunisticPriority {
-        panic(fmt.Sprintf("Cannot find physical cell for %v", c.GetName()))
+        panic(fmt.Sprintf("VC Safety Broken: Cannot find physical cell for %v", c.GetName()))
     }
     c.SetPreBoundPhysicalCell(pc)
     pc.SetPreBoundVirtualCell(c)
@@ -188,7 +188,7 @@ var pss = map[types.UID]api.PodSchedulingSpec{
         GpuType:       "",
         GpuNumber:     5,
         AffinityGroup: group9,
-    }, "pod10": { // use a GPU type that the VC does not have; should panic BadRequest
+    }, "pod10": { // use a GPU type that the VC does not have; should User Error Panic
         VirtualCluster:       "VC2",
         Priority:             1,
         LazyPreemptionEnable: true,
@@ -445,13 +445,15 @@ func testCasesThatShouldSucceed(t *testing.T, h *HivedAlgorithm) {
 func testOneCaseThatShouldFail(t *testing.T, h *HivedAlgorithm, podNames []string) {
     defer func() {
         if r := recover(); r != nil {
-            if err, ok := r.(*api.WebServerError); ok && err.Code == http.StatusBadRequest {
-                t.Logf("Got BadRequest as expected: %v", err)
+            if err, ok := r.(*api.WebServerError); ok &&
+                err.Code >= http.StatusBadRequest &&
+                err.Code < http.StatusInternalServerError {
+                t.Logf("Got User Error Panic as expected: %v", err)
             } else {
-                t.Errorf("Expected BadRequest error, but got %v", r)
+                t.Errorf("Expected User Error Panic, but got %v", r)
             }
         } else {
-            t.Errorf("Expected BadRequest error, but got none")
+            t.Errorf("Expected User Error Panic, but got none")
         }
     }()
     var psr internal.PodScheduleResult
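The rewritten assertion above treats any api.WebServerError whose HTTP code falls in the 4xx range as a "User Error Panic". As a side note, here is a minimal standalone sketch of that range check, using only net/http status constants; it is an illustration, not code taken from the repository:

package main

import (
    "fmt"
    "net/http"
)

// isUserError mirrors the range check the test now uses: any 4xx status code
// (>= 400 and < 500) counts as a user error; 5xx and others do not.
func isUserError(code int) bool {
    return code >= http.StatusBadRequest && code < http.StatusInternalServerError
}

func main() {
    fmt.Println(isUserError(http.StatusBadRequest))          // true  (400)
    fmt.Println(isUserError(http.StatusNotFound))            // true  (404)
    fmt.Println(isUserError(http.StatusInternalServerError)) // false (500)
}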
@@ -46,6 +46,7 @@ type defaultIntraVCScheduler struct {
     // currently we create a topologyAwareScheduler for each cluster view (each chain, each reservation).
     // we plan to support multiple cluster views in one scheduler, and to support schedule pods
     // across different cluster views.
+    // TODO: Support an affinity group can relax to be allocated across multiple chains.
     nonReservedSchedulers map[CellChain]*topologyAwareScheduler
     reservedSchedulers    map[api.ReservationId]*topologyAwareScheduler
 }
@@ -112,6 +112,8 @@ func (t *topologyAwareScheduler) Schedule(
     for podIndex := 0; podIndex < len(sortedPodGpuNumbers); podIndex++ {
         gpuNumber := sortedPodGpuNumbers[podIndex]
         n := selectedNodes[podIndex]
+        // TODO: Optimize findNodesForPods and findGpusInNode together to get a better placement,
+        // such as also aware intra node topology when findNodesForPods.
         selectedGpus, nodeAvailableGpus[n] = findGpusInNode(n, gpuNumber, priority, nodeAvailableGpus[n], t.levelGpuNum)
         if podPlacements[gpuNumber] == nil {
             podPlacements[gpuNumber] = []CellList{}
@@ -227,6 +229,12 @@ func (t *topologyAwareScheduler) updateClusterView(p CellPriority, suggestedNode
 func findNodesForPods(cv clusterView, gpuNums []int32, p CellPriority) []int32 {
     // sort the nodes according to gpu numbers in each node.
     // this is achieved through the Less method defined in type CellList.
+    // TODO: Ensure Opportunistic Pods also can always can find the solution, regardless of
+    // the iteration order.
+    // For example:
+    // 1. clusterView = 2GPU Node, 1GPU Node
+    // 2. gpuNums = 1GPU Pod, 2GPU Pod
+    // First 1GPU Pod may allocate to 2GPU Node, but the latter pod cannot be fitted anymore.
     sort.Stable(cv)
     currentNodeIndices := make([]int32, len(gpuNums)) // indices of the currently picked nodes
     podIndex := 0
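The TODO added above describes an iteration-order pitfall for Opportunistic pods. The standalone sketch below reproduces that exact example with a naive greedy assignment; the function and its types are hypothetical simplifications, not the scheduler's actual findNodesForPods:

package main

import "fmt"

// greedyAssign places each pod on the first node with enough free GPUs, in the
// given order, and reports pods that could not be placed at all.
func greedyAssign(nodeFreeGpus []int32, podGpuNums []int32) (placed map[int]int, unplaced []int) {
    free := append([]int32(nil), nodeFreeGpus...)
    placed = map[int]int{}
    for podIndex, want := range podGpuNums {
        assigned := false
        for nodeIndex, avail := range free {
            if avail >= want {
                placed[podIndex] = nodeIndex
                free[nodeIndex] -= want
                assigned = true
                break
            }
        }
        if !assigned {
            unplaced = append(unplaced, podIndex)
        }
    }
    return placed, unplaced
}

func main() {
    // clusterView: a 2-GPU node and a 1-GPU node; pods: a 1-GPU pod then a 2-GPU pod.
    placed, unplaced := greedyAssign([]int32{2, 1}, []int32{1, 2})
    // The 1-GPU pod lands on the 2-GPU node, so the 2-GPU pod can no longer fit.
    fmt.Println(placed, unplaced)
}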
@@ -320,7 +328,8 @@ func findGpusInNode(
         searchGpuIndex--
         if searchGpuIndex < 0 {
             if bestAffinity == highestLevel {
-                panic(fmt.Sprintf("failed to allocate %v GPUs in picked node %v", gpuNum, n.GetName()))
+                // Unreachable
+                panic(fmt.Sprintf("Assert Failure: failed to allocate %v GPUs in picked node %v", gpuNum, n.GetName()))
             }
             availableGpus = removePickedGpus(availableGpus, bestAffinityGpuIndices)
             return bestAffinityGpus, availableGpus
@@ -336,7 +345,9 @@ func getOptimalAffinity(gpuNum int32, levelGpuNum map[CellLevel]int32) CellLevel
             return l
         }
     }
-    panic(fmt.Sprintf("pod allocated a node but exceeds the capacity of the current chain"))
+
+    // Unreachable
+    panic(fmt.Sprintf("Assert Failure: pod allocated a node but exceeds the capacity of the current chain"))
 }
 
 // checkCurrentGpus checks if the currently picked GPUs have the lowest LCA. It also checks if the solution
@@ -36,7 +36,7 @@ import (
 
 // WebServer Callbacks with K8S Default Scheduler
 // Notes:
-// 1. Error should be passed by panic
+// 1. Error should be delivered by panic
 // 2. Should not assume previous succeeded operation also has been successfully
 //    executed by K8S Default Scheduler.
 type ExtenderHandlers struct {
@@ -53,7 +53,11 @@ type InspectHandlers struct {
 // SchedulerAlgorithm is used to make the pod schedule decision based on its whole
 // cluster scheduling view constructed from its Add/Update/Delete callbacks.
 // Notes:
-// 1. Error should be passed by panic
+// 1. Error should be delivered by panic and it will not change any state.
+//    For WebServer Callbacks, all Panics will be recovered as error responses,
+//    see HandleInformerPanic.
+//    For Informer Callbacks, only User Error Panics will be recovered as error
+//    logs, other Panics will crash the whole process, see HandleWebServerPanic.
 // 2. Should take all the input parameters as readonly and return pod schedule
 //    decision by PodScheduleResult.
 // 3. {Schedule, AddAllocatedPod, DeleteAllocatedPod} will never be executed
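To make the contract stated in the note above concrete (errors delivered by panic; user-error panics recovered, other panics allowed to crash), here is a minimal sketch. WebServerError and recoverInformerPanic are simplified stand-ins, not the scheduler's actual api.WebServerError or its HandleInformerPanic/HandleWebServerPanic helpers:

package main

import (
    "fmt"
    "log"
    "net/http"
)

// WebServerError is a reduced stand-in for the scheduler's api.WebServerError.
type WebServerError struct {
    Code    int
    Message string
}

func (e *WebServerError) Error() string { return e.Message }

// recoverInformerPanic sketches the informer-side behavior: a 4xx "User Error"
// panic is only logged, while any other panic is re-raised and crashes the process.
func recoverInformerPanic() {
    if r := recover(); r != nil {
        if err, ok := r.(*WebServerError); ok &&
            err.Code >= http.StatusBadRequest && err.Code < http.StatusInternalServerError {
            log.Printf("user error recovered as error log: %v", err)
            return
        }
        panic(r) // non-user error: let the process crash
    }
}

// schedule is a hypothetical algorithm call that reports errors by panicking.
func schedule(gpuType string) {
    if gpuType == "unknown" {
        panic(&WebServerError{Code: http.StatusBadRequest, Message: "invalid GPU type"})
    }
    fmt.Println("scheduled on", gpuType)
}

func main() {
    func() {
        defer recoverInformerPanic()
        schedule("unknown") // recovered and logged, process keeps running
    }()
    func() {
        defer recoverInformerPanic()
        schedule("K80") // succeeds
    }()
}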
@@ -170,8 +174,8 @@ func IsAllocated(state PodState) bool {
 
 // No need to use it recover scheduler waiting resource
 type PodWaitInfo struct {
-    // NodeName -> The reason why the node cannot be used for the Pod to allocate.
-    FailedNodeReasons map[string]string
+    // The reason why no preemptable or free resource to allocate the Pod now.
+    Reason string
 }
 
 // No need to use it recover scheduler preempting resource
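After this change PodWaitInfo carries a single aggregate Reason instead of the removed per-node FailedNodeReasons map. A reduced stand-in showing how a waiting result would carry it; the types here are trimmed copies and the example reason string is made up for illustration:

package main

import "fmt"

// PodWaitInfo mirrors the shape after this change: one aggregate Reason.
type PodWaitInfo struct {
    // The reason why no preemptable or free resource to allocate the Pod now.
    Reason string
}

// PodScheduleResult is a reduced stand-in for internal.PodScheduleResult.
type PodScheduleResult struct {
    PodWaitInfo *PodWaitInfo
}

func main() {
    // When no placement exists, the algorithm returns a waiting result whose
    // Reason is later surfaced to the K8S Default Scheduler in filterRoutine.
    result := PodScheduleResult{PodWaitInfo: &PodWaitInfo{Reason: "no free cell for the affinity group"}}
    fmt.Println("waiting:", result.PodWaitInfo.Reason)
}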
@@ -527,19 +527,19 @@ func (s *HivedScheduler) filterRoutine(args ei.ExtenderArgs) *ei.ExtenderFilterR
         }
 
         // Return FailedNodes to tell K8S Default Scheduler that preemption may help.
-        failedNodeReasons := map[string]string{}
+        failedNodes := map[string]string{}
         for _, victim := range result.PodPreemptInfo.VictimPods {
             node := victim.Spec.NodeName
-            if _, ok := failedNodeReasons[node]; !ok {
-                failedNodeReasons[node] = fmt.Sprintf(
+            if _, ok := failedNodes[node]; !ok {
+                failedNodes[node] = fmt.Sprintf(
                     "node(%v) is waiting for victim Pod(s) to be preempted: %v",
                     node, internal.Key(victim))
             } else {
-                failedNodeReasons[node] += ", " + internal.Key(victim)
+                failedNodes[node] += ", " + internal.Key(victim)
             }
         }
         return &ei.ExtenderFilterResult{
-            FailedNodes: failedNodeReasons,
+            FailedNodes: failedNodes,
         }
     } else {
         s.podScheduleStatuses[pod.UID] = &internal.PodScheduleStatus{
@@ -547,9 +547,13 @@ func (s *HivedScheduler) filterRoutine(args ei.ExtenderArgs) *ei.ExtenderFilterR
             PodState:          internal.PodWaiting,
             PodScheduleResult: &result,
         }
+
+        // Return Error to tell K8S Default Scheduler that preemption must not help.
         if result.PodWaitInfo != nil {
             return &ei.ExtenderFilterResult{
-                FailedNodes: result.PodWaitInfo.FailedNodeReasons,
+                Error: fmt.Sprintf(
+                    "Pod is waiting for preemptable or free resource to appear: %v",
+                    result.PodWaitInfo.Reason),
             }
         } else {
             return &ei.ExtenderFilterResult{}
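Taken together, the two filterRoutine branches above report a preempting pod through per-node FailedNodes (so the K8S Default Scheduler knows preemption may help) and a waiting pod through a single Error built from PodWaitInfo.Reason (preemption must not help). A compact sketch of those two response shapes, with a local stand-in for ei.ExtenderFilterResult; the victim keys and reason strings are made up for illustration:

package main

import "fmt"

// extenderFilterResult is a reduced stand-in for the K8S scheduler extender's
// filter response used in filterRoutine (ei.ExtenderFilterResult).
type extenderFilterResult struct {
    FailedNodes map[string]string // per-node reasons: preemption may help
    Error       string            // global error: preemption must not help
}

// waitingResult mirrors the waiting branch: the aggregate reason goes into Error.
func waitingResult(reason string) *extenderFilterResult {
    return &extenderFilterResult{
        Error: fmt.Sprintf("Pod is waiting for preemptable or free resource to appear: %v", reason),
    }
}

// preemptingResult mirrors the preempting branch: one accumulated reason per victim node.
func preemptingResult(victimPodKeysByNode map[string][]string) *extenderFilterResult {
    failedNodes := map[string]string{}
    for node, victims := range victimPodKeysByNode {
        for _, victim := range victims {
            if _, ok := failedNodes[node]; !ok {
                failedNodes[node] = fmt.Sprintf("node(%v) is waiting for victim Pod(s) to be preempted: %v", node, victim)
            } else {
                failedNodes[node] += ", " + victim
            }
        }
    }
    return &extenderFilterResult{FailedNodes: failedNodes}
}

func main() {
    fmt.Println(waitingResult("no free cell for the affinity group").Error)
    fmt.Println(preemptingResult(map[string][]string{"node1": {"default/pod-a", "default/pod-b"}}).FailedNodes)
}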
@@ -133,7 +133,7 @@ func (ln tcpKeepAliveListener) Accept() (net.Conn, error) {
     return tc, nil
 }
 
-// Error should be passed by panic
+// Error should be delivered by panic
 type servePathHandler func(w http.ResponseWriter, r *http.Request)
 
 func (ws *WebServer) serve(handler servePathHandler) servePathHandler {