Mirror of https://github.com/microsoft/pai.git
[HiveD] VC aware of suggested nodes (#4268)
* vc aware of suggested nodes
* vc aware of suggested nodes
* suggested
* add UT
* fix comments
* minor fix
* revert vc aware suggested nodes
* fix UT
This commit is contained in:
Parent: e633511722
Commit: 283668e504
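Beyond the mechanical rename of suggestedNodeSet to suggestedNodes, the substantive change in this commit is the fallback order in getFewestOpporPhysicalCell: when no free cell lies entirely within the K8s suggested nodes, the scheduler now preempts a randomly chosen cell that is running opportunistic pods, instead of always returning the globally least-loaded cell (which may sit on a bad node). The sketch below is a minimal, runnable rendering of that selection policy only; CellInfo, its fields, and pickCell are hypothetical stand-ins for illustration, not HiveD's actual types or API.

package main

import (
	"fmt"
	"math/rand"
)

// CellInfo is a hypothetical stand-in for HiveD's *PhysicalCell, reduced to
// the two properties the selection policy cares about.
type CellInfo struct {
	name        string
	opporPods   int32 // opportunistic workload currently on the cell
	inSuggested bool  // true if every node of the cell is K8s-suggested
}

// pickCell mirrors the fallback order the commit introduces:
// 1) the fewest-opportunistic cell among cells fully inside suggested nodes;
// 2) otherwise a random preemptible cell (opporPods > 0), so that repeated
//    attempts do not keep preempting on the same, possibly bad, node;
// 3) otherwise the globally fewest-opportunistic cell.
func pickCell(cells []CellInfo) *CellInfo {
	var fewest, fewestSuggested *CellInfo
	var preemptible []*CellInfo
	for i := range cells {
		c := &cells[i]
		if fewest == nil || c.opporPods < fewest.opporPods {
			fewest = c
		}
		if c.inSuggested && (fewestSuggested == nil || c.opporPods < fewestSuggested.opporPods) {
			fewestSuggested = c
		}
		if c.opporPods > 0 {
			preemptible = append(preemptible, c)
		}
	}
	if fewestSuggested != nil {
		return fewestSuggested
	}
	if len(preemptible) > 0 {
		return preemptible[rand.Intn(len(preemptible))]
	}
	return fewest
}

func main() {
	cells := []CellInfo{
		{"cell-a", 2, false},
		{"cell-b", 0, false}, // least loaded, but not within suggested nodes
		{"cell-c", 3, false},
	}
	// No cell is fully inside the suggested nodes, so a random preemptible
	// cell (cell-a or cell-c) wins, never the idle-but-unsuggested cell-b.
	fmt.Println(pickCell(cells).name)
}

Randomizing over the preemptible cells spreads retries across nodes, so a single bad node cannot absorb every preemption attempt; that is the rationale the new comment block in the diff below spells out.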
@@ -347,7 +347,7 @@ func (h *HivedAlgorithm) initReservations() {
 func (h *HivedAlgorithm) scheduleNewAffinityGroup(
     pod *core.Pod,
     s *api.PodSchedulingSpec,
-    suggestedNodeSet common.Set) (map[int32][]CellList, map[int32][]CellList) {
+    suggestedNodes common.Set) (map[int32][]CellList, map[int32][]CellList) {

     var (
         physicalPlacement map[int32][]CellList
@@ -371,9 +371,9 @@ func (h *HivedAlgorithm) scheduleNewAffinityGroup(
     if sr.reservationId != "" {
         klog.Infof("Use reservation %v", s.ReservationId)
         sr.chain = h.reservedCells[sr.vc][sr.reservationId].GetChain()
-        physicalPlacement, virtualPlacement = h.processSchedulingRequest(sr, suggestedNodeSet)
+        physicalPlacement, virtualPlacement = h.processSchedulingRequest(sr, suggestedNodes)
     } else {
-        physicalPlacement, virtualPlacement = h.scheduleAffinityGroupForGpuType(sr, s.GpuType, pod, suggestedNodeSet)
+        physicalPlacement, virtualPlacement = h.scheduleAffinityGroupForGpuType(sr, s.GpuType, pod, suggestedNodes)
     }
     if physicalPlacement != nil {
         klog.Infof("Succeeded in scheduling group %v", s.AffinityGroup.Name)
@@ -390,7 +390,7 @@ func (h *HivedAlgorithm) scheduleAffinityGroupForGpuType(
     sr schedulingRequest,
     gpuType string,
     pod *core.Pod,
-    suggestedNodeSet common.Set) (map[int32][]CellList, map[int32][]CellList) {
+    suggestedNodes common.Set) (map[int32][]CellList, map[int32][]CellList) {

     if gpuType != "" {
         if chains := h.chains[gpuType]; chains == nil {
@@ -404,7 +404,7 @@ func (h *HivedAlgorithm) scheduleAffinityGroupForGpuType(
                 vcHasType = true
             }
             sr.chain = chain
-            if physicalPlacement, virtualPlacement := h.processSchedulingRequest(sr, suggestedNodeSet); physicalPlacement != nil {
+            if physicalPlacement, virtualPlacement := h.processSchedulingRequest(sr, suggestedNodes); physicalPlacement != nil {
                 return physicalPlacement, virtualPlacement
             }
         }
@@ -418,7 +418,7 @@ func (h *HivedAlgorithm) scheduleAffinityGroupForGpuType(
     for _, chains := range h.chains {
         for _, chain := range chains {
             sr.chain = chain
-            if physicalPlacement, virtualPlacement := h.processSchedulingRequest(sr, suggestedNodeSet); physicalPlacement != nil {
+            if physicalPlacement, virtualPlacement := h.processSchedulingRequest(sr, suggestedNodes); physicalPlacement != nil {
                 return physicalPlacement, virtualPlacement
             }
         }
@@ -448,12 +448,12 @@ func (h *HivedAlgorithm) validateSchedulingRequest(sr schedulingRequest, pod *co
 // or the opportunistic scheduler according to its priority.
 func (h *HivedAlgorithm) processSchedulingRequest(
     sr schedulingRequest,
-    suggestedNodeSet common.Set) (map[int32][]CellList, map[int32][]CellList) {
+    suggestedNodes common.Set) (map[int32][]CellList, map[int32][]CellList) {

     if sr.priority >= minGuaranteedPriority {
-        return h.scheduleGuaranteedAffinityGroup(sr, suggestedNodeSet)
+        return h.scheduleGuaranteedAffinityGroup(sr, suggestedNodes)
     } else {
-        return h.scheduleOpportunisticAffinityGroup(sr, suggestedNodeSet), nil
+        return h.scheduleOpportunisticAffinityGroup(sr, suggestedNodes), nil
     }
 }

@@ -461,7 +461,7 @@ func (h *HivedAlgorithm) processSchedulingRequest(
 // then maps the placement in VC to the physical cluster.
 func (h *HivedAlgorithm) scheduleGuaranteedAffinityGroup(
     sr schedulingRequest,
-    suggestedNodeSet common.Set) (map[int32][]CellList, map[int32][]CellList) {
+    suggestedNodes common.Set) (map[int32][]CellList, map[int32][]CellList) {

     // schedule in VC
     virtualPlacement := h.vcSchedulers[sr.vc].schedule(sr)
@@ -498,7 +498,7 @@ func (h *HivedAlgorithm) scheduleGuaranteedAffinityGroup(
             if preassignedPhysical == nil {
                 // allocate a new physical cell to the preassigned cell. input a copy of the free cell list
                 // because during the scheduling we should not make in-place change to the data structures
-                c := buddyAlloc(h.getTmpFreeCellList(sr.chain), pac.GetLevel(), suggestedNodeSet)
+                c := buddyAlloc(h.getTmpFreeCellList(sr.chain), pac.GetLevel(), suggestedNodes)
                 if c == nil {
                     panic(fmt.Sprintf(
                         "VC Safety Broken: Cannot find physical cell for a VC cell: %v", pac.GetName()))
@@ -510,7 +510,7 @@ func (h *HivedAlgorithm) scheduleGuaranteedAffinityGroup(
                 preassignedPhysical.SetPreBoundVirtualCell(pac)
             }
         }
-        physicalPlacement[podGpuNum][i][j] = mapNonPreassignedCellToPhysical(vGpu, suggestedNodeSet)
+        physicalPlacement[podGpuNum][i][j] = mapNonPreassignedCellToPhysical(vGpu, suggestedNodes)
         }
     }
 }
@@ -521,10 +521,10 @@ func (h *HivedAlgorithm) scheduleGuaranteedAffinityGroup(
 // scheduleOpportunisticAffinityGroup calls the opportunistic pod scheduler to schedule an affinity group.
 func (h *HivedAlgorithm) scheduleOpportunisticAffinityGroup(
     sr schedulingRequest,
-    suggestedNodeSet common.Set) map[int32][]CellList {
+    suggestedNodes common.Set) map[int32][]CellList {

     placement := h.opportunisticSchedulers[sr.chain].Schedule(
-        sr.affinityGroupPodNums, opportunisticPriority, suggestedNodeSet)
+        sr.affinityGroupPodNums, opportunisticPriority, suggestedNodes)
     if placement == nil {
         klog.Infof("Insufficient capacity in PC for scheduling request: GPU numbers %v, priority %v",
             sr.affinityGroupPodNums, sr.priority)
@@ -865,7 +865,7 @@ func generatePodScheduleResult(
     currentPodIndex int32,
     group *AlgoAffinityGroup,
     groupName string,
-    suggestedNodeSet common.Set,
+    suggestedNodes common.Set,
     vc api.VirtualClusterName,
     pod *core.Pod) internal.PodScheduleResult {

@@ -889,7 +889,7 @@ func generatePodScheduleResult(
     // we find the selected node after the preemption is done, otherwise the preemption victims
     // may cause the selected node to be excluded from the suggested nodes
     affinityGroupBindInfo, nodesNotInSuggested, selectedNode, selectedGpuIndices, cellChain := generateAffinityGroupBindInfo(
-        groupPhysicalPlacement, groupVirtualPlacement, cellLevelToType, currentGpuNum, currentPodIndex, group, groupName, suggestedNodeSet)
+        groupPhysicalPlacement, groupVirtualPlacement, cellLevelToType, currentGpuNum, currentPodIndex, group, groupName, suggestedNodes)
     var waitReason string
     if affinityGroupBindInfo == nil {
         waitReason = "insufficient capacity in physical cluster"
@@ -936,7 +936,7 @@ func generateAffinityGroupBindInfo(
     currentPodIndex int32,
     group *AlgoAffinityGroup,
     groupName string,
-    suggestedNodeSet common.Set) (
+    suggestedNodes common.Set) (
     affinityGroupBindInfo []api.AffinityGroupMemberBindInfo,
     nodesNotInSuggested []string,
     selectedNode string,
@@ -973,7 +973,7 @@ func generateAffinityGroupBindInfo(
             // in its "nodes" and "gpuIndices" as the node and GPU address
             if mbi.PodPlacements[podIndex].PhysicalNode == "" {
                 mbi.PodPlacements[podIndex].PhysicalNode = nodes[0]
-                if !suggestedNodeSet.Contains(nodes[0]) {
+                if !suggestedNodes.Contains(nodes[0]) {
                     nodesNotInSuggested = append(nodesNotInSuggested, nodes[0])
                 }
             }
@@ -1075,9 +1075,9 @@ func retrieveMissingPodPlacement(group *AlgoAffinityGroup, gpuNum int32, podInde
 // It splits a higher-level cell when there is no free cell at the current level.
 // As the input cell list is a copy of the real free list and hence is one-off,
 // we won't remove a returned cell from it.
-func buddyAlloc(freeList ChainCellList, level CellLevel, suggestedNodeSet common.Set) *PhysicalCell {
+func buddyAlloc(freeList ChainCellList, level CellLevel, suggestedNodes common.Set) *PhysicalCell {
     if len(freeList[level]) == 0 && level < CellLevel(len(freeList)) {
-        higherCell := buddyAlloc(freeList, level+1, suggestedNodeSet)
+        higherCell := buddyAlloc(freeList, level+1, suggestedNodes)
         if higherCell != nil {
             freeList[level] = append(freeList[level], higherCell.GetChildren()...)
         }
@@ -1085,54 +1085,65 @@ func buddyAlloc(freeList ChainCellList, level CellLevel, suggestedNodeSet common
     if len(freeList[level]) == 0 {
         return nil
     }
-    return getFewestOpporPhysicalCell(freeList[level], suggestedNodeSet)
+    return getFewestOpporPhysicalCell(freeList[level], suggestedNodes)
 }

 // getFewestOpporPhysicalCell selects a physical cell with the minimum number of opportunistic pods from a cell list.
-func getFewestOpporPhysicalCell(cl CellList, suggestedNodeSet common.Set) *PhysicalCell {
+func getFewestOpporPhysicalCell(cl CellList, suggestedNodes common.Set) *PhysicalCell {
     fewestOpporNum := int32(math.MaxInt32)
     fewestOpporNumSuggested := int32(math.MaxInt32)
     var fewestOpporCell *PhysicalCell
-    var fewestOpporSuggested *PhysicalCell
+    var fewestOpporCellSuggested *PhysicalCell
+    var preemptibleCells []*PhysicalCell
     for _, c := range cl {
         if pc := c.(*PhysicalCell); pc.GetVirtualCell() == nil && pc.GetPreBoundVirtualCell() == nil {
-            numOppor := pc.GetUsedGpuNumAtPriorities()[opportunisticPriority]
-            if numOppor < fewestOpporNum {
-                fewestOpporNum = numOppor
+            opporNum := pc.GetUsedGpuNumAtPriorities()[opportunisticPriority]
+            if opporNum < fewestOpporNum {
+                fewestOpporNum = opporNum
                 fewestOpporCell = pc
             }
             allNodesInSuggested := true
             nodes, _ := pc.GetPhysicalPlacement()
             for _, n := range nodes {
-                if !suggestedNodeSet.Contains(n) {
+                if !suggestedNodes.Contains(n) {
                     allNodesInSuggested = false
                     break
                 }
             }
-            if allNodesInSuggested && numOppor < fewestOpporNumSuggested {
-                fewestOpporNumSuggested = numOppor
-                fewestOpporSuggested = pc
+            if allNodesInSuggested && opporNum < fewestOpporNumSuggested {
+                fewestOpporNumSuggested = opporNum
+                fewestOpporCellSuggested = pc
             }
+            if opporNum > 0 {
+                preemptibleCells = append(preemptibleCells, pc)
+            }
         }
     }
-    if fewestOpporSuggested == nil {
-        return fewestOpporCell
-    } else {
-        return fewestOpporSuggested
+    if fewestOpporCellSuggested != nil {
+        return fewestOpporCellSuggested
     }
+    // If we cannot find a cell within the suggested nodes, we will try to preempt some pods instead of
+    // directly returning the fewestOpporCell (because this cell could be a bad node; we should not return it).
+    // Also, we will choose a random cell, to avoid always returning the same cell (similar to the above:
+    // if we always return the same cell, it might be a bad node, and preempting pods on a bad node won't bring
+    // it back to the suggested nodes).
+    if len(preemptibleCells) > 0 {
+        return preemptibleCells[rand.Int31n(int32(len(preemptibleCells)))]
+    }
+    return fewestOpporCell
 }

 // mapNonPreassignedCellToPhysical maps a virtual cell (possibly inside a preassigned one) to the
 // physical cell of the preassigned cell. This operation keeps the inner-cell topology equivalent,
 // by recursively binding the cells inside the preassigned one.
-func mapNonPreassignedCellToPhysical(c *VirtualCell, suggestedNodeSet common.Set) *PhysicalCell {
+func mapNonPreassignedCellToPhysical(c *VirtualCell, suggestedNodes common.Set) *PhysicalCell {
     if c.GetPhysicalCell() != nil {
         return c.GetPhysicalCell()
     } else if c.GetPreBoundPhysicalCell() != nil {
         return c.GetPreBoundPhysicalCell()
     } else {
-        parentPhysical := mapNonPreassignedCellToPhysical(c.GetParent().(*VirtualCell), suggestedNodeSet)
-        pc := getFewestOpporPhysicalCell(parentPhysical.GetChildren(), suggestedNodeSet)
+        parentPhysical := mapNonPreassignedCellToPhysical(c.GetParent().(*VirtualCell), suggestedNodes)
+        pc := getFewestOpporPhysicalCell(parentPhysical.GetChildren(), suggestedNodes)
         if pc == nil || pc.GetPriority() > opportunisticPriority {
             panic(fmt.Sprintf("VC Safety Broken: Cannot find physical cell for %v", c.GetName()))
         }

@@ -39,7 +39,7 @@ var allPods = map[string]*core.Pod{}

 func init() {
     common.InitAll()
-    for i := 1; i <= 25; i++ {
+    for i := 1; i <= len(pss); i++ {
         podName := fmt.Sprintf("pod%v", i)
         allPods[podName] = &core.Pod{
             ObjectMeta: meta.ObjectMeta{
@@ -62,7 +62,7 @@ func initNodes(h *HivedAlgorithm) {
     }
 }

-var group1, group2, group3, group4, group5, group6, group7, group8, group9, group10, group11, group12, group13, group14, group15, group16, group17 = &api.AffinityGroupSpec{
+var group1, group2, group3, group4, group5, group6, group7, group8, group9, group10, group11, group12, group13, group14, group15, group16, group17, group18 = &api.AffinityGroupSpec{
     Name:    "group1",
     Members: []api.AffinityGroupMemberSpec{{PodNumber: 1, GpuNumber: 1}},
 }, &api.AffinityGroupSpec{
@@ -113,6 +113,9 @@ var group1, group2, group3, group4, group5, group6, group7, group8, group9, grou
 }, &api.AffinityGroupSpec{
     Name:    "group17",
     Members: []api.AffinityGroupMemberSpec{{PodNumber: 1, GpuNumber: 2}},
+}, &api.AffinityGroupSpec{
+    Name:    "group18",
+    Members: []api.AffinityGroupMemberSpec{{PodNumber: 2, GpuNumber: 16}},
 }

 var pss = map[types.UID]api.PodSchedulingSpec{
@@ -324,6 +327,14 @@ var pss = map[types.UID]api.PodSchedulingSpec{
         GpuType:       "CT1",
         GpuNumber:     2,
         AffinityGroup: group17,
}, "pod27": { // will be rejected because one of the pod in this group is allocated a non-suggested node
+        VirtualCluster:       "VC1",
+        Priority:             1,
+        LazyPreemptionEnable: true,
+        ReservationId:        "VC1-YQW-DGX2",
+        GpuType:              "DGX2-V100",
+        GpuNumber:            16,
+        AffinityGroup:        group18,
     },
 }

@@ -492,13 +503,10 @@ func testSuggestedNodes(t *testing.T, h *HivedAlgorithm) {
             nodes = append(nodes, node)
         }
     }
-    pod := allPods["pod5"]
+    pod := allPods["pod27"]
     pod.Annotations[api.AnnotationKeyPodSchedulingSpec] = common.ToYaml(pss[pod.UID])
     psr := h.Schedule(pod, nodes)
-    if psr.PodBindInfo != nil {
-        t.Errorf("[%v]: wrong pod scheduling result: expected empty, but got %v:%v",
-            internal.Key(pod), psr.PodBindInfo.Node, psr.PodBindInfo.GpuIsolation)
-    }
+    compareSchedulingResult(t, pod, psr)
 }

 func testReconfiguration(t *testing.T, configFilePath string) {

@@ -76,7 +76,7 @@ func ancestorNoHigherThanNode(c Cell) Cell {
 func (t *topologyAwareScheduler) Schedule(
     podGpuNumbers map[int32]int32,
     p CellPriority,
-    suggestedNodeSet common.Set) map[int32][]CellList {
+    suggestedNodes common.Set) map[int32][]CellList {

     // GPU numbers of the pods to schedule
     var sortedPodGpuNumbers []int32
@@ -89,13 +89,13 @@ func (t *topologyAwareScheduler) Schedule(

     // disable preemption first (reduce preemption)
     priority := opportunisticPriority
-    t.updateClusterView(priority, suggestedNodeSet)
+    t.updateClusterView(priority, suggestedNodes)
     // try to fit the pods to a set of nodes
     selectedNodeIndices := findNodesForPods(t.cv, sortedPodGpuNumbers, priority)
     // enable preemption if scheduling failed
     if selectedNodeIndices == nil && p > opportunisticPriority {
         priority = p
-        t.updateClusterView(priority, suggestedNodeSet)
+        t.updateClusterView(priority, suggestedNodes)
         selectedNodeIndices = findNodesForPods(t.cv, sortedPodGpuNumbers, priority)
     }
     if selectedNodeIndices == nil {
@@ -219,12 +219,12 @@ func (cv clusterView) Swap(i int, j int) {
 }

 // updateClusterView updates the GPU numbers of the nodes for the sorting.
-func (t *topologyAwareScheduler) updateClusterView(p CellPriority, suggestedNodeSet common.Set) {
+func (t *topologyAwareScheduler) updateClusterView(p CellPriority, suggestedNodes common.Set) {
     for _, n := range t.cv {
         inSuggested := true
         if t.considerSuggestedNodes {
             nodeNames, _ := n.c.(*PhysicalCell).GetPhysicalPlacement()
-            inSuggested = suggestedNodeSet.Contains(nodeNames[0])
+            inSuggested = suggestedNodes.Contains(nodeNames[0])
         }
         n.UpdateUsedGpuNumForPriority(p, t.crossPriorityPack, inSuggested)
     }
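On the topology-aware scheduler side, updateClusterView now threads the renamed suggestedNodes set through to UpdateUsedGpuNumForPriority as a per-node inSuggested flag, and the new UT exercises the end-to-end effect: pod27 (group18, two pods of 16 GPUs each on reservation VC1-YQW-DGX2) must be rejected once a member pod can only land on a non-suggested node. Below is a hedged sketch of the sorting idea; nodeView and sortForScheduling are simplified assumptions for illustration, not HiveD's real clusterView implementation.

package main

import (
	"fmt"
	"sort"
)

// nodeView is an assumed, simplified analogue of a clusterView entry:
// a node plus the GPUs usable at the current scheduling priority.
type nodeView struct {
	node        string
	freeGpus    int32
	inSuggested bool // false when the node is outside the K8s suggested nodes
}

// sortForScheduling sinks non-suggested nodes to the back and otherwise
// prefers nodes with more usable GPUs, echoing how the inSuggested flag
// is meant to influence the cluster-view ordering.
func sortForScheduling(cv []nodeView) {
	sort.SliceStable(cv, func(i, j int) bool {
		if cv[i].inSuggested != cv[j].inSuggested {
			return cv[i].inSuggested
		}
		return cv[i].freeGpus > cv[j].freeGpus
	})
}

func main() {
	cv := []nodeView{
		{"node1", 16, false}, // most free GPUs, but not suggested
		{"node2", 8, true},
		{"node3", 4, true},
	}
	sortForScheduling(cv)
	for _, v := range cv {
		fmt.Printf("%s free=%d suggested=%v\n", v.node, v.freeGpus, v.inSuggested)
	}
}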