[HiveD] VC aware of suggested nodes (#4268)

* vc aware of suggested nodes

* vc aware of suggested nodes

* suggested

* add UT

* fix comments

* minor fix

* revert vc aware suggested nodes

* fix UT
This commit is contained in:
Hanyu Zhao 2020-03-16 13:07:36 +08:00 committed by GitHub
Parent e633511722
Commit 283668e504
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
3 changed files: 68 additions and 49 deletions

View file

@ -347,7 +347,7 @@ func (h *HivedAlgorithm) initReservations() {
func (h *HivedAlgorithm) scheduleNewAffinityGroup(
pod *core.Pod,
s *api.PodSchedulingSpec,
suggestedNodeSet common.Set) (map[int32][]CellList, map[int32][]CellList) {
suggestedNodes common.Set) (map[int32][]CellList, map[int32][]CellList) {
var (
physicalPlacement map[int32][]CellList
@ -371,9 +371,9 @@ func (h *HivedAlgorithm) scheduleNewAffinityGroup(
if sr.reservationId != "" {
klog.Infof("Use reservation %v", s.ReservationId)
sr.chain = h.reservedCells[sr.vc][sr.reservationId].GetChain()
physicalPlacement, virtualPlacement = h.processSchedulingRequest(sr, suggestedNodeSet)
physicalPlacement, virtualPlacement = h.processSchedulingRequest(sr, suggestedNodes)
} else {
physicalPlacement, virtualPlacement = h.scheduleAffinityGroupForGpuType(sr, s.GpuType, pod, suggestedNodeSet)
physicalPlacement, virtualPlacement = h.scheduleAffinityGroupForGpuType(sr, s.GpuType, pod, suggestedNodes)
}
if physicalPlacement != nil {
klog.Infof("Succeeded in scheduling group %v", s.AffinityGroup.Name)
@ -390,7 +390,7 @@ func (h *HivedAlgorithm) scheduleAffinityGroupForGpuType(
sr schedulingRequest,
gpuType string,
pod *core.Pod,
suggestedNodeSet common.Set) (map[int32][]CellList, map[int32][]CellList) {
suggestedNodes common.Set) (map[int32][]CellList, map[int32][]CellList) {
if gpuType != "" {
if chains := h.chains[gpuType]; chains == nil {
@ -404,7 +404,7 @@ func (h *HivedAlgorithm) scheduleAffinityGroupForGpuType(
vcHasType = true
}
sr.chain = chain
if physicalPlacement, virtualPlacement := h.processSchedulingRequest(sr, suggestedNodeSet); physicalPlacement != nil {
if physicalPlacement, virtualPlacement := h.processSchedulingRequest(sr, suggestedNodes); physicalPlacement != nil {
return physicalPlacement, virtualPlacement
}
}
@ -418,7 +418,7 @@ func (h *HivedAlgorithm) scheduleAffinityGroupForGpuType(
for _, chains := range h.chains {
for _, chain := range chains {
sr.chain = chain
if physicalPlacement, virtualPlacement := h.processSchedulingRequest(sr, suggestedNodeSet); physicalPlacement != nil {
if physicalPlacement, virtualPlacement := h.processSchedulingRequest(sr, suggestedNodes); physicalPlacement != nil {
return physicalPlacement, virtualPlacement
}
}
@ -448,12 +448,12 @@ func (h *HivedAlgorithm) validateSchedulingRequest(sr schedulingRequest, pod *co
// or the opportunistic scheduler according to its priority.
func (h *HivedAlgorithm) processSchedulingRequest(
sr schedulingRequest,
suggestedNodeSet common.Set) (map[int32][]CellList, map[int32][]CellList) {
suggestedNodes common.Set) (map[int32][]CellList, map[int32][]CellList) {
if sr.priority >= minGuaranteedPriority {
return h.scheduleGuaranteedAffinityGroup(sr, suggestedNodeSet)
return h.scheduleGuaranteedAffinityGroup(sr, suggestedNodes)
} else {
return h.scheduleOpportunisticAffinityGroup(sr, suggestedNodeSet), nil
return h.scheduleOpportunisticAffinityGroup(sr, suggestedNodes), nil
}
}
@ -461,7 +461,7 @@ func (h *HivedAlgorithm) processSchedulingRequest(
// then maps the placement in VC to the physical cluster.
func (h *HivedAlgorithm) scheduleGuaranteedAffinityGroup(
sr schedulingRequest,
suggestedNodeSet common.Set) (map[int32][]CellList, map[int32][]CellList) {
suggestedNodes common.Set) (map[int32][]CellList, map[int32][]CellList) {
// schedule in VC
virtualPlacement := h.vcSchedulers[sr.vc].schedule(sr)
@ -498,7 +498,7 @@ func (h *HivedAlgorithm) scheduleGuaranteedAffinityGroup(
if preassignedPhysical == nil {
// allocate a new physical cell to the preassigned cell. input a copy of the free cell list
// because during the scheduling we should not make in-place change to the data structures
c := buddyAlloc(h.getTmpFreeCellList(sr.chain), pac.GetLevel(), suggestedNodeSet)
c := buddyAlloc(h.getTmpFreeCellList(sr.chain), pac.GetLevel(), suggestedNodes)
if c == nil {
panic(fmt.Sprintf(
"VC Safety Broken: Cannot find physical cell for a VC cell: %v", pac.GetName()))
@ -510,7 +510,7 @@ func (h *HivedAlgorithm) scheduleGuaranteedAffinityGroup(
preassignedPhysical.SetPreBoundVirtualCell(pac)
}
}
physicalPlacement[podGpuNum][i][j] = mapNonPreassignedCellToPhysical(vGpu, suggestedNodeSet)
physicalPlacement[podGpuNum][i][j] = mapNonPreassignedCellToPhysical(vGpu, suggestedNodes)
}
}
}
@ -521,10 +521,10 @@ func (h *HivedAlgorithm) scheduleGuaranteedAffinityGroup(
// scheduleOpportunisticAffinityGroup calls the opportunistic pod scheduler to schedule an affinity group.
func (h *HivedAlgorithm) scheduleOpportunisticAffinityGroup(
sr schedulingRequest,
suggestedNodeSet common.Set) map[int32][]CellList {
suggestedNodes common.Set) map[int32][]CellList {
placement := h.opportunisticSchedulers[sr.chain].Schedule(
sr.affinityGroupPodNums, opportunisticPriority, suggestedNodeSet)
sr.affinityGroupPodNums, opportunisticPriority, suggestedNodes)
if placement == nil {
klog.Infof("Insufficient capacity in PC for scheduling request: GPU numbers %v, priority %v",
sr.affinityGroupPodNums, sr.priority)
@ -865,7 +865,7 @@ func generatePodScheduleResult(
currentPodIndex int32,
group *AlgoAffinityGroup,
groupName string,
suggestedNodeSet common.Set,
suggestedNodes common.Set,
vc api.VirtualClusterName,
pod *core.Pod) internal.PodScheduleResult {
@ -889,7 +889,7 @@ func generatePodScheduleResult(
// we find the selected node after the preemption is done, otherwise the preemption victims
// may cause the selected node to be excluded from the suggested nodes
affinityGroupBindInfo, nodesNotInSuggested, selectedNode, selectedGpuIndices, cellChain := generateAffinityGroupBindInfo(
groupPhysicalPlacement, groupVirtualPlacement, cellLevelToType, currentGpuNum, currentPodIndex, group, groupName, suggestedNodeSet)
groupPhysicalPlacement, groupVirtualPlacement, cellLevelToType, currentGpuNum, currentPodIndex, group, groupName, suggestedNodes)
var waitReason string
if affinityGroupBindInfo == nil {
waitReason = "insufficient capacity in physical cluster"
@ -936,7 +936,7 @@ func generateAffinityGroupBindInfo(
currentPodIndex int32,
group *AlgoAffinityGroup,
groupName string,
suggestedNodeSet common.Set) (
suggestedNodes common.Set) (
affinityGroupBindInfo []api.AffinityGroupMemberBindInfo,
nodesNotInSuggested []string,
selectedNode string,
@ -973,7 +973,7 @@ func generateAffinityGroupBindInfo(
// in its "nodes" and "gpuIndices" as the node and GPU address
if mbi.PodPlacements[podIndex].PhysicalNode == "" {
mbi.PodPlacements[podIndex].PhysicalNode = nodes[0]
if !suggestedNodeSet.Contains(nodes[0]) {
if !suggestedNodes.Contains(nodes[0]) {
nodesNotInSuggested = append(nodesNotInSuggested, nodes[0])
}
}
@ -1075,9 +1075,9 @@ func retrieveMissingPodPlacement(group *AlgoAffinityGroup, gpuNum int32, podInde
// It splits a higher-level cell when there is no free cell at the current level.
// As the input cell list is a copy of the real free list and hence is one-off,
// we won't remove a returned cell from it.
func buddyAlloc(freeList ChainCellList, level CellLevel, suggestedNodeSet common.Set) *PhysicalCell {
func buddyAlloc(freeList ChainCellList, level CellLevel, suggestedNodes common.Set) *PhysicalCell {
if len(freeList[level]) == 0 && level < CellLevel(len(freeList)) {
higherCell := buddyAlloc(freeList, level+1, suggestedNodeSet)
higherCell := buddyAlloc(freeList, level+1, suggestedNodes)
if higherCell != nil {
freeList[level] = append(freeList[level], higherCell.GetChildren()...)
}
@ -1085,54 +1085,65 @@ func buddyAlloc(freeList ChainCellList, level CellLevel, suggestedNodeSet common
if len(freeList[level]) == 0 {
return nil
}
return getFewestOpporPhysicalCell(freeList[level], suggestedNodeSet)
return getFewestOpporPhysicalCell(freeList[level], suggestedNodes)
}
// getFewestOpporPhysicalCell selects a physical cell with the minimum number of opportunistic pods from a cell list.
func getFewestOpporPhysicalCell(cl CellList, suggestedNodeSet common.Set) *PhysicalCell {
func getFewestOpporPhysicalCell(cl CellList, suggestedNodes common.Set) *PhysicalCell {
fewestOpporNum := int32(math.MaxInt32)
fewestOpporNumSuggested := int32(math.MaxInt32)
var fewestOpporCell *PhysicalCell
var fewestOpporSuggested *PhysicalCell
var fewestOpporCellSuggested *PhysicalCell
var preemptibleCells []*PhysicalCell
for _, c := range cl {
if pc := c.(*PhysicalCell); pc.GetVirtualCell() == nil && pc.GetPreBoundVirtualCell() == nil {
numOppor := pc.GetUsedGpuNumAtPriorities()[opportunisticPriority]
if numOppor < fewestOpporNum {
fewestOpporNum = numOppor
opporNum := pc.GetUsedGpuNumAtPriorities()[opportunisticPriority]
if opporNum < fewestOpporNum {
fewestOpporNum = opporNum
fewestOpporCell = pc
}
allNodesInSuggested := true
nodes, _ := pc.GetPhysicalPlacement()
for _, n := range nodes {
if !suggestedNodeSet.Contains(n) {
if !suggestedNodes.Contains(n) {
allNodesInSuggested = false
break
}
}
if allNodesInSuggested && numOppor < fewestOpporNumSuggested {
fewestOpporNumSuggested = numOppor
fewestOpporSuggested = pc
if allNodesInSuggested && opporNum < fewestOpporNumSuggested {
fewestOpporNumSuggested = opporNum
fewestOpporCellSuggested = pc
}
if opporNum > 0 {
preemptibleCells = append(preemptibleCells, pc)
}
}
}
if fewestOpporSuggested == nil {
return fewestOpporCell
} else {
return fewestOpporSuggested
if fewestOpporCellSuggested != nil {
return fewestOpporCellSuggested
}
// If we cannot find a cell within suggested nodes, we will try to preempt some pods instead of
// directly returning the fewestOpporCell (because this cell could be a bad node, we should not return it).
// Also, we will choose a random cell, to avoid always returning the same cell (similar to above,
// if we always return the same cell, it might be a bad node, and preempting pods on a bad node won't bring
// it back to the suggested nodes)
if len(preemptibleCells) > 0 {
return preemptibleCells[rand.Int31n(int32(len(preemptibleCells)))]
}
return fewestOpporCell
}
// mapNonPreassignedCellToPhysical maps a virtual cell (possibly inside a preassigned one) to the
// physical cell of the preassigned cell. This operation keeps the inner-cell topology equivalent,
// by recursively binding the cells inside the preassigned one.
func mapNonPreassignedCellToPhysical(c *VirtualCell, suggestedNodeSet common.Set) *PhysicalCell {
func mapNonPreassignedCellToPhysical(c *VirtualCell, suggestedNodes common.Set) *PhysicalCell {
if c.GetPhysicalCell() != nil {
return c.GetPhysicalCell()
} else if c.GetPreBoundPhysicalCell() != nil {
return c.GetPreBoundPhysicalCell()
} else {
parentPhysical := mapNonPreassignedCellToPhysical(c.GetParent().(*VirtualCell), suggestedNodeSet)
pc := getFewestOpporPhysicalCell(parentPhysical.GetChildren(), suggestedNodeSet)
parentPhysical := mapNonPreassignedCellToPhysical(c.GetParent().(*VirtualCell), suggestedNodes)
pc := getFewestOpporPhysicalCell(parentPhysical.GetChildren(), suggestedNodes)
if pc == nil || pc.GetPriority() > opportunisticPriority {
panic(fmt.Sprintf("VC Safety Broken: Cannot find physical cell for %v", c.GetName()))
}

View file

@ -39,7 +39,7 @@ var allPods = map[string]*core.Pod{}
func init() {
common.InitAll()
for i := 1; i <= 25; i++ {
for i := 1; i <= len(pss); i++ {
podName := fmt.Sprintf("pod%v", i)
allPods[podName] = &core.Pod{
ObjectMeta: meta.ObjectMeta{
@ -62,7 +62,7 @@ func initNodes(h *HivedAlgorithm) {
}
}
var group1, group2, group3, group4, group5, group6, group7, group8, group9, group10, group11, group12, group13, group14, group15, group16, group17 = &api.AffinityGroupSpec{
var group1, group2, group3, group4, group5, group6, group7, group8, group9, group10, group11, group12, group13, group14, group15, group16, group17, group18 = &api.AffinityGroupSpec{
Name: "group1",
Members: []api.AffinityGroupMemberSpec{{PodNumber: 1, GpuNumber: 1}},
}, &api.AffinityGroupSpec{
@ -113,6 +113,9 @@ var group1, group2, group3, group4, group5, group6, group7, group8, group9, grou
}, &api.AffinityGroupSpec{
Name: "group17",
Members: []api.AffinityGroupMemberSpec{{PodNumber: 1, GpuNumber: 2}},
}, &api.AffinityGroupSpec{
Name: "group18",
Members: []api.AffinityGroupMemberSpec{{PodNumber: 2, GpuNumber: 16}},
}
var pss = map[types.UID]api.PodSchedulingSpec{
@ -324,6 +327,14 @@ var pss = map[types.UID]api.PodSchedulingSpec{
GpuType: "CT1",
GpuNumber: 2,
AffinityGroup: group17,
}, "pod27": { // will be rejected because one of the pod in this group is allocated a non-suggested node
VirtualCluster: "VC1",
Priority: 1,
LazyPreemptionEnable: true,
ReservationId: "VC1-YQW-DGX2",
GpuType: "DGX2-V100",
GpuNumber: 16,
AffinityGroup: group18,
},
}
@ -492,13 +503,10 @@ func testSuggestedNodes(t *testing.T, h *HivedAlgorithm) {
nodes = append(nodes, node)
}
}
pod := allPods["pod5"]
pod := allPods["pod27"]
pod.Annotations[api.AnnotationKeyPodSchedulingSpec] = common.ToYaml(pss[pod.UID])
psr := h.Schedule(pod, nodes)
if psr.PodBindInfo != nil {
t.Errorf("[%v]: wrong pod scheduling result: expected empty, but got %v:%v",
internal.Key(pod), psr.PodBindInfo.Node, psr.PodBindInfo.GpuIsolation)
}
compareSchedulingResult(t, pod, psr)
}
func testReconfiguration(t *testing.T, configFilePath string) {

View file

@ -76,7 +76,7 @@ func ancestorNoHigherThanNode(c Cell) Cell {
func (t *topologyAwareScheduler) Schedule(
podGpuNumbers map[int32]int32,
p CellPriority,
suggestedNodeSet common.Set) map[int32][]CellList {
suggestedNodes common.Set) map[int32][]CellList {
// GPU numbers of the pods to schedule
var sortedPodGpuNumbers []int32
@ -89,13 +89,13 @@ func (t *topologyAwareScheduler) Schedule(
// disable preemption first (reduce preemption)
priority := opportunisticPriority
t.updateClusterView(priority, suggestedNodeSet)
t.updateClusterView(priority, suggestedNodes)
// try to fit the pods to a set of nodes
selectedNodeIndices := findNodesForPods(t.cv, sortedPodGpuNumbers, priority)
// enable preemption if scheduling failed
if selectedNodeIndices == nil && p > opportunisticPriority {
priority = p
t.updateClusterView(priority, suggestedNodeSet)
t.updateClusterView(priority, suggestedNodes)
selectedNodeIndices = findNodesForPods(t.cv, sortedPodGpuNumbers, priority)
}
if selectedNodeIndices == nil {
@ -219,12 +219,12 @@ func (cv clusterView) Swap(i int, j int) {
}
// updateClusterView updates the GPU numbers of the nodes for the sorting.
func (t *topologyAwareScheduler) updateClusterView(p CellPriority, suggestedNodeSet common.Set) {
func (t *topologyAwareScheduler) updateClusterView(p CellPriority, suggestedNodes common.Set) {
for _, n := range t.cv {
inSuggested := true
if t.considerSuggestedNodes {
nodeNames, _ := n.c.(*PhysicalCell).GetPhysicalPlacement()
inSuggested = suggestedNodeSet.Contains(nodeNames[0])
inSuggested = suggestedNodes.Contains(nodeNames[0])
}
n.UpdateUsedGpuNumForPriority(p, t.crossPriorityPack, inSuggested)
}