HiveD: record cell type in pod annotation (#3962)

* record cell type (instead of level) in pod annotation

* fix selectedNode: only pick a node within the K8s suggested nodes, instead of panicking when the chosen node is not a candidate

* refine the failure reasons reported when a pod cannot be scheduled (e.g. insufficient physical capacity vs. insufficient VC quota)

* minor fixes

* minor fixes
Hanyu Zhao 2019-12-06 09:09:29 +08:00 committed by GitHub
Parent c3341596b2
Commit 965ae38e18
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 184 additions and 125 deletions
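The gist of the change: the per-pod placement info recorded in the pod annotation now carries the preassigned cell type name rather than its numeric level, because a level index shifts whenever the cell hierarchy is reconfigured while the type name stays meaningful. A minimal, self-contained Go sketch of the recorded shape (stand-in types mirroring the yaml tags in the api types diff below; all values are made up for illustration):

package main

import (
    "fmt"

    "gopkg.in/yaml.v2"
)

// Stand-in types mirroring the scheduler's api package; for illustration only.
type CellType string

type PodPlacementInfo struct {
    PhysicalNode       string  `yaml:"physicalNode"`
    PhysicalGpuIndices []int32 `yaml:"physicalGpuIndices"`
    // Before this commit the annotation carried PreassignedCellLevels []int32;
    // the numeric level is brittle across config changes, the type name is not.
    PreassignedCellTypes []CellType `yaml:"preassignedCellTypes"`
}

func main() {
    p := PodPlacementInfo{
        PhysicalNode:         "node-0",
        PhysicalGpuIndices:   []int32{0, 1},
        PreassignedCellTypes: []CellType{"DGX2-V100-NODE", "DGX2-V100-NODE"},
    }
    out, _ := yaml.Marshal(p)
    fmt.Print(string(out)) // roughly the placement block that ends up in the pod bind annotation
}
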

View file

@@ -389,10 +389,24 @@ func calculateGpuType(cellChainElements map[api.CellType]*cellChainElement, chai
return gpuTypeToChain
}
func calculateCellType(cellChainElements map[api.CellType]*cellChainElement, chains []CellChain) map[CellChain]map[CellLevel]api.CellType {
cellLevelToType := map[CellChain]map[CellLevel]api.CellType{}
for _, chain := range chains {
cellLevelToType[chain] = map[CellLevel]api.CellType{}
ce, ok := cellChainElements[api.CellType(chain)]
for ok {
cellLevelToType[chain][ce.level] = ce.cellType
ce, ok = cellChainElements[ce.childCellType]
}
}
return cellLevelToType
}
func ParseConfig(sConfig *api.Config) (
map[CellChain]ChainCellList, // chain:level:[]physicalCell
map[CellChain]map[CellLevel]int32, // chain:level:gpuNumber
map[string][]CellChain, // gpuType:[]chain
map[CellChain]map[CellLevel]api.CellType, // chain:level:cellType
map[api.VirtualClusterName]map[CellChain]ChainCellList, // non reserved virtual cells, vc:chain:level:[]virtualCell
map[api.VirtualClusterName]map[api.ReservationId]ChainCellList, // reserved virtual cells, vc:reservationId:level:[]virtualCell
map[api.VirtualClusterName]map[api.ReservationId]*PhysicalCell, // vc:reservationId:PhysicalCell
@@ -409,10 +423,11 @@ func ParseConfig(sConfig *api.Config) (
}
gpuNums := calculateGpuNumber(cellChainElements, cellChains)
gpuTypeToChain := calculateGpuType(cellChainElements, cellChains)
cellLevelToType := calculateCellType(cellChainElements, cellChains)
virtualSpecs := sConfig.VirtualClusters
virtualNonReservedCellList, virtualReservedCellList, reservedPhysicalCells := newVirtualCellConstructor(
cellChainElements, *virtualSpecs, rawReservedPhysicalCells).build()
return physicalCells, gpuNums, gpuTypeToChain, virtualNonReservedCellList, virtualReservedCellList, reservedPhysicalCells
return physicalCells, gpuNums, gpuTypeToChain, cellLevelToType, virtualNonReservedCellList, virtualReservedCellList, reservedPhysicalCells
}
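For reference, a self-contained sketch of the new level-to-type mapping, with local stand-in types and a hypothetical three-level chain (the rack type name is made up; DGX2-V100 and DGX2-V100-NODE are borrowed from the test config for flavor). The chain is named after its top cell type, and the helper walks down via childCellType:

package main

import "fmt"

// Local stand-ins for the parser's internal types (the real ones live in the algorithm package).
type (
    CellType  string
    CellChain string
    CellLevel int32
)

type cellChainElement struct {
    cellType      CellType
    level         CellLevel
    childCellType CellType
}

// calculateCellType (sketch of the helper added above): start from the chain's top cell type,
// follow the childCellType links downward, and record the type name seen at each level.
func calculateCellType(elements map[CellType]*cellChainElement, chains []CellChain) map[CellChain]map[CellLevel]CellType {
    cellLevelToType := map[CellChain]map[CellLevel]CellType{}
    for _, chain := range chains {
        cellLevelToType[chain] = map[CellLevel]CellType{}
        ce, ok := elements[CellType(chain)]
        for ok {
            cellLevelToType[chain][ce.level] = ce.cellType
            ce, ok = elements[ce.childCellType]
        }
    }
    return cellLevelToType
}

func main() {
    // Hypothetical 3-level chain: rack -> node -> GPU.
    elements := map[CellType]*cellChainElement{
        "V100-RACK":      {cellType: "V100-RACK", level: 3, childCellType: "DGX2-V100-NODE"},
        "DGX2-V100-NODE": {cellType: "DGX2-V100-NODE", level: 2, childCellType: "DGX2-V100"},
        "DGX2-V100":      {cellType: "DGX2-V100", level: 1, childCellType: ""},
    }
    fmt.Println(calculateCellType(elements, []CellChain{"V100-RACK"}))
    // Expected: map[V100-RACK:map[1:DGX2-V100 2:DGX2-V100-NODE 3:V100-RACK]]
}
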

View file

@@ -50,6 +50,8 @@ type HivedAlgorithm struct {
freeCellList map[CellChain]ChainCellList
// map each GPU type to all chains that contain this type
chains map[string][]CellChain
// map each level in a chain to the specific cell type name
cellTypes map[CellChain]map[CellLevel]api.CellType
// all affinity groups that have been allocated cells
allocatedAffinityGroups map[string]*AlgoAffinityGroup
// all reserved physical cells (VC -> reservation ID -> cells)
@@ -60,13 +62,14 @@ type HivedAlgorithm struct {
// NewHivedAlgorithm initializes a HivedAlgorithm from the config file
func NewHivedAlgorithm(sConfig *api.Config) *HivedAlgorithm {
pcl, gpuNums, gpuTypeToChain, nonReservedVcl, reservedVcl, reservedPc := ParseConfig(sConfig)
pcl, gpuNums, gpuTypeToChain, cellLevelToType, nonReservedVcl, reservedVcl, reservedPc := ParseConfig(sConfig)
h := &HivedAlgorithm{
vcSchedulers: make(map[api.VirtualClusterName]intraVCScheduler),
opportunisticSchedulers: map[CellChain]*topologyAwareScheduler{},
fullCellList: pcl,
freeCellList: make(map[CellChain]ChainCellList),
chains: gpuTypeToChain,
cellTypes: cellLevelToType,
allocatedAffinityGroups: make(map[string]*AlgoAffinityGroup),
reservedCells: reservedPc,
}
@@ -127,7 +130,15 @@ func (h *HivedAlgorithm) Schedule(pod *core.Pod, suggestedNodes []string) intern
podIndex = int32(len(group.allocatedPods[s.GpuNumber]))
}
return generatePodScheduleResult(
groupPhysicalPlacement, groupVirtualPlacement, s.GpuNumber, podIndex, newGroup, suggestedNodeSet, pod)
groupPhysicalPlacement,
groupVirtualPlacement,
h.cellTypes,
s.GpuNumber,
podIndex,
newGroup,
suggestedNodeSet,
s.VirtualCluster,
pod)
}
func (h *HivedAlgorithm) AddAllocatedPod(pod *core.Pod) {
@@ -160,23 +171,30 @@ func (h *HivedAlgorithm) AddAllocatedPod(pod *core.Pod) {
newGroup.physicalGpuPlacement[gpuNumber][podIndex][gpuIndex] = pGpu
var vGpu *VirtualCell
if newGroup.virtualGpuPlacement != nil && !lazyPreempted {
preassignedLevel := CellLevel(gms.PodPlacements[podIndex].PreassignedCellLevels[gpuIndex])
if preassignedLevel >= 0 {
preassignedType := gms.PodPlacements[podIndex].PreassignedCellTypes[gpuIndex]
if preassignedType != "" {
var preassignedLevel CellLevel
typeFound := false
for l, t := range h.cellTypes[pGpu.GetChain()] {
if t == preassignedType {
preassignedLevel = l
typeFound = true
}
}
var message string
if vcs := h.vcSchedulers[s.VirtualCluster]; vcs == nil {
message = fmt.Sprintf("VC %v has been deleted", s.VirtualCluster)
if !typeFound {
message = fmt.Sprintf("preassigned cell type %v not found in chain %v", preassignedType, pGpu.GetChain())
} else if vcs := h.vcSchedulers[s.VirtualCluster]; vcs == nil {
message = fmt.Sprintf("VC %v not found", s.VirtualCluster)
} else {
var vccl ChainCellList
var str string
vccl := vcs.getNonReservedCellList()[pGpu.GetChain()]
str := string(pGpu.GetChain())
if s.ReservationId != "" {
vccl = vcs.getReservedCellList()[s.ReservationId]
str = string(s.ReservationId)
} else {
vccl = vcs.getNonReservedCellList()[pGpu.GetChain()]
str = string(pGpu.GetChain())
}
if vccl == nil {
message = fmt.Sprintf("VC %v no longer has cells for %v", s.VirtualCluster, str)
message = fmt.Sprintf("VC %v has no cell for %v", s.VirtualCluster, str)
} else {
vGpu, message = mapNonPreassignedCellToVirtual(pGpu, vccl, preassignedLevel, priority)
}
@@ -536,7 +554,16 @@ func (h *HivedAlgorithm) scheduleOpportunisticAffinityGroup(
sr schedulingRequest,
suggestedNodeSet common.Set) map[int32][]CellList {
return h.opportunisticSchedulers[sr.chain].Schedule(sr.affinityGroupPodNums, opportunisticPriority, suggestedNodeSet)
placement := h.opportunisticSchedulers[sr.chain].Schedule(
sr.affinityGroupPodNums, opportunisticPriority, suggestedNodeSet)
if placement == nil {
klog.Infof("Insufficient capacity in PC for scheduling request: GPU numbers %v, priority %v",
sr.affinityGroupPodNums, sr.priority)
} else {
klog.Infof("Succeeded in scheduling in PC for scheduling request: GPU numbers %v, priority %v",
sr.affinityGroupPodNums, sr.priority)
}
return placement
}
// getTmpFreeCellList returns a copy of the free cell list.
@@ -737,25 +764,129 @@ func (h *HivedAlgorithm) findPhysicalGpuInChain(
func generatePodScheduleResult(
groupPhysicalPlacement map[int32][]CellList,
groupVirtualPlacement map[int32][]CellList,
cellLevelToType map[CellChain]map[CellLevel]api.CellType,
currentGpuNum int32,
currentPodIndex int32,
newGroup bool,
suggestedNodeSet common.Set,
vc api.VirtualClusterName,
pod *core.Pod) internal.PodScheduleResult {
affinityGroupBindInfo, selectedNode, selectedGpuIndices := generateAffinityGroupBindInfo(
groupPhysicalPlacement, groupVirtualPlacement, currentGpuNum, currentPodIndex)
if affinityGroupBindInfo == nil {
preemptionVictims, nodesHaveVictims := collectPreemptionVictims(groupPhysicalPlacement, newGroup)
if len(preemptionVictims) > 0 {
// we collect victims on a random node, as K8S preempts victims from only one node at a time.
// the randomness is to let different pods preempt victims on different nodes
// (note that this randomness is not necessary for the eventual completeness of preemption).
nodeToPreempt := nodesHaveVictims[rand.Int31n(int32(len(nodesHaveVictims)))]
var victimPods []*core.Pod
var victimNames []string
for v := range preemptionVictims[nodeToPreempt].Items() {
victimPods = append(victimPods, v.(*core.Pod))
victimNames = append(victimNames, internal.Key(v.(*core.Pod)))
}
klog.Infof("[%v]: need to preempt pods %v", internal.Key(pod), strings.Join(victimNames, ", "))
return internal.PodScheduleResult{
PodWaitInfo: &internal.PodWaitInfo{
// TODO: Enrich the Pod Waiting Reason.
Reason: "",
PodPreemptInfo: &internal.PodPreemptInfo{VictimPods: victimPods},
}
} else {
// we find the selected node after the preemption is done, otherwise the preemption victims
// may cause the selected node to be excluded from the suggested nodes
affinityGroupBindInfo, selectedNode, selectedGpuIndices := generateAffinityGroupBindInfo(
groupPhysicalPlacement, groupVirtualPlacement, cellLevelToType, currentGpuNum, currentPodIndex, newGroup, suggestedNodeSet)
var waitReason string
if affinityGroupBindInfo == nil {
waitReason = "insufficient capacity in physical cluster"
if groupVirtualPlacement != nil {
waitReason = fmt.Sprintf("insufficient quota in VC %v", vc)
}
} else if selectedNode == "" {
waitReason = "cannot find a K8s candidate node within physical cluster"
if groupVirtualPlacement != nil {
waitReason = fmt.Sprintf("cannot find a K8s candidate node within VC %v's quota", vc)
}
}
if waitReason != "" {
return internal.PodScheduleResult{PodWaitInfo: &internal.PodWaitInfo{Reason: waitReason}}
}
klog.Infof("[%v]: scheduled to node %v, GPUs %v",
internal.Key(pod), selectedNode, selectedGpuIndices)
return internal.PodScheduleResult{
PodBindInfo: &api.PodBindInfo{
Node: selectedNode,
GpuIsolation: selectedGpuIndices,
CellChain: string(groupPhysicalPlacement[currentGpuNum][currentPodIndex][0].GetChain()),
AffinityGroupBindInfo: affinityGroupBindInfo,
},
}
}
chain := string(groupPhysicalPlacement[currentGpuNum][currentPodIndex][0].GetChain())
// collect preemption victims of the whole group
preemptionVictims := map[string]common.Set{} // node -> pods
}
// generateAffinityGroupBindInfo writes the physical and virtual placements of an affinity group
// into a series of AffinityGroupMemberBindInfos, and returns the allocated node and GPU addresses
// of the current pod.
func generateAffinityGroupBindInfo(
groupPhysicalPlacement map[int32][]CellList,
groupVirtualPlacement map[int32][]CellList,
cellLevelToType map[CellChain]map[CellLevel]api.CellType,
currentGpuNum int32,
currentPodIndex int32,
newGroup bool,
suggestedNodeSet common.Set) ([]api.AffinityGroupMemberBindInfo, string, []int32) {
if groupPhysicalPlacement == nil {
return nil, "", nil
}
affinityGroupBindInfo := make([]api.AffinityGroupMemberBindInfo, len(groupPhysicalPlacement))
var selectedNode string
var selectedGpuIndices []int32
groupMemberIndex := 0
for podGpuNum, podPhysicalPlacements := range groupPhysicalPlacement {
mbi := api.AffinityGroupMemberBindInfo{
PodPlacements: make([]api.PodPlacementInfo, len(podPhysicalPlacements)),
}
for podIndex := int32(0); podIndex < int32(len(podPhysicalPlacements)); podIndex++ {
mbi.PodPlacements[podIndex].PhysicalGpuIndices = make(
[]int32, len(podPhysicalPlacements[podIndex]))
mbi.PodPlacements[podIndex].PreassignedCellTypes = make(
[]api.CellType, len(podPhysicalPlacements[podIndex]))
for gpuIndex := 0; gpuIndex < len(podPhysicalPlacements[podIndex]); gpuIndex++ {
pGpu := podPhysicalPlacements[podIndex][gpuIndex]
if pGpu == nil {
// TODO: Should insist on binding and continue to force-bind, so that the Pod may
// run or retry, instead of being stuck in PodBinding state forever here.
panic("Resources previously allocated has been invalid; pod should wait")
}
nodes, gpuIndices := pGpu.(*PhysicalCell).GetPhysicalPlacement()
// here each cell (i.e., pGpu) is only one GPU, hence we take the first element
// in its "nodes" and "gpuIndices" as the node and GPU address
if mbi.PodPlacements[podIndex].PhysicalNode == "" {
mbi.PodPlacements[podIndex].PhysicalNode = nodes[0]
}
mbi.PodPlacements[podIndex].PhysicalGpuIndices[gpuIndex] = gpuIndices[0]
if groupVirtualPlacement != nil {
vGpu := groupVirtualPlacement[podGpuNum][podIndex][gpuIndex].(*VirtualCell)
mbi.PodPlacements[podIndex].PreassignedCellTypes[gpuIndex] = cellLevelToType[vGpu.GetChain()][vGpu.GetPreAssignedCell().GetLevel()]
} else {
mbi.PodPlacements[podIndex].PreassignedCellTypes[gpuIndex] = ""
}
}
}
if podGpuNum == currentGpuNum &&
(!newGroup || suggestedNodeSet.Contains(mbi.PodPlacements[currentPodIndex].PhysicalNode)) {
selectedNode = mbi.PodPlacements[currentPodIndex].PhysicalNode
selectedGpuIndices = mbi.PodPlacements[currentPodIndex].PhysicalGpuIndices
}
affinityGroupBindInfo[groupMemberIndex] = mbi
groupMemberIndex++
}
return affinityGroupBindInfo, selectedNode, selectedGpuIndices
}
// collectPreemptionVictims collects preemption victims of an affinity group.
// If any of the GPUs allocated for the whole group is still used by a pod,
// we will wait for the preemption, as the group is gang-scheduled.
func collectPreemptionVictims(groupPhysicalPlacement map[int32][]CellList, newGroup bool) (map[string]common.Set, []string) {
preemptionVictims := map[string]common.Set{}
var nodesHaveVictims []string
if newGroup {
for gpuNum := range groupPhysicalPlacement {
@@ -778,100 +909,7 @@ func generatePodScheduleResult(
}
}
}
// if any of the GPUs allocated for the whole group is still used by a pod,
// we will wait for the preemption, as the group is gang-scheduled.
if len(nodesHaveVictims) > 0 {
// we collect victims on a random node, as K8S preempts victims from only one node once.
// random is to let different pods preempt victims on different nodes
// (but we don't rely on this randomness for the eventual-completeness of preemption).
nodeToPreempt := nodesHaveVictims[rand.Int31n(int32(len(nodesHaveVictims)))]
var victimPods []*core.Pod
var victimNames []string
for v := range preemptionVictims[nodeToPreempt].Items() {
victimPods = append(victimPods, v.(*core.Pod))
victimNames = append(victimNames, internal.Key(v.(*core.Pod)))
}
klog.Infof("[%v]: need to preempt pods %v", internal.Key(pod), strings.Join(victimNames, ", "))
return internal.PodScheduleResult{
PodPreemptInfo: &internal.PodPreemptInfo{VictimPods: victimPods},
}
} else {
// we check suggested nodes after the preemption is done, otherwise the preemption victims
// may cause the selected node to be excluded from the suggested nodes
// TODO: Keep selectedNode within suggestedNodeSet, so here should be Unreachable
if !suggestedNodeSet.Contains(selectedNode) && newGroup {
panic(fmt.Sprintf("[%v]: node %v picked by algorithm but not in K8S candidates",
internal.Key(pod), selectedNode))
}
klog.Infof("[%v]: scheduled to node %v, GPUs %v",
internal.Key(pod), selectedNode, selectedGpuIndices)
return internal.PodScheduleResult{
PodBindInfo: &api.PodBindInfo{
Node: selectedNode,
GpuIsolation: selectedGpuIndices,
CellChain: chain,
AffinityGroupBindInfo: affinityGroupBindInfo,
},
}
}
}
// generateAffinityGroupBindInfo writes the physical and virtual placements of an affinity group
// into a series of AffinityGroupMemberBindInfos, and returns the allocated node and GPU addresses
// of the current pod.
func generateAffinityGroupBindInfo(
groupPhysicalPlacement map[int32][]CellList,
groupVirtualPlacement map[int32][]CellList,
currentGpuNum int32,
currentPodIndex int32) ([]api.AffinityGroupMemberBindInfo, string, []int32) {
if groupPhysicalPlacement == nil {
return nil, "", nil
}
affinityGroupBindInfo := make([]api.AffinityGroupMemberBindInfo, len(groupPhysicalPlacement))
var selectedNode string
var selectedGpuIndices []int32
groupMemberIndex := 0
for podGpuNum, podPhysicalPlacements := range groupPhysicalPlacement {
mbi := api.AffinityGroupMemberBindInfo{
PodPlacements: make([]api.PodPlacementInfo, len(podPhysicalPlacements)),
}
for podIndex := int32(0); podIndex < int32(len(podPhysicalPlacements)); podIndex++ {
mbi.PodPlacements[podIndex].PhysicalGpuIndices = make(
[]int32, len(podPhysicalPlacements[podIndex]))
mbi.PodPlacements[podIndex].PreassignedCellLevels = make(
[]int32, len(podPhysicalPlacements[podIndex]))
for gpuIndex := 0; gpuIndex < len(podPhysicalPlacements[podIndex]); gpuIndex++ {
pGpu := podPhysicalPlacements[podIndex][gpuIndex]
if pGpu == nil {
// TODO: Should insist on binding and continue to force-bind, so that the Pod may
// run or retry, instead of being stuck in PodBinding state forever here.
panic("Resources previously allocated has been invalid; pod should wait")
}
nodes, gpuIndices := pGpu.(*PhysicalCell).GetPhysicalPlacement()
// here each cell (i.e., pGpu) is only one GPU, hence we take the first element
// in its "nodes" and "gpuIndices" as the node and GPU address
if mbi.PodPlacements[podIndex].PhysicalNode == "" {
mbi.PodPlacements[podIndex].PhysicalNode = nodes[0]
}
mbi.PodPlacements[podIndex].PhysicalGpuIndices[gpuIndex] = gpuIndices[0]
if groupVirtualPlacement != nil {
vGpu := groupVirtualPlacement[podGpuNum][podIndex][gpuIndex].(*VirtualCell)
mbi.PodPlacements[podIndex].PreassignedCellLevels[gpuIndex] = int32(
vGpu.GetPreAssignedCell().GetLevel())
} else {
mbi.PodPlacements[podIndex].PreassignedCellLevels[gpuIndex] = -1
}
}
}
if podGpuNum == currentGpuNum {
selectedNode = mbi.PodPlacements[currentPodIndex].PhysicalNode
selectedGpuIndices = mbi.PodPlacements[currentPodIndex].PhysicalGpuIndices
}
affinityGroupBindInfo[groupMemberIndex] = mbi
groupMemberIndex++
}
return affinityGroupBindInfo, selectedNode, selectedGpuIndices
return preemptionVictims, nodesHaveVictims
}
// buddyAlloc allocates a free cell at a certain level from a free list.
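Since the annotation now stores a type name rather than a level, AddAllocatedPod (above) has to map the recorded type back to a level in the pod's cell chain, treating an empty type as "no virtual placement" (opportunistic pod) and a missing type as a reconfigured chain. A self-contained sketch of that reverse lookup, with local stand-in types and partly made-up type names:

package main

import "fmt"

type (
    CellType  string
    CellLevel int32
)

// preassignedLevel mirrors the lookup added in AddAllocatedPod: scan the chain's
// level-to-type map for the recorded type name. An empty name means the pod had no
// virtual placement (opportunistic); a name that is not found means the chain no
// longer contains that type, e.g. after a config change.
func preassignedLevel(levelToType map[CellLevel]CellType, recorded CellType) (CellLevel, bool) {
    if recorded == "" {
        return 0, false // opportunistic pod: nothing to locate in the VC's virtual cells
    }
    for l, t := range levelToType {
        if t == recorded {
            return l, true
        }
    }
    return 0, false // type not found in this chain
}

func main() {
    chainTypes := map[CellLevel]CellType{1: "DGX2-V100", 2: "DGX2-V100-NODE", 3: "V100-RACK"}
    fmt.Println(preassignedLevel(chainTypes, "DGX2-V100-NODE")) // 2 true
    fmt.Println(preassignedLevel(chainTypes, "PASCAL-NODE"))    // 0 false
}
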

View file

@@ -491,6 +491,12 @@ func testReconfiguration(t *testing.T, sConfig *api.Config) {
}
testCasesThatShouldSucceed(t, h)
// case: shorten cell chain
(*sConfig.PhysicalCluster).CellTypes["DGX2-V100-NODE"] = api.CellTypeSpec{
ChildCellType: "DGX2-V100",
ChildCellNumber: 16,
IsNodeLevel: true,
}
// case: physical cell not found
(*sConfig.PhysicalCluster).PhysicalCells[7].CellChildren[0].CellChildren[0].CellAddress = "0.0.3.100"
// case: insufficient VC quota

View file

@@ -110,9 +110,9 @@ type AffinityGroupMemberBindInfo struct {
type PodPlacementInfo struct {
PhysicalNode string `yaml:"physicalNode"`
PhysicalGpuIndices []int32 `yaml:"physicalGpuIndices"`
// levels of the preassigned cells used by the pods. used to locate the virtual cells
// preassigned cell types used by the pods; used to locate the virtual cells
// when adding an allocated pod
PreassignedCellLevels []int32 `yaml:"preassignedCellLevels"`
PreassignedCellTypes []CellType `yaml:"preassignedCellTypes"`
}
type WebServerPaths struct {

View file

@@ -91,7 +91,7 @@ type SchedulerAlgorithm interface {
// Notes:
// 1. If the SchedulerAlgorithm found sufficient free resource, only PodBindInfo
// should be set.
// If the SchedulerAlgorithm found sufficient preemptable resource, only
// If the SchedulerAlgorithm found sufficient preemptible resource, only
// PodPreemptInfo should be set.
// Otherwise, only PodWaitInfo can be optionally set.
// 2. The selected node in PodBindInfo requires:
@@ -146,13 +146,13 @@ const PodUnknown PodState = "Unknown"
// [AllocatedState]: The Pod is considered to be allocated from the scheduler view.
const (
// Pod is waiting for preemptable or free resource to appear.
// Pod is waiting for preemptible or free resource to appear.
// [StartState]
// -> PodPreempting
// -> PodBinding
PodWaiting PodState = "Waiting"
// Pod is waiting for the appeared preemptable resource to be free by preemption.
// Pod is waiting for the appeared preemptible resource to be free by preemption.
// -> PodBinding
// -> PodWaiting
PodPreempting PodState = "Preempting"
@@ -174,7 +174,7 @@ func IsAllocated(state PodState) bool {
// No need to use it to recover scheduler waiting resource
type PodWaitInfo struct {
// The reason why no preemptable or free resource to allocate the Pod now.
// The reason why no preemptible or free resource to allocate the Pod now.
Reason string
}
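To make the contract in the notes above concrete — exactly one of PodBindInfo, PodPreemptInfo, or PodWaitInfo is expected to be set on a schedule result — here is a self-contained sketch with hypothetical stand-in types (the real ones live in the api and internal packages); the wait reason shown is in the style of the more specific reasons this commit introduces:

package main

import "fmt"

// Hypothetical stand-ins for the scheduler's result types, for illustration only.
type (
    PodBindInfo    struct{ Node string }
    PodPreemptInfo struct{ VictimNames []string }
    PodWaitInfo    struct{ Reason string }
)

type PodScheduleResult struct {
    PodBindInfo    *PodBindInfo
    PodPreemptInfo *PodPreemptInfo
    PodWaitInfo    *PodWaitInfo
}

// describe interprets a result the way the notes prescribe: bind when free resource
// was found, preempt when only preemptible resource was found, otherwise wait.
func describe(r PodScheduleResult) string {
    switch {
    case r.PodBindInfo != nil:
        return "bind to " + r.PodBindInfo.Node
    case r.PodPreemptInfo != nil:
        return fmt.Sprintf("preempt %v", r.PodPreemptInfo.VictimNames)
    case r.PodWaitInfo != nil && r.PodWaitInfo.Reason != "":
        return "wait: " + r.PodWaitInfo.Reason
    default:
        return "wait"
    }
}

func main() {
    fmt.Println(describe(PodScheduleResult{
        PodWaitInfo: &PodWaitInfo{Reason: "insufficient quota in VC vc1"}, // vc1 is a made-up VC name
    }))
}
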

View file

@@ -552,7 +552,7 @@ func (s *HivedScheduler) filterRoutine(args ei.ExtenderArgs) *ei.ExtenderFilterR
if result.PodWaitInfo != nil {
return &ei.ExtenderFilterResult{
Error: fmt.Sprintf(
"Pod is waiting for preemptable or free resource to appear: %v",
"Pod is waiting for preemptible or free resource to appear: %v",
result.PodWaitInfo.Reason),
}
} else {
@@ -646,7 +646,7 @@ func (s *HivedScheduler) preemptRoutine(args ei.ExtenderPreemptionArgs) *ei.Exte
// At this point, podState must be in:
// {PodWaiting}
// The Pod should keep on waiting for preemptable or free resource to appear,
// The Pod should keep on waiting for preemptible or free resource to appear,
// so do not preempt any victim.
return &ei.ExtenderPreemptionResult{}
}