237 строки
9.2 KiB
Go
237 строки
9.2 KiB
Go
// MIT License
|
|
//
|
|
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in all
|
|
// copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE
|
|
|
|
package internal
|
|
|
|
import (
|
|
"fmt"
|
|
si "github.com/microsoft/hivedscheduler/pkg/api"
|
|
core "k8s.io/api/core/v1"
|
|
"k8s.io/apimachinery/pkg/types"
|
|
ei "k8s.io/kubernetes/pkg/scheduler/api"
|
|
)
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////
|
|
// Internal Common Types
|
|
///////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// WebServer Callbacks with K8S Default Scheduler
|
|
// Notes:
|
|
// 1. Error should be delivered by panic
|
|
// 2. Should not assume previous succeeded operation also has been successfully
|
|
// executed by K8S Default Scheduler.
|
|
type ExtenderHandlers struct {
|
|
FilterHandler func(args ei.ExtenderArgs) *ei.ExtenderFilterResult
|
|
BindHandler func(args ei.ExtenderBindingArgs) *ei.ExtenderBindingResult
|
|
PreemptHandler func(args ei.ExtenderPreemptionArgs) *ei.ExtenderPreemptionResult
|
|
}
|
|
|
|
type InspectHandlers struct {
|
|
GetAllAffinityGroupsHandler func() si.AffinityGroupList
|
|
GetAffinityGroupHandler func(groupName string) si.AffinityGroup
|
|
GetClusterStatusHandler func() si.ClusterStatus
|
|
GetPhysicalClusterStatusHandler func() si.PhysicalClusterStatus
|
|
GetAllVirtualClustersStatusHandler func() map[si.VirtualClusterName]si.VirtualClusterStatus
|
|
GetVirtualClusterStatusHandler func(vcName si.VirtualClusterName) si.VirtualClusterStatus
|
|
}
|
|
|
|
// SchedulerAlgorithm is used to make the pod schedule decision based on its whole
|
|
// cluster scheduling view constructed from its Add/Update/Delete callbacks.
|
|
// Notes:
|
|
// 1. Error should be delivered by panic and it will not change any state.
|
|
// For WebServer Callbacks, all Panics will be recovered as error responses,
|
|
// see HandleInformerPanic.
|
|
// For Informer Callbacks, only User Error Panics will be recovered as error
|
|
// logs, other Panics will crash the whole process, see HandleWebServerPanic.
|
|
// 2. Should take all the input parameters as readonly and return pod schedule
|
|
// decision by PodScheduleResult.
|
|
// 3. {Schedule, AddUnallocatedPod, DeleteUnallocatedPod, AddAllocatedPod,
|
|
// DeleteAllocatedPod} will never be executed concurrently for all pods.
|
|
// 4. [Schedule -> (AddAllocatedPod) -> Schedule -> ...] is executed sequentially
|
|
// for all pods.
|
|
// I.e. the constructed scheduling view is already lock protected.
|
|
// 5. [START -> AddAllocatedPod -> DeleteAllocatedPod -> END] is executed
|
|
// sequentially for one specific Pod.
|
|
// I.e. once a specific Pod is allocated by AddAllocatedPod, its placement
|
|
// will never be changed to another one.
|
|
type SchedulerAlgorithm interface {
|
|
// See details in SchedulingPhase and PodScheduleResult.
|
|
Schedule(pod *core.Pod, suggestedNodes []string, phase SchedulingPhase) PodScheduleResult
|
|
|
|
// Track all current Nodes in the whole cluster.
|
|
AddNode(node *core.Node)
|
|
UpdateNode(oldNode, newNode *core.Node)
|
|
DeleteNode(node *core.Node)
|
|
|
|
// Track all current unallocated and allocated Pods in the whole cluster.
|
|
// Unallocated Pod includes both PodWaiting and PodPreempting Pods.
|
|
AddUnallocatedPod(pod *core.Pod)
|
|
DeleteUnallocatedPod(pod *core.Pod)
|
|
// Allocated Pod includes both PodBound and PodBinding Pods.
|
|
AddAllocatedPod(pod *core.Pod)
|
|
DeleteAllocatedPod(pod *core.Pod)
|
|
|
|
// Expose current scheduling status
|
|
GetAllAffinityGroups() si.AffinityGroupList
|
|
GetAffinityGroup(name string) si.AffinityGroup
|
|
GetClusterStatus() si.ClusterStatus
|
|
GetPhysicalClusterStatus() si.PhysicalClusterStatus
|
|
GetAllVirtualClustersStatus() map[si.VirtualClusterName]si.VirtualClusterStatus
|
|
GetVirtualClusterStatus(si.VirtualClusterName) si.VirtualClusterStatus
|
|
}
|
|
|
|
// SchedulingPhase tells SchedulerAlgorithm.Schedule from which routine it is
// being called, which determines what the suggestedNodes argument contains.
type SchedulingPhase string

// Known scheduling phases.
const (
	// FilteringPhase means Schedule is called during filterRoutine:
	// The suggestedNodes contain all nodes that can fit the pod without any
	// lower priority Pods preempted.
	FilteringPhase SchedulingPhase = "Filtering"

	// PreemptingPhase means Schedule is called during preemptRoutine:
	// The suggestedNodes contain all nodes that can fit the pod after all
	// lower priority Pods preempted.
	PreemptingPhase SchedulingPhase = "Preempting"
)
|
|
|
|
// Notes:
|
|
// 1. If the SchedulerAlgorithm found sufficient free resource, only PodBindInfo
|
|
// should be set.
|
|
// If the SchedulerAlgorithm found sufficient preemptible resource, only
|
|
// PodPreemptInfo should be set.
|
|
// Otherwise, only PodWaitInfo can be optionally set.
|
|
// 2. The selected node in PodBindInfo requires:
|
|
// 1. Must be within candidateNodes:
|
|
// All existing nodes which can be constructed from AddNode/DeleteNode.
|
|
// Otherwise, the Pod after bound will be probably deleted by the
|
|
// GarbageCollectionController.
|
|
// 2. Better to be within suggestedNodes:
|
|
// The input parameter of Schedule().
|
|
// Otherwise, the Pod after bound will not respect previous pod schedule
|
|
// decision from K8S Default Scheduler, i.e. this binding is incompatible
|
|
// with K8S Default Scheduler.
|
|
type PodScheduleResult struct {
|
|
PodWaitInfo *PodWaitInfo
|
|
PodPreemptInfo *PodPreemptInfo
|
|
PodBindInfo *si.PodBindInfo
|
|
}
|
|
|
|
// PodUID -> PodScheduleStatus
|
|
type PodScheduleStatuses map[types.UID]*PodScheduleStatus
|
|
|
|
// Used to track the PodScheduleResult
|
|
type PodScheduleStatus struct {
|
|
// The Pod which will be used in current PodState.
|
|
// For example, in PodBinding state, it should be a Pod used to bind,
|
|
// i.e. with all placements set in the Pod.
|
|
Pod *core.Pod
|
|
PodState PodState
|
|
// The already tried Pod binding attempts, i.e. the scheduled times for a Pod
|
|
// in PodBinding state.
|
|
PodBindAttempts int32
|
|
PodScheduleResult *PodScheduleResult
|
|
}
|
|
|
|
// PodState is the state of a Pod from the scheduler view.
//
// Tags used in the state descriptions below:
//   - [VirtualState]: The state is not tracked in PodScheduleStatuses.
//   - [AllocatedState]: The Pod is considered to be allocated from the
//     scheduler view.
//   - [StartState] and [FinalState] mark the first and the last state.
//
// Each "->" line lists a state that the described state can transition to.
type PodState string

// PodUnknown: [VirtualState]
//
// Pod is unknown for the scheduler, such as the Pod does not exist, is
// completed, or has not been informed to the scheduler.
//
// A completed Pod is the same as a deleted Pod from the scheduler view and
// they will never be informed to the scheduler, thus they are rejected to be
// scheduled.
// However, a Pod that is not completed and not yet informed will eventually
// be informed to the scheduler and then transition to a non-virtual state,
// thus such Pods are accepted to be scheduled.
//
// This state can be transitioned from any state, but can only transition to:
// -> PodWaiting
// -> PodBound
const PodUnknown PodState = "Unknown"

// States tracked in PodScheduleStatuses.
const (
	// PodWaiting: Pod is waiting for preemptible or free resource to appear.
	// [StartState]
	// -> PodPreempting
	// -> PodBinding
	PodWaiting PodState = "Waiting"

	// PodPreempting: Pod is waiting for the appeared preemptible resource to
	// be freed by preemption.
	// -> PodBinding
	// -> PodWaiting
	PodPreempting PodState = "Preempting"

	// PodBinding: Pod is binding on the free resource.
	// [AllocatedState]
	// -> PodBound
	PodBinding PodState = "Binding"

	// PodBound: Pod is bound on the free resource.
	// [FinalState]
	// [AllocatedState]
	PodBound PodState = "Bound"
)
|
|
|
|
func IsAllocated(state PodState) bool {
|
|
return state == PodBinding || state == PodBound
|
|
}
|
|
|
|
// PodWaitInfo describes why a Pod has to wait.
// There is no need to use it to recover the scheduler's waiting resource.
type PodWaitInfo struct {
	// Reason explains why there is no preemptible or free resource to
	// allocate the Pod now.
	Reason string
}
|
|
|
|
// No need to use it recover scheduler preempting resource
|
|
type PodPreemptInfo struct {
|
|
// Only need to include the victim Pods for the current preemptor Pod.
|
|
// Need to ensure the newly deleted victim Pods are eventually removed from here,
|
|
// otherwise, the default scheduler refuse to execute any preemption.
|
|
// Need to ensure the newly added victim Pods are eventually added to here,
|
|
// otherwise, the preemption will never complete.
|
|
// It can be empty, such as current preemptor Pod is waiting for the victim Pods
|
|
// of other preemptor Pods in the same group to be preempted.
|
|
// It can contain victim Pods across multiple nodes, such as a victim group may
|
|
// contain Pods across multiple nodes.
|
|
VictimPods []*core.Pod
|
|
}
|
|
|
|
type PodKey struct {
|
|
Namespace string
|
|
Name string
|
|
UID types.UID
|
|
}
|
|
|
|
func NewPodKey(namespace string, name string, uid types.UID) *PodKey {
|
|
return &PodKey{
|
|
Namespace: namespace,
|
|
Name: name,
|
|
UID: uid,
|
|
}
|
|
}
|
|
|
|
func (pk *PodKey) String() string {
|
|
return fmt.Sprintf("%v(%v/%v)", pk.UID, pk.Namespace, pk.Name)
|
|
}
|