hivedscheduler/pkg/internal/types.go

237 строки
9.2 KiB
Go

// MIT License
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE
package internal
import (
"fmt"
si "github.com/microsoft/hivedscheduler/pkg/api"
core "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
ei "k8s.io/kubernetes/pkg/scheduler/api"
)
///////////////////////////////////////////////////////////////////////////////////////
// Internal Common Types
///////////////////////////////////////////////////////////////////////////////////////
// ExtenderHandlers groups the WebServer Callbacks with K8S Default Scheduler,
// one callback per extender request kind (filter, bind and preempt).
// Notes:
// 1. Error should be delivered by panic.
// 2. Should not assume that a previously succeeded operation has also been
//    successfully executed by K8S Default Scheduler.
type ExtenderHandlers struct {
// FilterHandler receives the extender filter arguments and returns the
// filter result.
FilterHandler func(args ei.ExtenderArgs) *ei.ExtenderFilterResult
// BindHandler receives the extender binding arguments and returns the
// binding result.
BindHandler func(args ei.ExtenderBindingArgs) *ei.ExtenderBindingResult
// PreemptHandler receives the extender preemption arguments and returns the
// preemption result.
PreemptHandler func(args ei.ExtenderPreemptionArgs) *ei.ExtenderPreemptionResult
}
// InspectHandlers groups the callbacks that return the current scheduling
// status: affinity groups, the whole cluster status, the physical cluster
// status and the virtual cluster statuses.
// NOTE(review): the signatures parallel the status getters declared on
// SchedulerAlgorithm; presumably each handler is backed by the same-named
// getter — confirm at the wiring site.
type InspectHandlers struct {
// Returns the list of all affinity groups.
GetAllAffinityGroupsHandler func() si.AffinityGroupList
// Returns the affinity group with the given name.
GetAffinityGroupHandler func(groupName string) si.AffinityGroup
// Returns the status of the cluster.
GetClusterStatusHandler func() si.ClusterStatus
// Returns the status of the physical cluster.
GetPhysicalClusterStatusHandler func() si.PhysicalClusterStatus
// Returns the status of every virtual cluster, keyed by virtual cluster name.
GetAllVirtualClustersStatusHandler func() map[si.VirtualClusterName]si.VirtualClusterStatus
// Returns the status of the virtual cluster with the given name.
GetVirtualClusterStatusHandler func(vcName si.VirtualClusterName) si.VirtualClusterStatus
}
// SchedulerAlgorithm is used to make the pod schedule decision based on its whole
// cluster scheduling view constructed from its Add/Update/Delete callbacks.
// Notes:
// 1. Error should be delivered by panic and it will not change any state.
//    For WebServer Callbacks, all Panics will be recovered as error responses,
//    see HandleWebServerPanic.
//    For Informer Callbacks, only User Error Panics will be recovered as error
//    logs, other Panics will crash the whole process, see HandleInformerPanic.
// 2. Should take all the input parameters as readonly and return pod schedule
//    decision by PodScheduleResult.
// 3. {Schedule, AddUnallocatedPod, DeleteUnallocatedPod, AddAllocatedPod,
//    DeleteAllocatedPod} will never be executed concurrently for all pods.
// 4. [Schedule -> (AddAllocatedPod) -> Schedule -> ...] is executed sequentially
//    for all pods.
//    I.e. the constructed scheduling view is already lock protected.
// 5. [START -> AddAllocatedPod -> DeleteAllocatedPod -> END] is executed
//    sequentially for one specific Pod.
//    I.e. once a specific Pod is allocated by AddAllocatedPod, its placement
//    will never be changed to another one.
type SchedulerAlgorithm interface {
// Schedule returns the schedule decision for the pod.
// See details in SchedulingPhase (meaning of suggestedNodes per phase) and
// PodScheduleResult (how the decision is encoded).
Schedule(pod *core.Pod, suggestedNodes []string, phase SchedulingPhase) PodScheduleResult
// Track all current Nodes in the whole cluster.
AddNode(node *core.Node)
UpdateNode(oldNode, newNode *core.Node)
DeleteNode(node *core.Node)
// Track all current unallocated and allocated Pods in the whole cluster.
// Unallocated Pod includes both PodWaiting and PodPreempting Pods.
AddUnallocatedPod(pod *core.Pod)
DeleteUnallocatedPod(pod *core.Pod)
// Allocated Pod includes both PodBound and PodBinding Pods.
AddAllocatedPod(pod *core.Pod)
DeleteAllocatedPod(pod *core.Pod)
// Expose current scheduling status.
GetAllAffinityGroups() si.AffinityGroupList
GetAffinityGroup(name string) si.AffinityGroup
GetClusterStatus() si.ClusterStatus
GetPhysicalClusterStatus() si.PhysicalClusterStatus
GetAllVirtualClustersStatus() map[si.VirtualClusterName]si.VirtualClusterStatus
GetVirtualClusterStatus(si.VirtualClusterName) si.VirtualClusterStatus
}
// SchedulingPhase is passed to SchedulerAlgorithm.Schedule to tell from which
// routine it is called, which determines what its suggestedNodes contain.
type SchedulingPhase string
const (
// The Schedule is called during filterRoutine:
// The suggestedNodes contain all nodes that can fit the pod without any
// lower priority Pods preempted.
FilteringPhase SchedulingPhase = "Filtering"
// The Schedule is called during preemptRoutine:
// The suggestedNodes contain all nodes that can fit the pod after all
// lower priority Pods preempted.
PreemptingPhase SchedulingPhase = "Preempting"
)
// PodScheduleResult is the pod schedule decision returned by
// SchedulerAlgorithm.Schedule.
// Notes:
// 1. If the SchedulerAlgorithm found sufficient free resource, only PodBindInfo
//    should be set.
//    If the SchedulerAlgorithm found sufficient preemptible resource, only
//    PodPreemptInfo should be set.
//    Otherwise, only PodWaitInfo can be optionally set.
// 2. The selected node in PodBindInfo requires:
//    1. Must be within candidateNodes:
//       All existing nodes which can be constructed from AddNode/DeleteNode.
//       Otherwise, the Pod after bound will be probably deleted by the
//       GarbageCollectionController.
//    2. Better to be within suggestedNodes:
//       The input parameter of Schedule().
//       Otherwise, the Pod after bound will not respect previous pod schedule
//       decision from K8S Default Scheduler, i.e. this binding is incompatible
//       with K8S Default Scheduler.
type PodScheduleResult struct {
// Optionally set when neither free nor preemptible resource was found.
PodWaitInfo *PodWaitInfo
// Set when sufficient preemptible resource was found.
PodPreemptInfo *PodPreemptInfo
// Set when sufficient free resource was found.
PodBindInfo *si.PodBindInfo
}
// PodScheduleStatuses maps PodUID -> PodScheduleStatus.
type PodScheduleStatuses map[types.UID]*PodScheduleStatus
// PodScheduleStatus is used to track the PodScheduleResult of a Pod, together
// with the Pod itself and its current PodState.
type PodScheduleStatus struct {
// The Pod which will be used in current PodState.
// For example, in PodBinding state, it should be a Pod used to bind,
// i.e. with all placements set in the Pod.
Pod *core.Pod
// The current state of the Pod.
PodState PodState
// The number of Pod binding attempts already tried, i.e. how many times the
// Pod has been scheduled while in PodBinding state.
PodBindAttempts int32
// The tracked schedule decision for the Pod.
PodScheduleResult *PodScheduleResult
}
// PodState is the scheduling state of a Pod from the scheduler view, see
// PodUnknown, PodWaiting, PodPreempting, PodBinding and PodBound.
type PodState string
// PodUnknown is a [VirtualState]: this state is not tracked in PodScheduleStatuses.
//
// Pod is unknown for the scheduler, such as the Pod does not exist, completed
// or has not been informed to the scheduler.
//
// A completed Pod is the same as a deleted Pod from the scheduler view and they
// will never be informed to the scheduler, thus they are rejected to be scheduled.
// However, a Pod which is not completed but just not yet informed will be
// eventually informed to the scheduler and then transition to a non-VirtualState,
// thus it is accepted to be scheduled.
//
// This state can be transitioned from any state, but can only transition to:
// -> PodWaiting
// -> PodBound
const PodUnknown PodState = "Unknown"
// The PodState values below are annotated with the following tags:
// [AllocatedState]: The Pod is considered to be allocated from the scheduler view.
// Only the states explicitly tagged with it are allocated, see IsAllocated.
// "-> X": The annotated state can transition to state X.
const (
// Pod is waiting for preemptible or free resource to appear.
// [StartState]
// -> PodPreempting
// -> PodBinding
PodWaiting PodState = "Waiting"
// Pod is waiting for the appeared preemptible resource to be free by preemption.
// -> PodBinding
// -> PodWaiting
PodPreempting PodState = "Preempting"
// Pod is binding on the free resource.
// [AllocatedState]
// -> PodBound
PodBinding PodState = "Binding"
// Pod is bound on the free resource.
// [FinalState]
// [AllocatedState]
PodBound PodState = "Bound"
)
// IsAllocated reports whether state is one of the [AllocatedState] states,
// i.e. PodBinding or PodBound. Every other state yields false.
func IsAllocated(state PodState) bool {
	switch state {
	case PodBinding, PodBound:
		return true
	default:
		return false
	}
}
// PodWaitInfo carries the reason why a Pod has to wait.
// No need to use it to recover the scheduler waiting resource.
type PodWaitInfo struct {
// The reason why no preemptible or free resource to allocate the Pod now.
Reason string
}
// PodPreemptInfo carries the victim Pods to be preempted for a preemptor Pod.
// No need to use it to recover the scheduler preempting resource.
type PodPreemptInfo struct {
// Only need to include the victim Pods for the current preemptor Pod.
// Need to ensure the newly deleted victim Pods are eventually removed from here,
// otherwise, the default scheduler refuses to execute any preemption.
// Need to ensure the newly added victim Pods are eventually added to here,
// otherwise, the preemption will never complete.
// It can be empty, such as current preemptor Pod is waiting for the victim Pods
// of other preemptor Pods in the same group to be preempted.
// It can contain victim Pods across multiple nodes, such as a victim group may
// contain Pods across multiple nodes.
VictimPods []*core.Pod
}
// PodKey identifies a Pod by its namespace, name and UID.
// Its String method renders it as UID(Namespace/Name).
type PodKey struct {
Namespace string
Name string
UID types.UID
}
// NewPodKey allocates a PodKey and fills it with the given namespace, name
// and uid. The returned pointer is never nil.
func NewPodKey(namespace string, name string, uid types.UID) *PodKey {
	key := new(PodKey)
	key.Namespace = namespace
	key.Name = name
	key.UID = uid
	return key
}
// String renders the key as "UID(Namespace/Name)", each part formatted
// with the %v verb.
func (pk *PodKey) String() string {
	const layout = "%v(%v/%v)"
	return fmt.Sprintf(layout, pk.UID, pk.Namespace, pk.Name)
}