// MIT License // // Copyright (c) Microsoft Corporation. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE package internal import ( "fmt" si "github.com/microsoft/hivedscheduler/pkg/api" core "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" ei "k8s.io/kubernetes/pkg/scheduler/api" ) /////////////////////////////////////////////////////////////////////////////////////// // Internal Common Types /////////////////////////////////////////////////////////////////////////////////////// // WebServer Callbacks with K8S Default Scheduler // Notes: // 1. Error should be delivered by panic // 2. Should not assume previous succeeded operation also has been successfully // executed by K8S Default Scheduler. type ExtenderHandlers struct { FilterHandler func(args ei.ExtenderArgs) *ei.ExtenderFilterResult BindHandler func(args ei.ExtenderBindingArgs) *ei.ExtenderBindingResult PreemptHandler func(args ei.ExtenderPreemptionArgs) *ei.ExtenderPreemptionResult } type InspectHandlers struct { GetAllAffinityGroupsHandler func() si.AffinityGroupList GetAffinityGroupHandler func(groupName string) si.AffinityGroup GetClusterStatusHandler func() si.ClusterStatus GetPhysicalClusterStatusHandler func() si.PhysicalClusterStatus GetAllVirtualClustersStatusHandler func() map[si.VirtualClusterName]si.VirtualClusterStatus GetVirtualClusterStatusHandler func(vcName si.VirtualClusterName) si.VirtualClusterStatus } // SchedulerAlgorithm is used to make the pod schedule decision based on its whole // cluster scheduling view constructed from its Add/Update/Delete callbacks. // Notes: // 1. Error should be delivered by panic and it will not change any state. // For WebServer Callbacks, all Panics will be recovered as error responses, // see HandleInformerPanic. // For Informer Callbacks, only User Error Panics will be recovered as error // logs, other Panics will crash the whole process, see HandleWebServerPanic. // 2. Should take all the input parameters as readonly and return pod schedule // decision by PodScheduleResult. // 3. {Schedule, AddUnallocatedPod, DeleteUnallocatedPod, AddAllocatedPod, // DeleteAllocatedPod} will never be executed concurrently for all pods. // 4. [Schedule -> (AddAllocatedPod) -> Schedule -> ...] is executed sequentially // for all pods. // I.e. the constructed scheduling view is already lock protected. // 5. [START -> AddAllocatedPod -> DeleteAllocatedPod -> END] is executed // sequentially for one specific Pod. // I.e. once a specific Pod is allocated by AddAllocatedPod, its placement // will never be changed to another one. type SchedulerAlgorithm interface { // See details in SchedulingPhase and PodScheduleResult. Schedule(pod *core.Pod, suggestedNodes []string, phase SchedulingPhase) PodScheduleResult // Track all current Nodes in the whole cluster. AddNode(node *core.Node) UpdateNode(oldNode, newNode *core.Node) DeleteNode(node *core.Node) // Track all current unallocated and allocated Pods in the whole cluster. // Unallocated Pod includes both PodWaiting and PodPreempting Pods. AddUnallocatedPod(pod *core.Pod) DeleteUnallocatedPod(pod *core.Pod) // Allocated Pod includes both PodBound and PodBinding Pods. AddAllocatedPod(pod *core.Pod) DeleteAllocatedPod(pod *core.Pod) // Expose current scheduling status GetAllAffinityGroups() si.AffinityGroupList GetAffinityGroup(name string) si.AffinityGroup GetClusterStatus() si.ClusterStatus GetPhysicalClusterStatus() si.PhysicalClusterStatus GetAllVirtualClustersStatus() map[si.VirtualClusterName]si.VirtualClusterStatus GetVirtualClusterStatus(si.VirtualClusterName) si.VirtualClusterStatus } type SchedulingPhase string const ( // The Schedule is called during filterRoutine: // The suggestedNodes contain all nodes that can fit the pod without any // lower priority Pods preempted. FilteringPhase SchedulingPhase = "Filtering" // The Schedule is called during preemptRoutine: // The suggestedNodes contain all nodes that can fit the pod after all // lower priority Pods preempted. PreemptingPhase SchedulingPhase = "Preempting" ) // Notes: // 1. If the SchedulerAlgorithm found sufficient free resource, only PodBindInfo // should be set. // If the SchedulerAlgorithm found sufficient preemptible resource, only // PodPreemptInfo should be set. // Otherwise, only PodWaitInfo can be optionally set. // 2. The selected node in PodBindInfo requires: // 1. Must be within candidateNodes: // All existing nodes which can be constructed from AddNode/DeleteNode. // Otherwise, the Pod after bound will be probably deleted by the // GarbageCollectionController. // 2. Better to be within suggestedNodes: // The input parameter of Schedule(). // Otherwise, the Pod after bound will not respect previous pod schedule // decision from K8S Default Scheduler, i.e. this binding is incompatible // with K8S Default Scheduler. type PodScheduleResult struct { PodWaitInfo *PodWaitInfo PodPreemptInfo *PodPreemptInfo PodBindInfo *si.PodBindInfo } // PodUID -> PodScheduleStatus type PodScheduleStatuses map[types.UID]*PodScheduleStatus // Used to track the PodScheduleResult type PodScheduleStatus struct { // The Pod which will be used in current PodState. // For example, in PodBinding state, it should be a Pod used to bind, // i.e. with all placements set in the Pod. Pod *core.Pod PodState PodState // The already tried Pod binding attempts, i.e. the scheduled times for a Pod // in PodBinding state. PodBindAttempts int32 PodScheduleResult *PodScheduleResult } type PodState string // [VirtualState]: This state is not tracked in PodScheduleStatuses. // // Pod is unknown for the scheduler, such as the Pod does not exist, completed // or has not been informed to the scheduler. // // A completed Pod is the same as a deleted Pod from the scheduler view and they // will never be informed to the scheduler, thus they are rejected to be scheduled. // However, the not yet informed not completed Pod will be eventually informed to // the scheduler, then transition to not VirtualState, thus they are accepted // to be scheduled. // // This state can be transitioned from any state, but can only transition to: // -> PodWaiting // -> PodBound const PodUnknown PodState = "Unknown" // [AllocatedState]: The Pod is considered to be allocated from the scheduler view. const ( // Pod is waiting for preemptible or free resource to appear. // [StartState] // -> PodPreempting // -> PodBinding PodWaiting PodState = "Waiting" // Pod is waiting for the appeared preemptible resource to be free by preemption. // -> PodBinding // -> PodWaiting PodPreempting PodState = "Preempting" // Pod is binding on the free resource. // [AllocatedState] // -> PodBound PodBinding PodState = "Binding" // Pod is bound on the free resource. // [FinalState] // [AllocatedState] PodBound PodState = "Bound" ) func IsAllocated(state PodState) bool { return state == PodBinding || state == PodBound } // No need to use it recover scheduler waiting resource type PodWaitInfo struct { // The reason why no preemptible or free resource to allocate the Pod now. Reason string } // No need to use it recover scheduler preempting resource type PodPreemptInfo struct { // Only need to include the victim Pods for the current preemptor Pod. // Need to ensure the newly deleted victim Pods are eventually removed from here, // otherwise, the default scheduler refuse to execute any preemption. // Need to ensure the newly added victim Pods are eventually added to here, // otherwise, the preemption will never complete. // It can be empty, such as current preemptor Pod is waiting for the victim Pods // of other preemptor Pods in the same group to be preempted. // It can contain victim Pods across multiple nodes, such as a victim group may // contain Pods across multiple nodes. VictimPods []*core.Pod } type PodKey struct { Namespace string Name string UID types.UID } func NewPodKey(namespace string, name string, uid types.UID) *PodKey { return &PodKey{ Namespace: namespace, Name: name, UID: uid, } } func (pk *PodKey) String() string { return fmt.Sprintf("%v(%v/%v)", pk.UID, pk.Namespace, pk.Name) }