From caf3e95830007fda2aa7f6083abff70d4775e7df Mon Sep 17 00:00:00 2001 From: Mohit Vinchoo Date: Tue, 23 Sep 2025 22:23:29 -0700 Subject: [PATCH 01/13] Preempt: Add lightweight strict gang preemption (opt-in) Signed-off-by: Mohit Vinchoo --- pkg/scheduler/actions/preempt/preempt.go | 59 ++++++++++++++++++++---- 1 file changed, 51 insertions(+), 8 deletions(-) diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go index f237e3a162..f49e22dd3d 100644 --- a/pkg/scheduler/actions/preempt/preempt.go +++ b/pkg/scheduler/actions/preempt/preempt.go @@ -56,6 +56,9 @@ const ( MinCandidateNodesPercentageKey = "minCandidateNodesPercentage" MinCandidateNodesAbsoluteKey = "minCandidateNodesAbsolute" MaxCandidateNodesAbsoluteKey = "maxCandidateNodesAbsolute" + + // EnableStrictGangPreemptionKey gang.EnablePreemptable must be disabled for this feature to work flawlessly. + EnableStrictGangPreemptionKey = "enableStrictGangPreemption" ) type Action struct { @@ -69,6 +72,8 @@ type Action struct { minCandidateNodesPercentage int minCandidateNodesAbsolute int maxCandidateNodesAbsolute int + + enableStrictGangPreemption bool } func New() *Action { @@ -79,6 +84,7 @@ func New() *Action { minCandidateNodesPercentage: 10, minCandidateNodesAbsolute: 1, maxCandidateNodesAbsolute: 100, + enableStrictGangPreemption: false, } } @@ -96,6 +102,7 @@ func (pmpt *Action) parseArguments(ssn *framework.Session) { arguments.GetInt(&pmpt.minCandidateNodesPercentage, MinCandidateNodesPercentageKey) arguments.GetInt(&pmpt.minCandidateNodesAbsolute, MinCandidateNodesAbsoluteKey) arguments.GetInt(&pmpt.maxCandidateNodesAbsolute, MaxCandidateNodesAbsoluteKey) + arguments.GetBool(&pmpt.enableStrictGangPreemption, EnableStrictGangPreemptionKey) pmpt.ssn = ssn } @@ -316,6 +323,27 @@ func (pmpt *Action) normalPreempt( currentQueue := ssn.Queues[job.Queue] + // Check if we have a node with resources available due to strictGangPreemption + if pmpt.enableStrictGangPreemption { + klog.V(4).Infof("Checking if nodes have free space due to previous strict gang preemption.") + for _, node := range selectedNodes { + if !preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) { + continue + } + if ssn.Allocatable(currentQueue, preemptor) && preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) { + if err := stmt.Pipeline(preemptor, node.Name, true); err != nil { + klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>", + preemptor.Namespace, preemptor.Name, node.Name) + if rollbackErr := stmt.UnPipeline(preemptor); rollbackErr != nil { + klog.Errorf("Failed to unpipeline Task %v on %v in Session %v for %v.", + preemptor.UID, node.Name, ssn.UID, rollbackErr) + } + } + return true, nil + } + } + } + assigned := false for _, node := range selectedNodes { @@ -356,15 +384,30 @@ func (pmpt *Action) normalPreempt( if ssn.Allocatable(currentQueue, preemptor) && preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) { break } - preemptee := victimsQueue.Pop().(*api.TaskInfo) - klog.V(3).Infof("Try to preempt Task <%s/%s> for Task <%s/%s>", - preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name) - if err := stmt.Evict(preemptee, "preempt"); err != nil { - klog.Errorf("Failed to preempt Task <%s/%s> for Task <%s/%s>: %v", - preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name, err) - continue + var evictionPreemptees []*api.TaskInfo + if pmpt.enableStrictGangPreemption { + parentPreemptee := victimsQueue.Pop().(*api.TaskInfo) + 
klog.V(3).Infof("StrictGangPreemption: Victim Task <%s/%s> from Job <%s/%s> for Task <%s/%s>", + parentPreemptee.Namespace, parentPreemptee.Name, ssn.Jobs[parentPreemptee.Job].Name, + ssn.Jobs[parentPreemptee.Job].Namespace, preemptor.Namespace, preemptor.Name) + for _, t := range ssn.Jobs[parentPreemptee.Job].Tasks { + evictionPreemptees = append(evictionPreemptees, t) + } + } else { + preemptee := victimsQueue.Pop().(*api.TaskInfo) + evictionPreemptees = append(evictionPreemptees, preemptee) + } + + for _, preemptee := range evictionPreemptees { + klog.V(3).Infof("Try to preempt Task <%s/%s> for Task <%s/%s>", + preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name) + if err := stmt.Evict(preemptee, "preempt"); err != nil { + klog.Errorf("Failed to preempt Task <%s/%s> for Task <%s/%s>: %v", + preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name, err) + continue + } + preempted.Add(preemptee.Resreq) } - preempted.Add(preemptee.Resreq) } evictionOccurred := false From 232ef0dcdc2809b7bbc3f16f7e94ae52d6f677be Mon Sep 17 00:00:00 2001 From: Mohit Vinchoo Date: Tue, 23 Sep 2025 23:09:58 -0700 Subject: [PATCH 02/13] Remove double check Signed-off-by: Mohit Vinchoo --- pkg/scheduler/actions/preempt/preempt.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go index f49e22dd3d..2f2ea5c85c 100644 --- a/pkg/scheduler/actions/preempt/preempt.go +++ b/pkg/scheduler/actions/preempt/preempt.go @@ -327,9 +327,6 @@ func (pmpt *Action) normalPreempt( if pmpt.enableStrictGangPreemption { klog.V(4).Infof("Checking if nodes have free space due to previous strict gang preemption.") for _, node := range selectedNodes { - if !preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) { - continue - } if ssn.Allocatable(currentQueue, preemptor) && preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) { if err := stmt.Pipeline(preemptor, node.Name, true); err != nil { klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>", From c75285337a4b1e84990d674d9d0143fe3f925618 Mon Sep 17 00:00:00 2001 From: Mohit Vinchoo Date: Tue, 23 Sep 2025 23:13:01 -0700 Subject: [PATCH 03/13] Fix potential double eviction Signed-off-by: Mohit Vinchoo --- pkg/scheduler/actions/preempt/preempt.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go index 2f2ea5c85c..dcb9063240 100644 --- a/pkg/scheduler/actions/preempt/preempt.go +++ b/pkg/scheduler/actions/preempt/preempt.go @@ -384,11 +384,21 @@ func (pmpt *Action) normalPreempt( var evictionPreemptees []*api.TaskInfo if pmpt.enableStrictGangPreemption { parentPreemptee := victimsQueue.Pop().(*api.TaskInfo) + + // If the victim task's job is already being preempted (status is Releasing), skip. + // This avoids double-counting resources and redundant evictions. + if task, found := ssn.Jobs[parentPreemptee.Job].Tasks[parentPreemptee.UID]; found && task.Status == api.Releasing { + continue + } + klog.V(3).Infof("StrictGangPreemption: Victim Task <%s/%s> from Job <%s/%s> for Task <%s/%s>", parentPreemptee.Namespace, parentPreemptee.Name, ssn.Jobs[parentPreemptee.Job].Name, ssn.Jobs[parentPreemptee.Job].Namespace, preemptor.Namespace, preemptor.Name) for _, t := range ssn.Jobs[parentPreemptee.Job].Tasks { - evictionPreemptees = append(evictionPreemptees, t) + // Only evict tasks that are not in a terminal state. 
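+ // Succeeded and Failed tasks are terminal and hold no node resources left to reclaim.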
+ if t.Status != api.Succeeded && t.Status != api.Failed { + evictionPreemptees = append(evictionPreemptees, t) + } } } else { preemptee := victimsQueue.Pop().(*api.TaskInfo) From 495eec4ec58d22a3c85593ad758740da076d20cd Mon Sep 17 00:00:00 2001 From: Mohit Vinchoo Date: Tue, 23 Sep 2025 23:30:17 -0700 Subject: [PATCH 04/13] Add basic unit test with feature flag off Signed-off-by: Mohit Vinchoo --- pkg/scheduler/actions/preempt/preempt_test.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pkg/scheduler/actions/preempt/preempt_test.go b/pkg/scheduler/actions/preempt/preempt_test.go index bc332f8838..0f423a7844 100644 --- a/pkg/scheduler/actions/preempt/preempt_test.go +++ b/pkg/scheduler/actions/preempt/preempt_test.go @@ -314,6 +314,20 @@ func TestPreempt(t *testing.T) { } }) } + actions = []framework.Action{New()} + for i, test := range tests { + test.Plugins = plugins + test.PriClass = []*schedulingv1.PriorityClass{highPrio, lowPrio} + t.Run(test.Name, func(t *testing.T) { + test.RegisterSession(tiers, []conf.Configuration{{Name: actions[0].Name(), + Arguments: map[string]interface{}{EnableStrictGangPreemptionKey: false}}}) + defer test.Close() + test.Run(actions) + if err := test.CheckAll(i); err != nil { + t.Fatal(err) + } + }) + } } func TestTopologyAwarePreempt(t *testing.T) { From 069143c935ecebafe82358a4c30ac783e724efd4 Mon Sep 17 00:00:00 2001 From: Mohit Vinchoo Date: Tue, 23 Sep 2025 23:38:20 -0700 Subject: [PATCH 05/13] Merge arguments map Signed-off-by: Mohit Vinchoo --- pkg/scheduler/actions/preempt/preempt_test.go | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/pkg/scheduler/actions/preempt/preempt_test.go b/pkg/scheduler/actions/preempt/preempt_test.go index 0f423a7844..5ebc2db0a3 100644 --- a/pkg/scheduler/actions/preempt/preempt_test.go +++ b/pkg/scheduler/actions/preempt/preempt_test.go @@ -306,21 +306,8 @@ func TestPreempt(t *testing.T) { test.PriClass = []*schedulingv1.PriorityClass{highPrio, lowPrio} t.Run(test.Name, func(t *testing.T) { test.RegisterSession(tiers, []conf.Configuration{{Name: actions[0].Name(), - Arguments: map[string]interface{}{EnableTopologyAwarePreemptionKey: false}}}) - defer test.Close() - test.Run(actions) - if err := test.CheckAll(i); err != nil { - t.Fatal(err) - } - }) - } - actions = []framework.Action{New()} - for i, test := range tests { - test.Plugins = plugins - test.PriClass = []*schedulingv1.PriorityClass{highPrio, lowPrio} - t.Run(test.Name, func(t *testing.T) { - test.RegisterSession(tiers, []conf.Configuration{{Name: actions[0].Name(), - Arguments: map[string]interface{}{EnableStrictGangPreemptionKey: false}}}) + Arguments: map[string]interface{}{EnableTopologyAwarePreemptionKey: false, + EnableStrictGangPreemptionKey: false}}}) defer test.Close() test.Run(actions) if err := test.CheckAll(i); err != nil { From f563df6292a4ed754f4d05dfbf7a3f6a4c0deadc Mon Sep 17 00:00:00 2001 From: Mohit Vinchoo Date: Tue, 23 Sep 2025 23:59:03 -0700 Subject: [PATCH 06/13] Add basic happy path unittest Signed-off-by: Mohit Vinchoo --- pkg/scheduler/actions/preempt/preempt_test.go | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/pkg/scheduler/actions/preempt/preempt_test.go b/pkg/scheduler/actions/preempt/preempt_test.go index 5ebc2db0a3..d51128b3fb 100644 --- a/pkg/scheduler/actions/preempt/preempt_test.go +++ b/pkg/scheduler/actions/preempt/preempt_test.go @@ -612,6 +612,88 @@ func TestTopologyAwarePreempt(t *testing.T) { } } +func TestStrictGangPreempt(t *testing.T) { + 
plugins := map[string]framework.PluginBuilder{ + conformance.PluginName: conformance.New, + gang.PluginName: gang.New, + priority.PluginName: priority.New, + proportion.PluginName: proportion.New, + } + highPrio := util.BuildPriorityClass("high-priority", 100000) + lowPrio := util.BuildPriorityClass("low-priority", 10) + + tests := []uthelper.TestCommonStruct{ + { + Name: "when one task is preempted all gang members are preempted", + PodGroups: []*schedulingv1beta1.PodGroup{ + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 1, map[string]int32{"": 3}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q1", 1, map[string]int32{"": 2}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + }, + Pods: []*v1.Pod{ + util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), + util.BuildPod("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("1", "1G"), "pg2", make(map[string]string), make(map[string]string)), + }, + Nodes: []*v1.Node{ + util.BuildNode("n1", api.BuildResourceList("3", "3G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + }, + Queues: []*schedulingv1beta1.Queue{ + util.BuildQueue("q1", 1, nil), + }, + ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee2", "c1/preemptee3"}, + ExpectEvictNum: 3, + }, + } + + trueValue := true + falseValue := false + tiers := []conf.Tier{ + { + Plugins: []conf.PluginOption{ + { + Name: conformance.PluginName, + EnabledPreemptable: &trueValue, + }, + { + Name: gang.PluginName, + EnabledPreemptable: &falseValue, + EnabledJobPipelined: &trueValue, + EnabledJobStarving: &trueValue, + }, + { + Name: priority.PluginName, + EnabledTaskOrder: &trueValue, + EnabledJobOrder: &trueValue, + EnabledPreemptable: &trueValue, + EnabledJobPipelined: &trueValue, + EnabledJobStarving: &trueValue, + }, + { + Name: proportion.PluginName, + EnabledOverused: &trueValue, + EnabledAllocatable: &trueValue, + EnabledQueueOrder: &trueValue, + }, + }, + }} + + actions := []framework.Action{New()} + for i, test := range tests { + test.Plugins = plugins + test.PriClass = []*schedulingv1.PriorityClass{highPrio, lowPrio} + t.Run(test.Name, func(t *testing.T) { + test.RegisterSession(tiers, []conf.Configuration{{Name: actions[0].Name(), + Arguments: map[string]interface{}{EnableStrictGangPreemptionKey: true}}}) + defer test.Close() + test.Run(actions) + if err := test.CheckAll(i); err != nil { + t.Fatal(err) + } + }) + } +} + func buildPodWithPodAntiAffinity(name, namespace, node string, phase v1.PodPhase, req v1.ResourceList, groupName string, labels map[string]string, selector map[string]string, topologyKey string) *v1.Pod { pod := util.BuildPod(name, namespace, node, phase, req, groupName, labels, selector) From 211ab87c7a1637f3cf3170aad63dbdfb8e9a13d3 Mon Sep 17 00:00:00 2001 From: Mohit Vinchoo Date: Wed, 24 Sep 2025 18:34:50 -0700 Subject: [PATCH 07/13] Add test Signed-off-by: Mohit Vinchoo --- pkg/scheduler/actions/preempt/preempt_test.go | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff 
--git a/pkg/scheduler/actions/preempt/preempt_test.go b/pkg/scheduler/actions/preempt/preempt_test.go index d51128b3fb..0f9fab01da 100644 --- a/pkg/scheduler/actions/preempt/preempt_test.go +++ b/pkg/scheduler/actions/preempt/preempt_test.go @@ -644,6 +644,34 @@ func TestStrictGangPreempt(t *testing.T) { ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee2", "c1/preemptee3"}, ExpectEvictNum: 3, }, + { + Name: "when strict preemption makes space dont preempt more", + PodGroups: []*schedulingv1beta1.PodGroup{ + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 1, map[string]int32{"": 2}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q1", 1, map[string]int32{"": 2}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg3", "c1", "q1", 1, map[string]int32{"": 1}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + util.BuildPodGroupWithPrio("pg4", "c1", "q1", 1, map[string]int32{"": 1}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + }, + Pods: []*v1.Pod{ + util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee2", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), + util.BuildPod("c1", "preemptee3", "n3", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), + util.BuildPod("c1", "preemptee4", "n4", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), + util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("1", "1G"), "pg3", make(map[string]string), make(map[string]string)), + util.BuildPod("c1", "preemptor2", "", v1.PodPending, api.BuildResourceList("1", "1G"), "pg4", make(map[string]string), make(map[string]string)), + }, + Nodes: []*v1.Node{ + util.BuildNode("n1", api.BuildResourceList("1", "1G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n2", api.BuildResourceList("1", "1G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n3", api.BuildResourceList("1", "1G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n4", api.BuildResourceList("1", "1G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + }, + Queues: []*schedulingv1beta1.Queue{ + util.BuildQueue("q1", 1, nil), + }, + ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee2"}, + ExpectEvictNum: 2, + }, } trueValue := true From 6d082c2fc22882aedc103c2242d37006f3efb3cd Mon Sep 17 00:00:00 2001 From: Mohit Vinchoo Date: Sun, 28 Sep 2025 04:24:55 -0700 Subject: [PATCH 08/13] Add support for plugin level overrides Signed-off-by: Mohit Vinchoo --- pkg/scheduler/actions/preempt/preempt.go | 40 +++++++++-------- pkg/scheduler/actions/preempt/preempt_test.go | 43 ++++++++++++++----- 2 files changed, 54 insertions(+), 29 deletions(-) diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go index dcb9063240..efe52a4578 100644 --- a/pkg/scheduler/actions/preempt/preempt.go +++ b/pkg/scheduler/actions/preempt/preempt.go @@ -57,7 +57,6 @@ const ( MinCandidateNodesAbsoluteKey = 
"minCandidateNodesAbsolute" MaxCandidateNodesAbsoluteKey = "maxCandidateNodesAbsolute" - // EnableStrictGangPreemptionKey gang.EnablePreemptable must be disabled for this feature to work flawlessly. EnableStrictGangPreemptionKey = "enableStrictGangPreemption" ) @@ -381,39 +380,44 @@ func (pmpt *Action) normalPreempt( if ssn.Allocatable(currentQueue, preemptor) && preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) { break } + preemptee := victimsQueue.Pop().(*api.TaskInfo) var evictionPreemptees []*api.TaskInfo + evictionPreemptees = append(evictionPreemptees, preemptee) if pmpt.enableStrictGangPreemption { - parentPreemptee := victimsQueue.Pop().(*api.TaskInfo) - // If the victim task's job is already being preempted (status is Releasing), skip. // This avoids double-counting resources and redundant evictions. - if task, found := ssn.Jobs[parentPreemptee.Job].Tasks[parentPreemptee.UID]; found && task.Status == api.Releasing { + if task, f := ssn.Jobs[preemptee.Job].Tasks[preemptee.UID]; f && task.Status == api.Releasing { continue } + klog.V(3).Infof("StrictGangPreemption: Initial Victim Task <%s/%s> from Job <%s/%s> for Task <%s/%s>", + preemptee.Namespace, preemptee.Name, ssn.Jobs[preemptee.Job].Name, + ssn.Jobs[preemptee.Job].Namespace, preemptor.Namespace, preemptor.Name) - klog.V(3).Infof("StrictGangPreemption: Victim Task <%s/%s> from Job <%s/%s> for Task <%s/%s>", - parentPreemptee.Namespace, parentPreemptee.Name, ssn.Jobs[parentPreemptee.Job].Name, - ssn.Jobs[parentPreemptee.Job].Namespace, preemptor.Namespace, preemptor.Name) - for _, t := range ssn.Jobs[parentPreemptee.Job].Tasks { + var victimJobTasksToEvict []*api.TaskInfo + for _, t := range ssn.Jobs[preemptee.Job].Tasks { // Only evict tasks that are not in a terminal state. - if t.Status != api.Succeeded && t.Status != api.Failed { - evictionPreemptees = append(evictionPreemptees, t) + if t.Status != api.Succeeded && t.Status != api.Failed && t.UID != preemptee.UID { + victimJobTasksToEvict = append(victimJobTasksToEvict, t) } } - } else { - preemptee := victimsQueue.Pop().(*api.TaskInfo) - evictionPreemptees = append(evictionPreemptees, preemptee) + // Refilter preemptees as all the tasks in PreempteeJob are now potential victims. + // This ensures that plugin level overrides are respected. 
+ victimJobPQ := ssn.BuildVictimsPriorityQueue(victimJobTasksToEvict, preemptor) + for !victimJobPQ.Empty() { + evictionPreemptees = append(evictionPreemptees, victimJobPQ.Pop().(*api.TaskInfo)) + } + evictionPreemptees = ssn.Preemptable(preemptor, evictionPreemptees) } - for _, preemptee := range evictionPreemptees { + for _, p := range evictionPreemptees { klog.V(3).Infof("Try to preempt Task <%s/%s> for Task <%s/%s>", - preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name) - if err := stmt.Evict(preemptee, "preempt"); err != nil { + p.Namespace, p.Name, preemptor.Namespace, preemptor.Name) + if err := stmt.Evict(p, "preempt"); err != nil { klog.Errorf("Failed to preempt Task <%s/%s> for Task <%s/%s>: %v", - preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name, err) + p.Namespace, p.Name, preemptor.Namespace, preemptor.Name, err) continue } - preempted.Add(preemptee.Resreq) + preempted.Add(p.Resreq) } } diff --git a/pkg/scheduler/actions/preempt/preempt_test.go b/pkg/scheduler/actions/preempt/preempt_test.go index 0f9fab01da..b8a661d1f9 100644 --- a/pkg/scheduler/actions/preempt/preempt_test.go +++ b/pkg/scheduler/actions/preempt/preempt_test.go @@ -621,17 +621,17 @@ func TestStrictGangPreempt(t *testing.T) { } highPrio := util.BuildPriorityClass("high-priority", 100000) lowPrio := util.BuildPriorityClass("low-priority", 10) - + priority3, priority2, priority1 := int32(3), int32(2), int32(1) tests := []uthelper.TestCommonStruct{ { - Name: "when one task is preempted all gang members are preempted", + Name: "when one task is preempted all possible gang members are preempted", PodGroups: []*schedulingv1beta1.PodGroup{ - util.BuildPodGroupWithPrio("pg1", "c1", "q1", 1, map[string]int32{"": 3}, schedulingv1beta1.PodGroupInqueue, "low-priority"), - util.BuildPodGroupWithPrio("pg2", "c1", "q1", 1, map[string]int32{"": 2}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), }, Pods: []*v1.Pod{ util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), - util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), + util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), util.BuildPod("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("1", "1G"), "pg2", make(map[string]string), make(map[string]string)), }, @@ -644,13 +644,35 @@ func TestStrictGangPreempt(t *testing.T) { ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee2", "c1/preemptee3"}, ExpectEvictNum: 3, }, + { + Name: "when one task is preempted minMembers field is respected for preemptee Job", + PodGroups: []*schedulingv1beta1.PodGroup{ + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg2", 
"c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + }, + Pods: []*v1.Pod{ + // Using BuildPodWithPriority to keep the test case deterministic + util.BuildPodWithPriority("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string), &priority1), + util.BuildPodWithPriority("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string), &priority2), + util.BuildPodWithPriority("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string), &priority3), + util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("1", "1G"), "pg2", make(map[string]string), make(map[string]string)), + }, + Nodes: []*v1.Node{ + util.BuildNode("n1", api.BuildResourceList("3", "3G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + }, + Queues: []*schedulingv1beta1.Queue{ + util.BuildQueue("q1", 1, nil), + }, + ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee2"}, + ExpectEvictNum: 2, + }, { Name: "when strict preemption makes space dont preempt more", PodGroups: []*schedulingv1beta1.PodGroup{ - util.BuildPodGroupWithPrio("pg1", "c1", "q1", 1, map[string]int32{"": 2}, schedulingv1beta1.PodGroupInqueue, "low-priority"), - util.BuildPodGroupWithPrio("pg2", "c1", "q1", 1, map[string]int32{"": 2}, schedulingv1beta1.PodGroupInqueue, "low-priority"), - util.BuildPodGroupWithPrio("pg3", "c1", "q1", 1, map[string]int32{"": 1}, schedulingv1beta1.PodGroupInqueue, "high-priority"), - util.BuildPodGroupWithPrio("pg4", "c1", "q1", 1, map[string]int32{"": 1}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg3", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + util.BuildPodGroupWithPrio("pg4", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), }, Pods: []*v1.Pod{ util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), @@ -675,7 +697,6 @@ func TestStrictGangPreempt(t *testing.T) { } trueValue := true - falseValue := false tiers := []conf.Tier{ { Plugins: []conf.PluginOption{ @@ -685,7 +706,7 @@ func TestStrictGangPreempt(t *testing.T) { }, { Name: gang.PluginName, - EnabledPreemptable: &falseValue, + EnabledPreemptable: &trueValue, EnabledJobPipelined: &trueValue, EnabledJobStarving: &trueValue, }, From 952d858c2139152c0404134c557433189bfd09ba Mon Sep 17 00:00:00 2001 From: Mohit Vinchoo Date: Sat, 4 Oct 2025 18:15:28 -0700 Subject: [PATCH 09/13] Add full feature with multiple policies Signed-off-by: Mohit Vinchoo --- pkg/scheduler/actions/preempt/preempt.go | 308 ++++++++++++--- pkg/scheduler/actions/preempt/preempt_test.go | 358 ++++++++++++++++-- 2 files changed, 566 insertions(+), 100 deletions(-) diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go index efe52a4578..1942e7688e 100644 --- 
a/pkg/scheduler/actions/preempt/preempt.go +++ b/pkg/scheduler/actions/preempt/preempt.go @@ -29,6 +29,7 @@ import ( "math" "math/rand" "sort" + "strings" "sync" "sync/atomic" "time" @@ -56,8 +57,15 @@ const ( MinCandidateNodesPercentageKey = "minCandidateNodesPercentage" MinCandidateNodesAbsoluteKey = "minCandidateNodesAbsolute" MaxCandidateNodesAbsoluteKey = "maxCandidateNodesAbsolute" + GangPreemptionModeKey = "gangPreemptionMode" +) + +type GangPreemptionMode uint8 - EnableStrictGangPreemptionKey = "enableStrictGangPreemption" +const ( + GPModeOff GangPreemptionMode = iota + GPModeMinimal + GPModeAtomic ) type Action struct { @@ -72,7 +80,7 @@ type Action struct { minCandidateNodesAbsolute int maxCandidateNodesAbsolute int - enableStrictGangPreemption bool + gpMode GangPreemptionMode } func New() *Action { @@ -83,7 +91,7 @@ func New() *Action { minCandidateNodesPercentage: 10, minCandidateNodesAbsolute: 1, maxCandidateNodesAbsolute: 100, - enableStrictGangPreemption: false, + gpMode: GPModeOff, } } @@ -101,10 +109,26 @@ func (pmpt *Action) parseArguments(ssn *framework.Session) { arguments.GetInt(&pmpt.minCandidateNodesPercentage, MinCandidateNodesPercentageKey) arguments.GetInt(&pmpt.minCandidateNodesAbsolute, MinCandidateNodesAbsoluteKey) arguments.GetInt(&pmpt.maxCandidateNodesAbsolute, MaxCandidateNodesAbsoluteKey) - arguments.GetBool(&pmpt.enableStrictGangPreemption, EnableStrictGangPreemptionKey) + var gpModeStr string + arguments.GetString(&gpModeStr, GangPreemptionModeKey) + pmpt.gpMode = parseGPMode(gpModeStr) pmpt.ssn = ssn } +func parseGPMode(s string) GangPreemptionMode { + switch strings.ToLower(strings.TrimSpace(s)) { + case "", "off", "disabled": + return GPModeOff + case "minimal": + return GPModeMinimal + case "atomic": + return GPModeAtomic + default: + klog.V(3).Infof("Unrecognized Gang Preemption Mode, defaulting to `disabled`") + return GPModeOff + } +} + func (pmpt *Action) Execute(ssn *framework.Session) { klog.V(5).Infof("Enter Preempt ...") defer klog.V(5).Infof("Leaving Preempt ...") @@ -300,7 +324,9 @@ func (pmpt *Action) preempt( if pmpt.enableTopologyAwarePreemption { return pmpt.topologyAwarePreempt(ssn, stmt, preemptor, filter, predicateNodes) } - + if pmpt.gpMode != GPModeOff { + return pmpt.gangPreempt(ssn, stmt, preemptor, filter, predicateNodes) + } return pmpt.normalPreempt(ssn, stmt, preemptor, filter, predicateNodes) } @@ -322,24 +348,6 @@ func (pmpt *Action) normalPreempt( currentQueue := ssn.Queues[job.Queue] - // Check if we have a node with resources available due to strictGangPreemption - if pmpt.enableStrictGangPreemption { - klog.V(4).Infof("Checking if nodes have free space due to previous strict gang preemption.") - for _, node := range selectedNodes { - if ssn.Allocatable(currentQueue, preemptor) && preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) { - if err := stmt.Pipeline(preemptor, node.Name, true); err != nil { - klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>", - preemptor.Namespace, preemptor.Name, node.Name) - if rollbackErr := stmt.UnPipeline(preemptor); rollbackErr != nil { - klog.Errorf("Failed to unpipeline Task %v on %v in Session %v for %v.", - preemptor.UID, node.Name, ssn.UID, rollbackErr) - } - } - return true, nil - } - } - } - assigned := false for _, node := range selectedNodes { @@ -381,44 +389,14 @@ func (pmpt *Action) normalPreempt( break } preemptee := victimsQueue.Pop().(*api.TaskInfo) - var evictionPreemptees []*api.TaskInfo - evictionPreemptees = append(evictionPreemptees, preemptee) - if 
pmpt.enableStrictGangPreemption { - // If the victim task's job is already being preempted (status is Releasing), skip. - // This avoids double-counting resources and redundant evictions. - if task, f := ssn.Jobs[preemptee.Job].Tasks[preemptee.UID]; f && task.Status == api.Releasing { - continue - } - klog.V(3).Infof("StrictGangPreemption: Initial Victim Task <%s/%s> from Job <%s/%s> for Task <%s/%s>", - preemptee.Namespace, preemptee.Name, ssn.Jobs[preemptee.Job].Name, - ssn.Jobs[preemptee.Job].Namespace, preemptor.Namespace, preemptor.Name) - - var victimJobTasksToEvict []*api.TaskInfo - for _, t := range ssn.Jobs[preemptee.Job].Tasks { - // Only evict tasks that are not in a terminal state. - if t.Status != api.Succeeded && t.Status != api.Failed && t.UID != preemptee.UID { - victimJobTasksToEvict = append(victimJobTasksToEvict, t) - } - } - // Refilter preemptees as all the tasks in PreempteeJob are now potential victims. - // This ensures that plugin level overrides are respected. - victimJobPQ := ssn.BuildVictimsPriorityQueue(victimJobTasksToEvict, preemptor) - for !victimJobPQ.Empty() { - evictionPreemptees = append(evictionPreemptees, victimJobPQ.Pop().(*api.TaskInfo)) - } - evictionPreemptees = ssn.Preemptable(preemptor, evictionPreemptees) - } - - for _, p := range evictionPreemptees { - klog.V(3).Infof("Try to preempt Task <%s/%s> for Task <%s/%s>", - p.Namespace, p.Name, preemptor.Namespace, preemptor.Name) - if err := stmt.Evict(p, "preempt"); err != nil { - klog.Errorf("Failed to preempt Task <%s/%s> for Task <%s/%s>: %v", - p.Namespace, p.Name, preemptor.Namespace, preemptor.Name, err) - continue - } - preempted.Add(p.Resreq) + klog.V(3).Infof("Try to preempt Task <%s/%s> for Task <%s/%s>", + preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name) + if err := stmt.Evict(preemptee, "preempt"); err != nil { + klog.Errorf("Failed to preempt Task <%s/%s> for Task <%s/%s>: %v", + preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name, err) + continue } + preempted.Add(preemptee.Resreq) } evictionOccurred := false @@ -451,6 +429,216 @@ func (pmpt *Action) normalPreempt( return assigned, nil } +func (pmpt *Action) gangPreempt( + ssn *framework.Session, + stmt *framework.Statement, + preemptor *api.TaskInfo, + filter func(*api.TaskInfo) bool, + predicateNodes []*api.NodeInfo, +) (bool, error) { + klog.V(3).Infof("Running Gang Preemption with mode: %v", pmpt.gpMode) + preemptorJob, found := ssn.Jobs[preemptor.Job] + if !found { + return false, fmt.Errorf("not found Job %s in Session", preemptor.Job) + } + currentQueue := ssn.Queues[preemptorJob.Queue] + var preemptees []*api.TaskInfo + for _, node := range predicateNodes { + for _, task := range node.Tasks { + if filter == nil { + preemptees = append(preemptees, task.Clone()) + } else if filter(task) { + preemptees = append(preemptees, task.Clone()) + } + } + } + preemptees = ssn.Preemptable(preemptor, preemptees) + nodeJobPreempteesMap := make(map[string]map[api.JobID][]*api.TaskInfo) + for _, p := range preemptees { + nodeName := p.NodeName + if _, ok := nodeJobPreempteesMap[nodeName]; !ok { + nodeJobPreempteesMap[nodeName] = make(map[api.JobID][]*api.TaskInfo) + } + jobID := p.Job + if preemptorJob.UID == jobID { + continue + } + nodeJobPreempteesMap[nodeName][jobID] = append(nodeJobPreempteesMap[nodeName][jobID], p) + } + // Node order comes into play to keep results deterministic when there is a tie for best fitting gang + nodeScores := util.PrioritizeNodes(preemptor, predicateNodes, 
ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn) + selectedNodes := util.SortNodes(nodeScores) + preempteeJobIds, targetNode := pmpt.findBestPreemptionTarget(selectedNodes, nodeJobPreempteesMap, preemptor) + if targetNode == nil { + klog.V(3).Infof("No suitable target nodes for preemptor: <%s/%s>", preemptor.Namespace, preemptor.Name) + return false, nil + } + preempted := api.EmptyResource() + var victims []*api.TaskInfo + for _, jid := range preempteeJobIds { + var vics []*api.TaskInfo + if pmpt.gpMode == GPModeMinimal { + vics = nodeJobPreempteesMap[targetNode.Name][jid] + } else { + for _, t := range ssn.Jobs[jid].Tasks { + vics = append(vics, t) + } + } + victims = append(victims, vics...) + } + + if pmpt.gpMode == GPModeMinimal { + // If we are evicting minimal tasks, preempt the lower priority ones first. + vq := ssn.BuildVictimsPriorityQueue(victims, preemptor) + victims = victims[:0] + for vq.Len() > 0 { + victims = append(victims, vq.Pop().(*api.TaskInfo)) + } + } + + for _, preemptee := range victims { + metrics.RegisterPreemptionAttempts() + klog.V(3).Infof("Try to preempt Task <%s/%s> for Task <%s/%s>", + preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name) + if err := stmt.Evict(preemptee, "preempt"); err != nil { + klog.Errorf("Failed to preempt Task <%s/%s> for Task <%s/%s>: %v", + preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name, err) + continue + } + preempted.Add(preemptee.Resreq) + if pmpt.gpMode == GPModeMinimal && preemptor.InitResreq.LessEqual(targetNode.FutureIdle(), api.Zero) { + break + } + } + // If this check fails, it implies some Evictions failed. + // Since we are optimizing for gangs per node we should try again in next session + if ssn.Allocatable(currentQueue, preemptor) && preemptor.InitResreq.LessEqual(targetNode.FutureIdle(), api.Zero) { + if err := stmt.Pipeline(preemptor, targetNode.Name, !preempted.IsEmpty()); err != nil { + klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>", + preemptor.Namespace, preemptor.Name, targetNode.Name) + if rollbackErr := stmt.UnPipeline(preemptor); rollbackErr != nil { + klog.Errorf("Failed to unpipeline Task %v on %v in Session %v for %v.", + preemptor.UID, targetNode.Name, ssn.UID, rollbackErr) + } + } + } + return true, nil +} + +// Returns the node and the minimal-count set of victim jobs on that node. +// If no preemption is needed: jobs == nil, node != nil. +// If impossible on all nodes: jobs == nil, node == nil. 
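+// Selection strategy (see the Phase 1 / Phase 2 blocks below): first look for a single
+// victim job on some node whose evictable tasks cover the preemptor's unmet need with
+// the least overage; only if no single job suffices, greedily combine jobs per node
+// (largest first), preferring the combination that disrupts the fewest jobs and, on
+// ties, wastes the least resources.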
+func (pmpt *Action) findBestPreemptionTarget( + selectedNodes []*api.NodeInfo, + nodeJobVictimsMap map[string]map[api.JobID][]*api.TaskInfo, + preemptor *api.TaskInfo, +) (jobs []api.JobID, node *api.NodeInfo) { + // ---- Phase 1: try best single-job victim (minimal overage) ---- + var ( + bestSingleOver *api.Resource + bestSingleJob api.JobID + bestSingleNode *api.NodeInfo + ) + for _, n := range selectedNodes { + idle := n.FutureIdle() + // Fits without preemption + if preemptor.InitResreq.LessEqual(idle, api.Zero) { + return nil, n + } + need := preemptor.InitResreq.Clone() + // idle could be 0 + need.SubWithoutAssert(idle) + for j, jtasks := range nodeJobVictimsMap[n.Name] { + sum := api.EmptyResource() + for _, t := range jtasks { + if t.Job == preemptor.Job { + continue + } + sum.Add(t.Resreq) + } + if !need.LessEqual(sum, api.Zero) { + continue + } + diff := sum.Sub(need) // overage = sum - need + if bestSingleOver == nil || !bestSingleOver.LessEqual(diff, api.Zero) { + bestSingleOver = diff + bestSingleJob = j + bestSingleNode = n + } + } + } + if bestSingleOver != nil { + return []api.JobID{bestSingleJob}, bestSingleNode + } + + // ---- Phase 2: greedy, largest first fit Jobs per node, minimal Job count combo per node ---- + type combo struct { + node *api.NodeInfo + jobs []api.JobID + overage *api.Resource + } + var best *combo + + for _, n := range selectedNodes { + idle := n.FutureIdle() + need := preemptor.InitResreq.Clone() + // idle could be 0 + need.SubWithoutAssert(idle) + + // Build candidates: per-job sum on this node + type jr struct { + id api.JobID + res *api.Resource + } + var cand []jr + for j, jtasks := range nodeJobVictimsMap[n.Name] { + sum := api.EmptyResource() + for _, t := range jtasks { + if t.Job == preemptor.Job { + continue + } + sum.Add(t.Resreq) + } + cand = append(cand, jr{j, sum}) + } + // Sort descending by resource (largest first). + sort.Slice(cand, func(i, j int) bool { + return !cand[i].res.LessEqual(cand[j].res, api.Zero) + }) + + acc := api.EmptyResource() + var picked []api.JobID + for _, c := range cand { + if need.LessEqual(acc, api.Zero) { + break + } + picked = append(picked, c.id) + acc.Add(c.res) + } + + if !need.LessEqual(acc, api.Zero) { + continue + } + + over := acc.Clone() + over.Sub(need) + // If we don't have the best candidate jobs OR + // If number of selected Jobs is less than the best candidates (disrupt min gangs) OR + // If the disruption cost is same, pick the ones with the least wastage + if best == nil || + len(picked) < len(best.jobs) || + (len(picked) == len(best.jobs) && over.Less(best.overage, api.Zero)) { + cp := append([]api.JobID(nil), picked...) 
+ best = &combo{node: n, jobs: cp, overage: over} + } + } + + if best != nil { + return best.jobs, best.node + } + return nil, nil +} + func (pmpt *Action) taskEligibleToPreempt(preemptor *api.TaskInfo) error { if preemptor.Pod.Spec.PreemptionPolicy != nil && *preemptor.Pod.Spec.PreemptionPolicy == v1.PreemptNever { return fmt.Errorf("not eligible to preempt other tasks due to preemptionPolicy is Never") diff --git a/pkg/scheduler/actions/preempt/preempt_test.go b/pkg/scheduler/actions/preempt/preempt_test.go index b8a661d1f9..cdc1217df4 100644 --- a/pkg/scheduler/actions/preempt/preempt_test.go +++ b/pkg/scheduler/actions/preempt/preempt_test.go @@ -306,8 +306,7 @@ func TestPreempt(t *testing.T) { test.PriClass = []*schedulingv1.PriorityClass{highPrio, lowPrio} t.Run(test.Name, func(t *testing.T) { test.RegisterSession(tiers, []conf.Configuration{{Name: actions[0].Name(), - Arguments: map[string]interface{}{EnableTopologyAwarePreemptionKey: false, - EnableStrictGangPreemptionKey: false}}}) + Arguments: map[string]interface{}{EnableTopologyAwarePreemptionKey: false}}}) defer test.Close() test.Run(actions) if err := test.CheckAll(i); err != nil { @@ -612,7 +611,7 @@ func TestTopologyAwarePreempt(t *testing.T) { } } -func TestStrictGangPreempt(t *testing.T) { +func TestGangPreempt(t *testing.T) { plugins := map[string]framework.PluginBuilder{ conformance.PluginName: conformance.New, gang.PluginName: gang.New, @@ -622,39 +621,214 @@ func TestStrictGangPreempt(t *testing.T) { highPrio := util.BuildPriorityClass("high-priority", 100000) lowPrio := util.BuildPriorityClass("low-priority", 10) priority3, priority2, priority1 := int32(3), int32(2), int32(1) - tests := []uthelper.TestCommonStruct{ + testsMinimal := []uthelper.TestCommonStruct{ { - Name: "when one task is preempted all possible gang members are preempted", + Name: "pick the gang with the least overage for minimal mode case tight fit", PodGroups: []*schedulingv1beta1.PodGroup{ util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), - util.BuildPodGroupWithPrio("pg2", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg3", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), }, Pods: []*v1.Pod{ util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), util.BuildPod("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), - util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("1", "1G"), "pg2", make(map[string]string), make(map[string]string)), + util.BuildPod("c1", "preemptee4", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee5", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + 
util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("2", "2G"), "pg3", make(map[string]string), make(map[string]string)), }, Nodes: []*v1.Node{ util.BuildNode("n1", api.BuildResourceList("3", "3G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n2", api.BuildResourceList("2", "2G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), }, Queues: []*schedulingv1beta1.Queue{ util.BuildQueue("q1", 1, nil), }, - ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee2", "c1/preemptee3"}, - ExpectEvictNum: 3, + ExpectEvicted: []string{"c1/preemptee4", "c1/preemptee5"}, + ExpectEvictNum: 2, }, { - Name: "when one task is preempted minMembers field is respected for preemptee Job", + Name: "pick the gang with the least overage for minimal mode deterministic nodes for tied overage", PodGroups: []*schedulingv1beta1.PodGroup{ - util.BuildPodGroupWithPrio("pg1", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg3", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + }, + Pods: []*v1.Pod{ + util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), + util.BuildPod("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee4", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee5", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("2", "2G"), "pg3", make(map[string]string), make(map[string]string)), + }, + Nodes: []*v1.Node{ + util.BuildNode("n1", api.BuildResourceList("3", "3G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n2", api.BuildResourceList("2", "2G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + }, + Queues: []*schedulingv1beta1.Queue{ + util.BuildQueue("q1", 1, nil), + }, + ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee3"}, + ExpectEvictNum: 2, + }, + { + Name: "Only one gang should be suitable after preemptable filter", + PodGroups: []*schedulingv1beta1.PodGroup{ + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg3", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + }, + Pods: []*v1.Pod{ + util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, 
api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), + util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), + util.BuildPod("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee4", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee5", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("2", "2G"), "pg3", make(map[string]string), make(map[string]string)), + }, + Nodes: []*v1.Node{ + util.BuildNode("n1", api.BuildResourceList("3", "3G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n2", api.BuildResourceList("2", "2G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + }, + Queues: []*schedulingv1beta1.Queue{ + util.BuildQueue("q1", 1, nil), + }, + ExpectEvicted: []string{"c1/preemptee4", "c1/preemptee5"}, + ExpectEvictNum: 2, + }, + { + Name: "pick the gang with the least overage for minimal mode deterministic nodes for tied overage", + PodGroups: []*schedulingv1beta1.PodGroup{ + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg3", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + }, + Pods: []*v1.Pod{ + util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), + util.BuildPod("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee4", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee5", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("2", "2G"), "pg3", make(map[string]string), make(map[string]string)), + }, + Nodes: []*v1.Node{ + util.BuildNode("n1", api.BuildResourceList("3", "3G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n2", api.BuildResourceList("2", "2G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + }, + Queues: []*schedulingv1beta1.Queue{ + util.BuildQueue("q1", 1, nil), + }, + ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee3"}, + 
ExpectEvictNum: 2, + }, + { + Name: "Pick best gang when cluster has only one node", + PodGroups: []*schedulingv1beta1.PodGroup{ + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg3", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg4", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + }, + Pods: []*v1.Pod{ + util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee4", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee5", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee6", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg3", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("1", "1G"), "pg4", make(map[string]string), make(map[string]string)), + }, + Nodes: []*v1.Node{ + util.BuildNode("n1", api.BuildResourceList("6", "6G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + }, + Queues: []*schedulingv1beta1.Queue{ + util.BuildQueue("q1", 1, nil), + }, + ExpectEvicted: []string{"c1/preemptee6"}, + ExpectEvictNum: 1, + }, + { + Name: "Multiple gangs need to be preempted on the same node for a large preemptor", + PodGroups: []*schedulingv1beta1.PodGroup{ + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg3", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg4", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg5", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg6", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg7", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg8", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg9", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + }, + Pods: []*v1.Pod{ + util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, 
api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee4", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee5", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg3", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee6", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg3", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee7", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg4", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee8", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg4", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee9", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg5", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee10", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg5", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee11", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg5", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee12", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg6", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee13", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg6", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee14", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg6", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee15", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg7", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee16", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg8", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("6", "6G"), "pg9", make(map[string]string), make(map[string]string)), + }, + Nodes: []*v1.Node{ + util.BuildNode("n1", api.BuildResourceList("8", "8G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n2", api.BuildResourceList("8", "8G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + }, + Queues: []*schedulingv1beta1.Queue{ + util.BuildQueue("q1", 1, nil), + }, + ExpectEvicted: []string{"c1/preemptee9", "c1/preemptee10", "c1/preemptee11", "c1/preemptee12", "c1/preemptee13", 
"c1/preemptee14"}, + ExpectEvictNum: 6, + }, + { + Name: "Single gang can be preempted on a node for a large preemptor with idle on node", + PodGroups: []*schedulingv1beta1.PodGroup{ + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg3", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg4", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + }, + Pods: []*v1.Pod{ + util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee5", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee6", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee7", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee8", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg3", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee9", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg3", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee10", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg3", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("6", "6G"), "pg4", make(map[string]string), make(map[string]string)), + }, + Nodes: []*v1.Node{ + util.BuildNode("n1", api.BuildResourceList("6", "6G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n2", api.BuildResourceList("6", "6G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + }, + Queues: []*schedulingv1beta1.Queue{ + util.BuildQueue("q1", 1, nil), + }, + ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee2"}, + ExpectEvictNum: 2, + }, + { + Name: "After selecting the gang, preempt the lowest priority task for minimal mode", + PodGroups: []*schedulingv1beta1.PodGroup{ + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), util.BuildPodGroupWithPrio("pg2", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), }, Pods: []*v1.Pod{ - // Using BuildPodWithPriority to keep the test case deterministic - util.BuildPodWithPriority("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string), &priority1), + util.BuildPodWithPriority("c1", "preemptee3", "n1", v1.PodRunning, 
api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string), &priority1), util.BuildPodWithPriority("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string), &priority2), - util.BuildPodWithPriority("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string), &priority3), + util.BuildPodWithPriority("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string), &priority3), util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("1", "1G"), "pg2", make(map[string]string), make(map[string]string)), }, Nodes: []*v1.Node{ @@ -663,39 +837,136 @@ func TestStrictGangPreempt(t *testing.T) { Queues: []*schedulingv1beta1.Queue{ util.BuildQueue("q1", 1, nil), }, - ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee2"}, - ExpectEvictNum: 2, + ExpectEvicted: []string{"c1/preemptee3"}, + ExpectEvictNum: 1, }, + } + testsAtomic := []uthelper.TestCommonStruct{ { - Name: "when strict preemption makes space dont preempt more", + Name: "pick the gang with the least overage for minimal mode case tight fit but evict across nodes later", + PodGroups: []*schedulingv1beta1.PodGroup{ + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg3", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + }, + Pods: []*v1.Pod{ + util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee4", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee5", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee6", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("2", "2G"), "pg3", make(map[string]string), make(map[string]string)), + }, + Nodes: []*v1.Node{ + util.BuildNode("n1", api.BuildResourceList("4", "4G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n2", api.BuildResourceList("2", "2G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + }, + Queues: []*schedulingv1beta1.Queue{ + util.BuildQueue("q1", 1, nil), + }, + ExpectEvicted: 
[]string{"c1/preemptee4", "c1/preemptee5", "c1/preemptee6"}, + ExpectEvictNum: 3, + }, + { + Name: "pick the gang with the least overage for minimal mode deterministic nodes for tied overage and atomic gang preemption across nodes", PodGroups: []*schedulingv1beta1.PodGroup{ util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), util.BuildPodGroupWithPrio("pg3", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), - util.BuildPodGroupWithPrio("pg4", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), }, Pods: []*v1.Pod{ util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), - util.BuildPod("c1", "preemptee2", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), - util.BuildPod("c1", "preemptee3", "n3", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), - util.BuildPod("c1", "preemptee4", "n4", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), - util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("1", "1G"), "pg3", make(map[string]string), make(map[string]string)), - util.BuildPod("c1", "preemptor2", "", v1.PodPending, api.BuildResourceList("1", "1G"), "pg4", make(map[string]string), make(map[string]string)), + util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), + util.BuildPod("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee4", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee5", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee6", "n3", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee7", "n4", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("2", "2G"), "pg3", make(map[string]string), make(map[string]string)), }, Nodes: []*v1.Node{ - util.BuildNode("n1", api.BuildResourceList("1", "1G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), - util.BuildNode("n2", api.BuildResourceList("1", "1G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n1", api.BuildResourceList("3", "3G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n2", 
api.BuildResourceList("2", "2G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), util.BuildNode("n3", api.BuildResourceList("1", "1G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), util.BuildNode("n4", api.BuildResourceList("1", "1G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), }, Queues: []*schedulingv1beta1.Queue{ util.BuildQueue("q1", 1, nil), }, - ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee2"}, - ExpectEvictNum: 2, + ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee2", "c1/preemptee3", "c1/preemptee6", "c1/preemptee7"}, + ExpectEvictNum: 5, + }, + { + Name: "Multiple gangs need to be preempted on the same node for a large preemptor plus other gang members on different nodes", + PodGroups: []*schedulingv1beta1.PodGroup{ + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg3", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg4", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg5", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg6", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg7", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg8", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg9", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + }, + Pods: []*v1.Pod{ + util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee4", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee5", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg3", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee6", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg3", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee7", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg4", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee8", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg4", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee9", "n2", v1.PodRunning, api.BuildResourceList("1", 
"1G"), "pg5", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee10", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg5", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee11", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg5", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee12", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg6", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee13", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg6", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee14", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg6", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee15", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg7", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee16", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg8", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee17", "n3", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg6", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee18", "n4", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg6", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee19", "n5", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg6", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("6", "6G"), "pg9", make(map[string]string), make(map[string]string)), + }, + Nodes: []*v1.Node{ + util.BuildNode("n1", api.BuildResourceList("8", "8G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n2", api.BuildResourceList("8", "8G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n3", api.BuildResourceList("1", "1G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n4", api.BuildResourceList("1", "1G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + util.BuildNode("n5", api.BuildResourceList("1", "1G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + }, + Queues: []*schedulingv1beta1.Queue{ + util.BuildQueue("q1", 1, nil), + }, + ExpectEvicted: []string{"c1/preemptee9", "c1/preemptee10", "c1/preemptee11", "c1/preemptee12", "c1/preemptee13", "c1/preemptee14", "c1/preemptee17", "c1/preemptee18", "c1/preemptee19"}, + ExpectEvictNum: 9, + }, + { + Name: "After selecting the gang, preempt all tasks irrespective of priority", + PodGroups: []*schedulingv1beta1.PodGroup{ + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), + }, + Pods: []*v1.Pod{ + util.BuildPodWithPriority("c1", 
"preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string), &priority1), + util.BuildPodWithPriority("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string), &priority2), + util.BuildPodWithPriority("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string), &priority3), + util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("1", "1G"), "pg2", make(map[string]string), make(map[string]string)), + }, + Nodes: []*v1.Node{ + util.BuildNode("n1", api.BuildResourceList("3", "3G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + }, + Queues: []*schedulingv1beta1.Queue{ + util.BuildQueue("q1", 1, nil), + }, + ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee2", "c1/preemptee3"}, + ExpectEvictNum: 3, }, } - trueValue := true tiers := []conf.Tier{ { @@ -728,19 +999,26 @@ func TestStrictGangPreempt(t *testing.T) { }} actions := []framework.Action{New()} - for i, test := range tests { - test.Plugins = plugins - test.PriClass = []*schedulingv1.PriorityClass{highPrio, lowPrio} - t.Run(test.Name, func(t *testing.T) { - test.RegisterSession(tiers, []conf.Configuration{{Name: actions[0].Name(), - Arguments: map[string]interface{}{EnableStrictGangPreemptionKey: true}}}) - defer test.Close() - test.Run(actions) - if err := test.CheckAll(i); err != nil { - t.Fatal(err) - } - }) + testCases := map[string][]uthelper.TestCommonStruct{ + "minimal": testsMinimal, + "atomic": testsAtomic, + } + for key, tests := range testCases { + for i, test := range tests { + test.Plugins = plugins + test.PriClass = []*schedulingv1.PriorityClass{highPrio, lowPrio} + t.Run(test.Name, func(t *testing.T) { + test.RegisterSession(tiers, []conf.Configuration{{Name: actions[0].Name(), + Arguments: map[string]interface{}{GangPreemptionModeKey: key}}}) + defer test.Close() + test.Run(actions) + if err := test.CheckAll(i); err != nil { + t.Fatal(err) + } + }) + } } + } func buildPodWithPodAntiAffinity(name, namespace, node string, phase v1.PodPhase, req v1.ResourceList, groupName string, labels map[string]string, selector map[string]string, topologyKey string) *v1.Pod { From 93b8c29626ec08afa32ab55fccebcd7bdc7848ca Mon Sep 17 00:00:00 2001 From: Mohit Vinchoo Date: Sat, 4 Oct 2025 19:58:50 -0700 Subject: [PATCH 10/13] Set scalar pods to 0 for idle res, fix flakey unit test Signed-off-by: Mohit Vinchoo --- pkg/scheduler/actions/preempt/preempt.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go index 1942e7688e..cb8ee83880 100644 --- a/pkg/scheduler/actions/preempt/preempt.go +++ b/pkg/scheduler/actions/preempt/preempt.go @@ -541,13 +541,14 @@ func (pmpt *Action) findBestPreemptionTarget( ) for _, n := range selectedNodes { idle := n.FutureIdle() + idle.SetScalar("pods", 0) // Fits without preemption if preemptor.InitResreq.LessEqual(idle, api.Zero) { return nil, n } need := preemptor.InitResreq.Clone() // idle could be 0 - need.SubWithoutAssert(idle) + need.Sub(idle) for j, jtasks := range nodeJobVictimsMap[n.Name] { sum := api.EmptyResource() for _, t := range jtasks { @@ -581,9 +582,9 @@ func (pmpt *Action) findBestPreemptionTarget( for _, n := range 
selectedNodes { idle := n.FutureIdle() + idle.SetScalar("pods", 0) need := preemptor.InitResreq.Clone() - // idle could be 0 - need.SubWithoutAssert(idle) + need.Sub(idle) // Build candidates: per-job sum on this node type jr struct { From e66559c8087bff4b8620f82d2d48a950033becbb Mon Sep 17 00:00:00 2001 From: Mohit Vinchoo Date: Sun, 5 Oct 2025 19:30:50 -0700 Subject: [PATCH 11/13] restart tests Signed-off-by: Mohit Vinchoo From df03b2d11d568e5d14d36dee344e60dabbcb9a9e Mon Sep 17 00:00:00 2001 From: Mohit Vinchoo Date: Tue, 21 Oct 2025 17:13:17 -0700 Subject: [PATCH 12/13] Remove redundant checks and one duplicate test Signed-off-by: Mohit Vinchoo --- pkg/scheduler/actions/preempt/preempt.go | 19 ++++++-------- pkg/scheduler/actions/preempt/preempt_test.go | 25 ------------------- 2 files changed, 8 insertions(+), 36 deletions(-) diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go index cb8ee83880..24ac4b9072 100644 --- a/pkg/scheduler/actions/preempt/preempt.go +++ b/pkg/scheduler/actions/preempt/preempt.go @@ -510,6 +510,7 @@ func (pmpt *Action) gangPreempt( break } } + assigned := false // If this check fails, it implies some Evictions failed. // Since we are optimizing for gangs per node we should try again in next session if ssn.Allocatable(currentQueue, preemptor) && preemptor.InitResreq.LessEqual(targetNode.FutureIdle(), api.Zero) { @@ -520,9 +521,11 @@ func (pmpt *Action) gangPreempt( klog.Errorf("Failed to unpipeline Task %v on %v in Session %v for %v.", preemptor.UID, targetNode.Name, ssn.UID, rollbackErr) } + } else { + assigned = true } } - return true, nil + return assigned, nil } // Returns the node and the minimal-count set of victim jobs on that node. @@ -530,7 +533,7 @@ func (pmpt *Action) gangPreempt( // If impossible on all nodes: jobs == nil, node == nil. func (pmpt *Action) findBestPreemptionTarget( selectedNodes []*api.NodeInfo, - nodeJobVictimsMap map[string]map[api.JobID][]*api.TaskInfo, + nodeJobPreempteesMap map[string]map[api.JobID][]*api.TaskInfo, preemptor *api.TaskInfo, ) (jobs []api.JobID, node *api.NodeInfo) { // ---- Phase 1: try best single-job victim (minimal overage) ---- @@ -549,12 +552,9 @@ func (pmpt *Action) findBestPreemptionTarget( need := preemptor.InitResreq.Clone() // idle could be 0 need.Sub(idle) - for j, jtasks := range nodeJobVictimsMap[n.Name] { + for j, jtasks := range nodeJobPreempteesMap[n.Name] { sum := api.EmptyResource() for _, t := range jtasks { - if t.Job == preemptor.Job { - continue - } sum.Add(t.Resreq) } if !need.LessEqual(sum, api.Zero) { @@ -592,19 +592,16 @@ func (pmpt *Action) findBestPreemptionTarget( res *api.Resource } var cand []jr - for j, jtasks := range nodeJobVictimsMap[n.Name] { + for j, jtasks := range nodeJobPreempteesMap[n.Name] { sum := api.EmptyResource() for _, t := range jtasks { - if t.Job == preemptor.Job { - continue - } sum.Add(t.Resreq) } cand = append(cand, jr{j, sum}) } // Sort descending by resource (largest first). 
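 	// Worked example (illustrative only, assuming the code below accumulates the
 	// sorted candidates into acc until `need` is covered): with need = 4 CPU and
 	// per-job victim sums of 3, 2 and 1 CPU on this node, taking candidates
 	// largest-first covers the need after two jobs (3 + 2 >= 4), whereas
 	// smallest-first would require all three. Sorting descending therefore keeps
 	// the number of evicted gangs small, matching the "minimal-count set of
 	// victim jobs" contract documented above.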
sort.Slice(cand, func(i, j int) bool { - return !cand[i].res.LessEqual(cand[j].res, api.Zero) + return cand[j].res.Less(cand[i].res, api.Zero) }) acc := api.EmptyResource() diff --git a/pkg/scheduler/actions/preempt/preempt_test.go b/pkg/scheduler/actions/preempt/preempt_test.go index cdc1217df4..7af1502784 100644 --- a/pkg/scheduler/actions/preempt/preempt_test.go +++ b/pkg/scheduler/actions/preempt/preempt_test.go @@ -697,31 +697,6 @@ func TestGangPreempt(t *testing.T) { ExpectEvicted: []string{"c1/preemptee4", "c1/preemptee5"}, ExpectEvictNum: 2, }, - { - Name: "pick the gang with the least overage for minimal mode deterministic nodes for tied overage", - PodGroups: []*schedulingv1beta1.PodGroup{ - util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), - util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), - util.BuildPodGroupWithPrio("pg3", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), - }, - Pods: []*v1.Pod{ - util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), - util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), - util.BuildPod("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), - util.BuildPod("c1", "preemptee4", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), - util.BuildPod("c1", "preemptee5", "n2", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), - util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("2", "2G"), "pg3", make(map[string]string), make(map[string]string)), - }, - Nodes: []*v1.Node{ - util.BuildNode("n1", api.BuildResourceList("3", "3G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), - util.BuildNode("n2", api.BuildResourceList("2", "2G", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), - }, - Queues: []*schedulingv1beta1.Queue{ - util.BuildQueue("q1", 1, nil), - }, - ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee3"}, - ExpectEvictNum: 2, - }, { Name: "Pick best gang when cluster has only one node", PodGroups: []*schedulingv1beta1.PodGroup{ From 326f80021f53d4daa1796124b5f24e047f1a6344 Mon Sep 17 00:00:00 2001 From: Mohit Vinchoo Date: Tue, 21 Oct 2025 18:33:44 -0700 Subject: [PATCH 13/13] Update test names and add clarification for sorting logic Signed-off-by: Mohit Vinchoo --- pkg/scheduler/actions/preempt/preempt.go | 4 ++- pkg/scheduler/actions/preempt/preempt_test.go | 28 ++++++++++++------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go index 24ac4b9072..0b303c24d5 100644 --- a/pkg/scheduler/actions/preempt/preempt.go +++ b/pkg/scheduler/actions/preempt/preempt.go @@ -600,8 +600,10 @@ func (pmpt *Action) findBestPreemptionTarget( cand = append(cand, jr{j, sum}) } // Sort descending by resource (largest first). 
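 	// Tie example (illustrative, not part of the original change): for candidates
 	// {cpu: 2, pods: 0} and {cpu: 1, pods: 0}, a strict Less comparison is false
 	// in both directions because the tied "pods" dimension is never strictly
 	// smaller, so a Less-based comparator cannot order them; the not-less-equal
 	// form below still ranks the larger candidate first.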
+ // We need not-less-equal sort because number of pods in the ScalarResources can be same (like 0). + // In this case strictly less check fails hence we use strictly greater (not-less-equal) sort.Slice(cand, func(i, j int) bool { - return cand[j].res.Less(cand[i].res, api.Zero) + return !cand[i].res.LessEqual(cand[j].res, api.Zero) }) acc := api.EmptyResource() diff --git a/pkg/scheduler/actions/preempt/preempt_test.go b/pkg/scheduler/actions/preempt/preempt_test.go index 7af1502784..2c90cef68c 100644 --- a/pkg/scheduler/actions/preempt/preempt_test.go +++ b/pkg/scheduler/actions/preempt/preempt_test.go @@ -23,6 +23,7 @@ package preempt import ( "flag" + "strconv" "testing" v1 "k8s.io/api/core/v1" @@ -623,7 +624,7 @@ func TestGangPreempt(t *testing.T) { priority3, priority2, priority1 := int32(3), int32(2), int32(1) testsMinimal := []uthelper.TestCommonStruct{ { - Name: "pick the gang with the least overage for minimal mode case tight fit", + Name: "minimal mode: pick the gang with the least overage and exact fit", PodGroups: []*schedulingv1beta1.PodGroup{ util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), @@ -648,7 +649,7 @@ func TestGangPreempt(t *testing.T) { ExpectEvictNum: 2, }, { - Name: "pick the gang with the least overage for minimal mode deterministic nodes for tied overage", + Name: "minimal mode: pick the gang with the least overage and deterministic nodes for tied overage", PodGroups: []*schedulingv1beta1.PodGroup{ util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), @@ -673,7 +674,7 @@ func TestGangPreempt(t *testing.T) { ExpectEvictNum: 2, }, { - Name: "Only one gang should be suitable after preemptable filter", + Name: "minimal mode: only one gang should be suitable after preemptable filter", PodGroups: []*schedulingv1beta1.PodGroup{ util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), @@ -698,7 +699,7 @@ func TestGangPreempt(t *testing.T) { ExpectEvictNum: 2, }, { - Name: "Pick best gang when cluster has only one node", + Name: "minimal mode: pick best gang when cluster has just one node", PodGroups: []*schedulingv1beta1.PodGroup{ util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), @@ -724,7 +725,7 @@ func TestGangPreempt(t *testing.T) { ExpectEvictNum: 1, }, { - Name: "Multiple gangs need to be preempted on the same node for a large preemptor", + Name: "minimal mode: multiple gangs need to be preempted on the same node for a large preemptor", PodGroups: []*schedulingv1beta1.PodGroup{ util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), @@ -766,7 +767,7 @@ func TestGangPreempt(t *testing.T) { ExpectEvictNum: 6, }, { - Name: "Single gang can 
be preempted on a node for a large preemptor with idle on node", + Name: "minimal mode: single gang can be preempted on a node (with idle resources) for a large preemptor", PodGroups: []*schedulingv1beta1.PodGroup{ util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), @@ -795,7 +796,7 @@ func TestGangPreempt(t *testing.T) { ExpectEvictNum: 2, }, { - Name: "After selecting the gang, preempt the lowest priority task for minimal mode", + Name: "minimal mode: after selecting the gang, preempt the lowest priority task for minimal mode", PodGroups: []*schedulingv1beta1.PodGroup{ util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), util.BuildPodGroupWithPrio("pg2", "c1", "q1", 1, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "high-priority"), @@ -818,7 +819,7 @@ func TestGangPreempt(t *testing.T) { } testsAtomic := []uthelper.TestCommonStruct{ { - Name: "pick the gang with the least overage for minimal mode case tight fit but evict across nodes later", + Name: "atomic mode: pick best-fit gang on one node and evict all its tasks across nodes", PodGroups: []*schedulingv1beta1.PodGroup{ util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), @@ -844,7 +845,7 @@ func TestGangPreempt(t *testing.T) { ExpectEvictNum: 3, }, { - Name: "pick the gang with the least overage for minimal mode deterministic nodes for tied overage and atomic gang preemption across nodes", + Name: "atomic mode: deterministic node selection on tied overage, with cluster-wide gang eviction", PodGroups: []*schedulingv1beta1.PodGroup{ util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), util.BuildPodGroupWithPrio("pg2", "c1", "q1", 0, map[string]int32{}, schedulingv1beta1.PodGroupInqueue, "low-priority"), @@ -869,6 +870,7 @@ func TestGangPreempt(t *testing.T) { Queues: []*schedulingv1beta1.Queue{ util.BuildQueue("q1", 1, nil), }, + // overage is tied on n1 and n2 as one pod is not preemtable on n1 ExpectEvicted: []string{"c1/preemptee1", "c1/preemptee2", "c1/preemptee3", "c1/preemptee6", "c1/preemptee7"}, ExpectEvictNum: 5, }, @@ -963,6 +965,7 @@ func TestGangPreempt(t *testing.T) { EnabledPreemptable: &trueValue, EnabledJobPipelined: &trueValue, EnabledJobStarving: &trueValue, + EnabledNodeOrder: &trueValue, }, { Name: proportion.PluginName, @@ -983,8 +986,13 @@ func TestGangPreempt(t *testing.T) { test.Plugins = plugins test.PriClass = []*schedulingv1.PriorityClass{highPrio, lowPrio} t.Run(test.Name, func(t *testing.T) { - test.RegisterSession(tiers, []conf.Configuration{{Name: actions[0].Name(), + testSsn := test.RegisterSession(tiers, []conf.Configuration{{Name: actions[0].Name(), Arguments: map[string]interface{}{GangPreemptionModeKey: key}}}) + // node score = {n1=-1, n2=-2, n3=-3...} + testSsn.AddNodeOrderFn("priority", func(_ *api.TaskInfo, n *api.NodeInfo) (float64, error) { + i, _ := strconv.Atoi(n.Name[1:]) + return -float64(i), nil + }) defer test.Close() test.Run(actions) if err := test.CheckAll(i); err != nil {