From 827e7d579d3e530daad93a6494b1fe4f88a246eb Mon Sep 17 00:00:00 2001
From: Krisztian Litkey
Date: Tue, 2 Sep 2025 13:28:51 +0300
Subject: [PATCH 1/2] topology-aware: pick pools by burstable CPU limit.

When picking a pool for a burstable QoS class container, take the CPU
limit into account, too. For limited containers, prefer pools with
enough free capacity for the limit. For unlimited containers, prefer
pools with more free capacity left, which in practice means that such
containers will typically end up in the root pool, unless affinity or
topology hints dictate otherwise.

Signed-off-by: Krisztian Litkey
---
 .../topology-aware/policy/pod-preferences.go  | 52 +++++++++----
 .../policy/pod-preferences_test.go            |  2 +-
 cmd/plugins/topology-aware/policy/pools.go    | 73 +++++++++++++++----
 .../topology-aware/policy/resources.go        | 11 ++-
 4 files changed, 106 insertions(+), 32 deletions(-)

diff --git a/cmd/plugins/topology-aware/policy/pod-preferences.go b/cmd/plugins/topology-aware/policy/pod-preferences.go
index a4d9f080d..37ce1fa77 100644
--- a/cmd/plugins/topology-aware/policy/pod-preferences.go
+++ b/cmd/plugins/topology-aware/policy/pod-preferences.go
@@ -17,6 +17,7 @@ package topologyaware
 import (
     "encoding/json"
     "fmt"
+    "math"
     "path/filepath"
     "strconv"
     "strings"
@@ -58,6 +59,8 @@ const (
     hideHyperthreadsKey = keyHideHyperthreads + "." + kubernetes.ResmgrKeyNamespace
     // effective annotation key for picking resources by topology hints
     pickResourcesByHints = keyPickResourcesByHints + "." + kubernetes.ResmgrKeyNamespace
+
+    unlimitedCPU = math.MaxInt // 'unlimited' burstable CPU limit
 )
 
 type prefKind int
@@ -308,10 +311,11 @@ func checkReservedCPUsAnnotations(c cache.Container) (bool, bool) {
 // Returned values:
 // 1. full: number of full CPUs
 // 2. fraction: amount of fractional CPU in milli-CPU
-// 3. isolate: (bool) whether to prefer isolated full CPUs
-// 4. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
-// 5. cpuPrio: preferred CPU allocator priority for CPU allocation.
-func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass, cpuPrio) {
+// 3. limit: CPU limit for this container
+// 4. isolate: (bool) whether to prefer isolated full CPUs
+// 5. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
+// 6. cpuPrio: preferred CPU allocator priority for CPU allocation.
+func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, int, bool, cpuClass, cpuPrio) {
     //
     // CPU allocation preferences for a container consist of
     //
@@ -381,52 +385,68 @@ func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, in
     qosClass := pod.GetQOSClass()
     fraction := int(request.MilliValue())
     prio := defaultPrio // ignored for fractional allocations
+    limit := 0
+
+    switch qosClass {
+    case corev1.PodQOSBestEffort:
+    case corev1.PodQOSBurstable:
+        if lim, ok := reqs.Limits[corev1.ResourceCPU]; ok {
+            limit = int(lim.MilliValue())
+        } else {
+            limit = unlimitedCPU
+        }
+    case corev1.PodQOSGuaranteed:
+        if lim, ok := reqs.Limits[corev1.ResourceCPU]; ok {
+            limit = int(lim.MilliValue())
+        }
+    }
 
     // easy cases: kube-system namespace, Burstable or BestEffort QoS class containers
     preferReserved, explicitReservation := checkReservedCPUsAnnotations(container)
     switch {
     case container.PreserveCpuResources():
-        return 0, fraction, false, cpuPreserve, prio
+        return 0, fraction, limit, false, cpuPreserve, prio
     case preferReserved:
-        return 0, fraction, false, cpuReserved, prio
+        return 0, fraction, limit, false, cpuReserved, prio
     case checkReservedPoolNamespaces(namespace) && !explicitReservation:
-        return 0, fraction, false, cpuReserved, prio
+        return 0, fraction, limit, false, cpuReserved, prio
     case qosClass == corev1.PodQOSBurstable:
-        return 0, fraction, false, cpuNormal, prio
+        return 0, fraction, limit, false, cpuNormal, prio
     case qosClass == corev1.PodQOSBestEffort:
-        return 0, 0, false, cpuNormal, prio
+        return 0, 0, 0, false, cpuNormal, prio
     }
 
     // complex case: Guaranteed QoS class containers
     cores := fraction / 1000
     fraction = fraction % 1000
+    limit = 1000*cores + fraction
     preferIsolated, isolPrefKind := isolatedCPUsPreference(pod, container)
     preferShared, sharedPrefKind := sharedCPUsPreference(pod, container)
     prio = cpuPrioPreference(pod, container, defaultPrio) // ignored for fractional allocations
 
     switch {
     case cores == 0: // sub-core CPU request
-        return 0, fraction, false, cpuNormal, prio
+        return 0, fraction, limit, false, cpuNormal, prio
     case cores < 2: // 1 <= CPU request < 2
         if preferShared {
-            return 0, 1000*cores + fraction, false, cpuNormal, prio
+            return 0, 1000*cores + fraction, limit, false, cpuNormal, prio
         }
         // potentially mixed allocation (1 core + some fraction)
-        return cores, fraction, preferIsolated, cpuNormal, prio
+        return cores, fraction, limit, preferIsolated, cpuNormal, prio
     default: // CPU request >= 2
         // fractional allocation, only mixed if explicitly annotated as unshared
         if fraction > 0 {
            if !preferShared && sharedPrefKind == prefAnnotated {
-                return cores, fraction, preferIsolated, cpuNormal, prio
+                return cores, fraction, limit, preferIsolated, cpuNormal, prio
            }
-            return 0, 1000*cores + fraction, false, cpuNormal, prio
+            return 0, 1000*cores + fraction, limit, false, cpuNormal, prio
         }
         // non-fractional allocation
         if preferShared {
-            return 0, 1000 * cores, false, cpuNormal, prio
+            return 0, 1000 * cores, limit, false, cpuNormal, prio
         }
         // for multiple cores, isolated preference must be explicitly annotated
-        return cores, 0, preferIsolated && isolPrefKind == prefAnnotated, cpuNormal, prio
+        return cores, 0, limit, preferIsolated && isolPrefKind == prefAnnotated, cpuNormal, prio
     }
 }
diff --git a/cmd/plugins/topology-aware/policy/pod-preferences_test.go b/cmd/plugins/topology-aware/policy/pod-preferences_test.go
index 5e03a5982..ca7933594 100644
--- a/cmd/plugins/topology-aware/policy/pod-preferences_test.go
+++ b/cmd/plugins/topology-aware/policy/pod-preferences_test.go
@@ -1052,7 +1052,7 @@ func TestCpuAllocationPreferences(t *testing.T) {
             }
             opt.PreferIsolated, opt.PreferShared = &tc.preferIsolated, &tc.preferShared
             opt.ReservedPoolNamespaces = tc.reservedPoolNamespaces
-            full, fraction, isolate, cpuType, _ := cpuAllocationPreferences(tc.pod, tc.container)
+            full, fraction, _, isolate, cpuType, _ := cpuAllocationPreferences(tc.pod, tc.container)
             require.Equal(t, tc.expectedFull, full, "full CPU cores")
             require.Equal(t, tc.expectedFraction, fraction, "CPU core fraction")
             require.Equal(t, tc.expectedIsolate, isolate, "isolation preference")
diff --git a/cmd/plugins/topology-aware/policy/pools.go b/cmd/plugins/topology-aware/policy/pools.go
index 60290b7b7..4009cfb26 100644
--- a/cmd/plugins/topology-aware/policy/pools.go
+++ b/cmd/plugins/topology-aware/policy/pools.go
@@ -20,6 +20,7 @@ import (
     "sort"
 
     "github.com/containers/nri-plugins/pkg/utils/cpuset"
+    corev1 "k8s.io/api/core/v1"
 
     "github.com/containers/nri-plugins/pkg/resmgr/cache"
     libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory"
@@ -673,7 +674,9 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
     //  - if we have topology hints
     //    * better hint score wins
     //    * for a tie, prefer the lower node then the smaller id
-    //  - if we have a better matching or tighter fitting memory offer, it wins
+    //  - if we have a better matching memory offer, it wins
+    //  - if we have a burstable container, sufficient capacity for the limit wins
+    //  - if we have a tighter fitting memory offer, it wins
     //  - if only one node matches the memory type request, it wins
     //  - for low-prio and high-prio CPU preference, if only one node has such CPUs, it wins
     //  - if a node is lower in the tree it wins
@@ -772,7 +775,7 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
         }
     }
 
-    // better matching or tighter memory offer wins
+    // better matching memory offer wins
     switch {
     case o1 != nil && o2 == nil:
         log.Debug("  => %s loses on memory offer (failed offer)", node2.Name())
@@ -809,22 +812,64 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
             }
             log.Debug("  - memory offers burstability are a TIE")
         }
+    }
 
-        if m1.Size() < m2.Size() {
-            log.Debug("  - %s loses on memory offer (%s less tight than %s)",
-                node2.Name(), m2, m1)
-            return true
-        }
-        if m2.Size() < m1.Size() {
-            log.Debug("  - %s loses on memory offer (%s less tight than %s)",
-                node1.Name(), m1, m2)
-            return false
-        }
-        if m2.Size() == m1.Size() {
-            log.Debug("  - memory offers are a TIE (%s vs. %s)", m1, m2)
+    if request.GetContainer().GetQOSClass() == corev1.PodQOSBurstable {
+        var (
+            limit = request.CPULimit()
+            b1    = score1.Supply().AllocatableSharedCPU()
+            b2    = score2.Supply().AllocatableSharedCPU()
+            r1    = b1 - limit
+            r2    = b2 - limit
+        )
+
+        log.Debug("  - CPU burstability %s=%d, %s=%d, limit=%d",
+            node1.Name(), b1, node2.Name(), b2, limit)
+
+        if limit != unlimitedCPU {
+            // prefer pool with enough burstable capacity
+            switch {
+            case r1 >= 0 && r2 < 0:
+                log.Debug("  - %s loses on insufficient CPU burstability (%d vs. %d for limit %d)",
+                    node2.Name(), b1, b2, limit)
+                return true
+            case r2 >= 0 && r1 < 0:
+                log.Debug("  - %s loses on insufficient CPU burstability", node1.Name())
+                return false
+            default:
+                log.Debug("  - CPU burstability is a TIE")
+            }
+        } else {
+            // prefer pool with more burstable capacity
+            switch {
+            case b1 > b2:
+                log.Debug("  - %s WINS on more CPU burstability", node1.Name())
+                return true
+            case b2 > b1:
+                log.Debug("  - %s WINS on more CPU burstability", node2.Name())
+                return false
+            default:
+                log.Debug("  - CPU burstability is a TIE")
+            }
         }
     }
 
+    // tighter memory offer wins
+    m1, m2 := o1.NodeMask(), o2.NodeMask()
+    if m1.Size() < m2.Size() {
+        log.Debug("  - %s loses on memory offer (%s less tight than %s)",
+            node2.Name(), m2, m1)
+        return true
+    }
+    if m2.Size() < m1.Size() {
+        log.Debug("  - %s loses on memory offer (%s less tight than %s)",
+            node1.Name(), m1, m2)
+        return false
+    }
+    if m2.Size() == m1.Size() {
+        log.Debug("  - memory offers are a TIE (%s vs. %s)", m1, m2)
+    }
+
     // matching memory type wins
     if reqType := request.MemoryType(); reqType != memoryUnspec && reqType != memoryPreserve {
         if node1.HasMemoryType(reqType) && !node2.HasMemoryType(reqType) {
diff --git a/cmd/plugins/topology-aware/policy/resources.go b/cmd/plugins/topology-aware/policy/resources.go
index 5ea4f94cf..ca234b04f 100644
--- a/cmd/plugins/topology-aware/policy/resources.go
+++ b/cmd/plugins/topology-aware/policy/resources.go
@@ -107,6 +107,8 @@ type Request interface {
     FullCPUs() int
     // CPUFraction returns the amount of fractional milli-CPU requested.
     CPUFraction() int
+    // CPULimit returns the CPU limit in milli-CPU.
+    CPULimit() int
     // Isolate returns whether isolated CPUs are preferred for this request.
     Isolate() bool
     // MemoryType returns the type(s) of requested memory.
@@ -223,6 +225,7 @@ type request struct {
     container cache.Container // container for this request
     full      int             // number of full CPUs requested
     fraction  int             // amount of fractional CPU requested
+    limit     int             // CPU limit, MaxInt for no limit
     isolate   bool            // prefer isolated exclusive CPUs
     cpuType   cpuClass        // preferred CPU type (normal, reserved)
     prio      cpuPrio         // CPU priority preference, ignored for fraction requests
@@ -715,7 +718,7 @@ func prettyMem(value int64) string {
 // newRequest creates a new request for the given container.
 func newRequest(container cache.Container, types libmem.TypeMask) Request {
     pod, _ := container.GetPod()
-    full, fraction, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
+    full, fraction, cpuLimit, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
     req, lim, mtype := memoryAllocationPreference(pod, container)
 
     coldStart := time.Duration(0)
@@ -752,6 +755,7 @@ func newRequest(container cache.Container, types libmem.TypeMask) Request {
         container: container,
         full:      full,
         fraction:  fraction,
+        limit:     cpuLimit,
         isolate:   isolate,
         cpuType:   cpuType,
         memReq:    req,
@@ -815,6 +819,11 @@ func (cr *request) CPUFraction() int {
     return cr.fraction
 }
 
+// CPULimit returns the CPU limit in milli-CPU.
+func (cr *request) CPULimit() int {
+    return cr.limit
+}
+
 // Isolate returns whether isolated CPUs are preferred for this request.
 func (cr *request) Isolate() bool {
     return cr.isolate

From 21943c5a4597385b0127c6b4ab911c988d0b0684 Mon Sep 17 00:00:00 2001
From: Krisztian Litkey
Date: Tue, 2 Sep 2025 15:58:41 +0300
Subject: [PATCH 2/2] e2e: update topology-aware test for burstable placement.

Update existing burstable placement test for the altered behavior.
Add a new test to verify CPU-limit-based placement of limited and
unlimited burstable containers.

Signed-off-by: Krisztian Litkey
---
 test/e2e/files/burstable.yaml.in              |  2 ++
 .../n4c16/test00-basic-placement/code.var.sh  |  2 +-
 .../n4c16/test14-burstable/code.var.sh        | 23 +++++++++++++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/test/e2e/files/burstable.yaml.in b/test/e2e/files/burstable.yaml.in
index c311fbb08..aaa509933 100644
--- a/test/e2e/files/burstable.yaml.in
+++ b/test/e2e/files/burstable.yaml.in
@@ -31,7 +31,9 @@ spec:
           cpu: ${CPUREQ}
           memory: ${MEMREQ}
         limits:
+          $( ( [ -n "$CPULIM" ] && [ "$CPULIM" != "0" ] ) && echo "
           cpu: ${CPULIM}
+          ")
           $( ( [ -n "$MEMLIM" ] && [ "$MEMLIM" != "0" ] ) && echo "
           memory: ${MEMLIM}
           ")
diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh b/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
index 4d5feef2b..9d68c3481 100644
--- a/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
+++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
@@ -54,7 +54,7 @@ vm-command "kubectl delete pods --all --now"
 
 # pod2: Test that 4 burstable containers not eligible for isolated/exclusive CPU allocation
 # gets evenly spread over NUMA nodes.
-CONTCOUNT=4 CPUREQ=2 CPULIM=4 create burstable
+CONTCOUNT=4 CPUREQ=2 CPULIM=2002m create burstable
 report allowed
 verify \
     'disjoint_sets(cpus["pod2c0"], cpus["pod2c1"], cpus["pod2c2"], cpus["pod2c3"])' \
diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test14-burstable/code.var.sh b/test/e2e/policies.test-suite/topology-aware/n4c16/test14-burstable/code.var.sh
index 8084435b6..a9845be01 100644
--- a/test/e2e/policies.test-suite/topology-aware/n4c16/test14-burstable/code.var.sh
+++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test14-burstable/code.var.sh
@@ -24,4 +24,27 @@ verify \
     'len(nodes["pod2c0"]) == 2' \
     'len(nodes["pod3c0"]) == 4'
 
+vm-command "kubectl delete pods --all --now"
+
+helm-terminate
+helm_config=$(COLOCATE_PODS=false instantiate helm-config.yaml) helm-launch topology-aware
+
+# Limited burstable containers get assigned to the lowest pool with
+# enough free capacity for their limit; in this case, a socket.
+CONTCOUNT=2 CPUREQ=2 CPULIM=5 MEMREQ=100M create burstable
+report allowed
+verify \
+    'nodes["pod4c0"] == { "node2", "node3" }' \
+    'nodes["pod4c1"] == { "node0", "node1" }'
+
+# Unlimited burstable containers get allocated to the root pool unless
+# affinity dictates otherwise.
+CONTCOUNT=2 CPUREQ=2 CPULIM=0 MEMREQ=100M create burstable
+report allowed
+verify \
+    'nodes["pod5c0"] == { "node0", "node1", "node2", "node3" }' \
+    'nodes["pod5c1"] == { "node0", "node1", "node2", "node3" }'
+
+vm-command "kubectl delete pods --all --now"
+
 helm-terminate
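
As a quick illustration of the pool-scoring rule that the first patch adds to compareScores(), here is a minimal standalone Go sketch of the burstable comparison. The function name, the decided/tie return convention, and the capacity numbers are illustrative assumptions rather than part of the patch; only the decision logic mirrors the patched code.

package main

import (
    "fmt"
    "math"
)

// unlimitedCPU marks a burstable container with no CPU limit, mirroring the
// sentinel introduced in pod-preferences.go. All quantities are in milli-CPU.
const unlimitedCPU = math.MaxInt

// preferFirst compares two pools by their free shared CPU capacity against a
// burstable container's CPU limit. It reports which pool wins, or decided=false
// when this rule is a tie and later scoring criteria must decide.
func preferFirst(free1, free2, limit int) (firstWins, decided bool) {
    if limit != unlimitedCPU {
        // Limited container: a pool with enough free capacity for the limit wins.
        r1, r2 := free1-limit, free2-limit
        switch {
        case r1 >= 0 && r2 < 0:
            return true, true
        case r2 >= 0 && r1 < 0:
            return false, true
        }
        return false, false
    }
    // Unlimited container: the pool with more free capacity wins, which in
    // practice tends to favor the root pool.
    switch {
    case free1 > free2:
        return true, true
    case free2 > free1:
        return false, true
    }
    return false, false
}

func main() {
    // Hypothetical capacities: a NUMA node with 3 free CPUs vs. the root pool with 14.
    fmt.Println(preferFirst(3000, 14000, 5000))         // limit 5 CPUs: only the second pool fits it
    fmt.Println(preferFirst(3000, 14000, 2000))         // limit 2 CPUs: both fit, tie
    fmt.Println(preferFirst(3000, 14000, unlimitedCPU)) // no limit: more free capacity wins
}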