52 changes: 36 additions & 16 deletions cmd/plugins/topology-aware/policy/pod-preferences.go
@@ -17,6 +17,7 @@ package topologyaware
import (
"encoding/json"
"fmt"
"math"
"path/filepath"
"strconv"
"strings"
@@ -58,6 +59,8 @@ const (
hideHyperthreadsKey = keyHideHyperthreads + "." + kubernetes.ResmgrKeyNamespace
// effective annotation key for picking resources by topology hints
pickResourcesByHints = keyPickResourcesByHints + "." + kubernetes.ResmgrKeyNamespace

unlimitedCPU = math.MaxInt // 'unlimited' burstable CPU limit
)

type prefKind int
@@ -308,10 +311,11 @@ func checkReservedCPUsAnnotations(c cache.Container) (bool, bool) {
// Returned values:
// 1. full: number of full CPUs
// 2. fraction: amount of fractional CPU in milli-CPU
// 3. isolate: (bool) whether to prefer isolated full CPUs
// 4. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
// 5. cpuPrio: preferred CPU allocator priority for CPU allocation.
func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass, cpuPrio) {
// 3. limit: CPU limit for this container in milli-CPU
// 4. isolate: (bool) whether to prefer isolated full CPUs
// 5. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
// 6. cpuPrio: preferred CPU allocator priority for CPU allocation.
func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, int, bool, cpuClass, cpuPrio) {
//
// CPU allocation preferences for a container consist of
//
@@ -381,52 +385,68 @@ func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, in
qosClass := pod.GetQOSClass()
fraction := int(request.MilliValue())
prio := defaultPrio // ignored for fractional allocations
limit := 0

switch qosClass {
case corev1.PodQOSBestEffort:
case corev1.PodQOSBurstable:
if lim, ok := reqs.Limits[corev1.ResourceCPU]; ok {
limit = int(lim.MilliValue())
} else {
limit = unlimitedCPU
}
case corev1.PodQOSGuaranteed:
if lim, ok := reqs.Limits[corev1.ResourceCPU]; ok {
limit = int(lim.MilliValue())
}
}

// easy cases: kube-system namespace, Burstable or BestEffort QoS class containers
preferReserved, explicitReservation := checkReservedCPUsAnnotations(container)
switch {
case container.PreserveCpuResources():
return 0, fraction, false, cpuPreserve, prio
return 0, fraction, limit, false, cpuPreserve, prio
case preferReserved:
return 0, fraction, false, cpuReserved, prio
return 0, fraction, limit, false, cpuReserved, prio
case checkReservedPoolNamespaces(namespace) && !explicitReservation:
return 0, fraction, false, cpuReserved, prio
return 0, fraction, limit, false, cpuReserved, prio
case qosClass == corev1.PodQOSBurstable:
return 0, fraction, false, cpuNormal, prio
return 0, fraction, limit, false, cpuNormal, prio
case qosClass == corev1.PodQOSBestEffort:
return 0, 0, false, cpuNormal, prio
return 0, 0, 0, false, cpuNormal, prio
}

// complex case: Guaranteed QoS class containers
cores := fraction / 1000
fraction = fraction % 1000
limit = 1000*cores + fraction
preferIsolated, isolPrefKind := isolatedCPUsPreference(pod, container)
preferShared, sharedPrefKind := sharedCPUsPreference(pod, container)
prio = cpuPrioPreference(pod, container, defaultPrio) // ignored for fractional allocations

switch {
case cores == 0: // sub-core CPU request
return 0, fraction, false, cpuNormal, prio
return 0, fraction, limit, false, cpuNormal, prio
case cores < 2: // 1 <= CPU request < 2
if preferShared {
return 0, 1000*cores + fraction, false, cpuNormal, prio
return 0, 1000*cores + fraction, limit, false, cpuNormal, prio
}
// potentially mixed allocation (1 core + some fraction)
return cores, fraction, preferIsolated, cpuNormal, prio
return cores, fraction, limit, preferIsolated, cpuNormal, prio
default: // CPU request >= 2
// fractional allocation, only mixed if explicitly annotated as unshared
if fraction > 0 {
if !preferShared && sharedPrefKind == prefAnnotated {
return cores, fraction, preferIsolated, cpuNormal, prio
return cores, fraction, limit, preferIsolated, cpuNormal, prio
}
return 0, 1000*cores + fraction, false, cpuNormal, prio
return 0, 1000*cores + fraction, limit, false, cpuNormal, prio
}
// non-fractional allocation
if preferShared {
return 0, 1000 * cores, false, cpuNormal, prio
return 0, 1000 * cores, limit, false, cpuNormal, prio
}
// for multiple cores, isolated preference must be explicitly annotated
return cores, 0, preferIsolated && isolPrefKind == prefAnnotated, cpuNormal, prio
return cores, 0, limit, preferIsolated && isolPrefKind == prefAnnotated, cpuNormal, prio
}
}
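(Aside, not part of the diff: a minimal standalone sketch of how the milli-CPU limit computed in the QoS switch above behaves, assuming the Kubernetes core/v1 and apimachinery resource packages; deriveCPULimit and the example quantities are illustrative only.)

package main

import (
	"fmt"
	"math"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// unlimitedCPU marks a Burstable container that has no CPU limit set.
const unlimitedCPU = math.MaxInt

// deriveCPULimit mirrors the QoS switch above: a Burstable container without
// a CPU limit is treated as unlimited, otherwise the limit is taken in
// milli-CPU; BestEffort containers end up with 0.
func deriveCPULimit(qos corev1.PodQOSClass, limits corev1.ResourceList) int {
	lim, ok := limits[corev1.ResourceCPU]
	switch qos {
	case corev1.PodQOSBurstable:
		if !ok {
			return unlimitedCPU
		}
		return int(lim.MilliValue())
	case corev1.PodQOSGuaranteed:
		if ok {
			return int(lim.MilliValue()) // for Guaranteed this equals the request
		}
	}
	return 0
}

func main() {
	limited := corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("2500m")}
	fmt.Println(deriveCPULimit(corev1.PodQOSBurstable, limited))                               // 2500
	fmt.Println(deriveCPULimit(corev1.PodQOSBurstable, corev1.ResourceList{}) == unlimitedCPU) // true
}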

2 changes: 1 addition & 1 deletion cmd/plugins/topology-aware/policy/pod-preferences_test.go
@@ -1052,7 +1052,7 @@ func TestCpuAllocationPreferences(t *testing.T) {
}
opt.PreferIsolated, opt.PreferShared = &tc.preferIsolated, &tc.preferShared
opt.ReservedPoolNamespaces = tc.reservedPoolNamespaces
full, fraction, isolate, cpuType, _ := cpuAllocationPreferences(tc.pod, tc.container)
full, fraction, _, isolate, cpuType, _ := cpuAllocationPreferences(tc.pod, tc.container)
require.Equal(t, tc.expectedFull, full, "full CPU cores")
require.Equal(t, tc.expectedFraction, fraction, "CPU core fraction")
require.Equal(t, tc.expectedIsolate, isolate, "isolation preference")
73 changes: 59 additions & 14 deletions cmd/plugins/topology-aware/policy/pools.go
@@ -20,6 +20,7 @@ import (
"sort"

"github.com/containers/nri-plugins/pkg/utils/cpuset"
corev1 "k8s.io/api/core/v1"

"github.com/containers/nri-plugins/pkg/resmgr/cache"
libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory"
@@ -673,7 +674,9 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
// - if we have topology hints
// * better hint score wins
// * for a tie, prefer the lower node then the smaller id
// - if we have a better matching or tighter fitting memory offer, it wins
// - if we have a better matching memory offer, it wins
// - if we have a burstable container, sufficient capacity for the limit wins
// - if we have a tighter fitting memory offer, it wins
// - if only one node matches the memory type request, it wins
// - for low-prio and high-prio CPU preference, if only one node has such CPUs, it wins
// - if a node is lower in the tree it wins
@@ -772,7 +775,7 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
}
}

// better matching or tighter memory offer wins
// better matching offer wins
switch {
case o1 != nil && o2 == nil:
log.Debug(" => %s loses on memory offer (failed offer)", node2.Name())
Expand Down Expand Up @@ -809,22 +812,64 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
}
log.Debug(" - memory offers burstability are a TIE")
}
}

if m1.Size() < m2.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node2.Name(), m2, m1)
return true
}
if m2.Size() < m1.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node1.Name(), m1, m2)
return false
}
if m2.Size() == m1.Size() {
log.Debug(" - memory offers are a TIE (%s vs. %s)", m1, m2)
if request.GetContainer().GetQOSClass() == corev1.PodQOSBurstable {
var (
limit = request.CPULimit()
b1 = score1.Supply().AllocatableSharedCPU()
b2 = score2.Supply().AllocatableSharedCPU()
r1 = b1 - limit
r2 = b2 - limit
)

log.Debug(" - CPU burstability %s=%d, %s=%d, limit=%d",
node1.Name(), b1, node2.Name(), b2, limit)

if limit != unlimitedCPU {
// prefer pool with enough burstable capacity
switch {
case r1 >= 0 && r2 < 0:
log.Debug(" - %s loses on insufficient CPU burstability (%d vs. %d for limit %d)",
node2.Name(), b1, b2, limit)
return true
case r2 >= 0 && r1 < 0:
log.Debug(" - %s loses on insufficient CPU burstability", node1.Name())
return false
default:
log.Debug(" - CPU burstability is a TIE")
}
} else {
// prefer pool with more burstable capacity
switch {
case b1 > b2:
log.Debug(" - %s WINS on more CPU burstability", node1.Name())
return true
case b2 > b1:
log.Debug(" - %s WINS on more CPU burstability", node2.Name())
return false
default:
log.Debug(" - CPU burstability is a TIE")
}
}
}

// tighter memory offer wins
m1, m2 := o1.NodeMask(), o2.NodeMask()
if m1.Size() < m2.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node2.Name(), m2, m1)
return true
}
if m2.Size() < m1.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node1.Name(), m1, m2)
return false
}
if m2.Size() == m1.Size() {
log.Debug(" - memory offers are a TIE (%s vs. %s)", m1, m2)
}

// matching memory type wins
if reqType := request.MemoryType(); reqType != memoryUnspec && reqType != memoryPreserve {
if node1.HasMemoryType(reqType) && !node2.HasMemoryType(reqType) {
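(Aside, not part of the diff: a minimal sketch of the pool-ordering rule the new burstability branch implements, assuming b1 and b2 are the allocatable shared milli-CPU of two candidate pools and limit is the container's milli-CPU limit; preferFirst is a hypothetical helper returning +1 if the first pool wins, -1 if the second wins, and 0 for a tie that falls through to the remaining criteria.)

package main

import (
	"fmt"
	"math"
)

const unlimitedCPU = math.MaxInt

// preferFirst orders two pools by CPU burstability: for a finite limit, a
// pool that can cover the whole limit beats one that cannot; for an
// unlimited container, more shared headroom wins; everything else is a tie.
func preferFirst(b1, b2, limit int) int {
	if limit != unlimitedCPU {
		switch {
		case b1 >= limit && b2 < limit:
			return +1
		case b2 >= limit && b1 < limit:
			return -1
		}
		return 0
	}
	switch {
	case b1 > b2:
		return +1
	case b2 > b1:
		return -1
	}
	return 0
}

func main() {
	fmt.Println(preferFirst(6000, 3000, 5000))         // +1: only the first pool fits a 5000m limit
	fmt.Println(preferFirst(6000, 3000, unlimitedCPU)) // +1: more headroom wins for an unlimited container
	fmt.Println(preferFirst(3000, 4000, 2000))         // 0: both fit the limit, later criteria decide
}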
11 changes: 10 additions & 1 deletion cmd/plugins/topology-aware/policy/resources.go
@@ -107,6 +107,8 @@ type Request interface {
FullCPUs() int
// CPUFraction returns the amount of fractional milli-CPU requested.
CPUFraction() int
// CPULimit returns the CPU limit in milli-CPU.
CPULimit() int
// Isolate returns whether isolated CPUs are preferred for this request.
Isolate() bool
// MemoryType returns the type(s) of requested memory.
@@ -223,6 +225,7 @@ type request struct {
container cache.Container // container for this request
full int // number of full CPUs requested
fraction int // amount of fractional CPU requested
limit int // CPU limit, MaxInt for no limit
isolate bool // prefer isolated exclusive CPUs
cpuType cpuClass // preferred CPU type (normal, reserved)
prio cpuPrio // CPU priority preference, ignored for fraction requests
@@ -715,7 +718,7 @@ func prettyMem(value int64) string {
// newRequest creates a new request for the given container.
func newRequest(container cache.Container, types libmem.TypeMask) Request {
pod, _ := container.GetPod()
full, fraction, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
full, fraction, cpuLimit, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
req, lim, mtype := memoryAllocationPreference(pod, container)
coldStart := time.Duration(0)

@@ -752,6 +755,7 @@ func newRequest(container cache.Container, types libmem.TypeMask) Request {
container: container,
full: full,
fraction: fraction,
limit: cpuLimit,
isolate: isolate,
cpuType: cpuType,
memReq: req,
@@ -815,6 +819,11 @@ func (cr *request) CPUFraction() int {
return cr.fraction
}

// CPULimit returns the CPU limit in milli-CPU.
func (cr *request) CPULimit() int {
return cr.limit
}

// Isolate returns whether isolated CPUs are preferred for this request.
func (cr *request) Isolate() bool {
return cr.isolate
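(Aside, not part of the diff: an illustrative consumer-side check on the new accessor, assuming it lives in the same topologyaware package so Request and unlimitedCPU are in scope; fitsPool and freeSharedMilliCPU are hypothetical names.)

// fitsPool sketches how a scoring path can gate on the CPU limit: an
// unlimited Burstable request fits any pool (bigger pools are merely
// preferred), while a finite limit needs enough free shared capacity.
func fitsPool(req Request, freeSharedMilliCPU int) bool {
	limit := req.CPULimit()
	if limit == unlimitedCPU {
		return true
	}
	return freeSharedMilliCPU >= limit
}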
2 changes: 2 additions & 0 deletions test/e2e/files/burstable.yaml.in
@@ -31,7 +31,9 @@ spec:
cpu: ${CPUREQ}
memory: ${MEMREQ}
limits:
$( ( [ -n "$CPULIM" ] && [ "$CPULIM" != "0" ] ) && echo "
cpu: ${CPULIM}
")
$( ( [ -n "$MEMLIM" ] && [ "$MEMLIM" != "0" ] ) && echo "
memory: ${MEMLIM}
")
Original file line number Diff line number Diff line change
@@ -54,7 +54,7 @@ vm-command "kubectl delete pods --all --now"

# pod2: Test that 4 burstable containers not eligible for isolated/exclusive CPU allocation
# get evenly spread over NUMA nodes.
CONTCOUNT=4 CPUREQ=2 CPULIM=4 create burstable
CONTCOUNT=4 CPUREQ=2 CPULIM=2002m create burstable
report allowed
verify \
'disjoint_sets(cpus["pod2c0"], cpus["pod2c1"], cpus["pod2c2"], cpus["pod2c3"])' \
Original file line number Diff line number Diff line change
@@ -24,4 +24,27 @@ verify \
'len(nodes["pod2c0"]) == 2' \
'len(nodes["pod3c0"]) == 4'

vm-command "kubectl delete pods --all --now"

helm-terminate
helm_config=$(COLOCATE_PODS=false instantiate helm-config.yaml) helm-launch topology-aware

# Limited burstable containers get assigned to the lowest pool where
# there is enough free capacity for their limit. In this case, a socket.
CONTCOUNT=2 CPUREQ=2 CPULIM=5 MEMREQ=100M create burstable
report allowed
verify \
'nodes["pod4c0"] == { "node2" ,"node3" }' \
'nodes["pod4c1"] == { "node0", "node1" }'

# Unlimited burstable containers get allocated to the root pool unless
# affinity dictates otherwise.
CONTCOUNT=2 CPUREQ=2 CPULIM=0 MEMREQ=100M create burstable
report allowed
verify \
'nodes["pod5c0"] == { "node0", "node1", "node2", "node3" }' \
'nodes["pod5c1"] == { "node0", "node1", "node2", "node3" }'

vm-command "kubectl delete pods --all --now"

helm-terminate