From 827e7d579d3e530daad93a6494b1fe4f88a246eb Mon Sep 17 00:00:00 2001
From: Krisztian Litkey
Date: Tue, 2 Sep 2025 13:28:51 +0300
Subject: [PATCH 1/2] topology-aware: pick pools by burstable CPU limit.

When picking a pool for a burstable QoS class container, take the CPU
limit into account, too. For limited containers, prefer pools with
enough free capacity for the limit. For unlimited containers, prefer
pools with more free capacity left, which in practice means that such
containers will typically end up in the root pool, unless affinity or
topology hints dictate otherwise.

Signed-off-by: Krisztian Litkey
---
 .../topology-aware/policy/pod-preferences.go  | 52 +++++++++----
 .../policy/pod-preferences_test.go            |  2 +-
 cmd/plugins/topology-aware/policy/pools.go    | 73 +++++++++++++++----
 .../topology-aware/policy/resources.go        | 11 ++-
 4 files changed, 106 insertions(+), 32 deletions(-)

diff --git a/cmd/plugins/topology-aware/policy/pod-preferences.go b/cmd/plugins/topology-aware/policy/pod-preferences.go
index a4d9f080d..37ce1fa77 100644
--- a/cmd/plugins/topology-aware/policy/pod-preferences.go
+++ b/cmd/plugins/topology-aware/policy/pod-preferences.go
@@ -17,6 +17,7 @@ package topologyaware
 import (
     "encoding/json"
     "fmt"
+    "math"
     "path/filepath"
     "strconv"
     "strings"
@@ -58,6 +59,8 @@ const (
     hideHyperthreadsKey = keyHideHyperthreads + "." + kubernetes.ResmgrKeyNamespace
     // effective annotation key for picking resources by topology hints
     pickResourcesByHints = keyPickResourcesByHints + "." + kubernetes.ResmgrKeyNamespace
+
+    unlimitedCPU = math.MaxInt // 'unlimited' burstable CPU limit
 )
 
 type prefKind int
@@ -308,10 +311,11 @@ func checkReservedCPUsAnnotations(c cache.Container) (bool, bool) {
 // Returned values:
 // 1. full: number of full CPUs
 // 2. fraction: amount of fractional CPU in milli-CPU
-// 3. isolate: (bool) whether to prefer isolated full CPUs
-// 4. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
-// 5. cpuPrio: preferred CPU allocator priority for CPU allocation.
-func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass, cpuPrio) {
+// 3. limit: CPU limit for this container
+// 4. isolate: (bool) whether to prefer isolated full CPUs
+// 5. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
+// 6. cpuPrio: preferred CPU allocator priority for CPU allocation.
+func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, int, bool, cpuClass, cpuPrio) {
     //
     // CPU allocation preferences for a container consist of
     //
@@ -381,52 +385,68 @@ func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, in
     qosClass := pod.GetQOSClass()
     fraction := int(request.MilliValue())
     prio := defaultPrio // ignored for fractional allocations
+    limit := 0
+
+    switch qosClass {
+    case corev1.PodQOSBestEffort:
+    case corev1.PodQOSBurstable:
+        if lim, ok := reqs.Limits[corev1.ResourceCPU]; ok {
+            limit = int(lim.MilliValue())
+        } else {
+            limit = unlimitedCPU
+        }
+    case corev1.PodQOSGuaranteed:
+        if lim, ok := reqs.Limits[corev1.ResourceCPU]; ok {
+            limit = int(lim.MilliValue())
+        }
+    }
 
     // easy cases: kube-system namespace, Burstable or BestEffort QoS class containers
     preferReserved, explicitReservation := checkReservedCPUsAnnotations(container)
     switch {
     case container.PreserveCpuResources():
-        return 0, fraction, false, cpuPreserve, prio
+        return 0, fraction, limit, false, cpuPreserve, prio
     case preferReserved:
-        return 0, fraction, false, cpuReserved, prio
+        return 0, fraction, limit, false, cpuReserved, prio
     case checkReservedPoolNamespaces(namespace) && !explicitReservation:
-        return 0, fraction, false, cpuReserved, prio
+        return 0, fraction, limit, false, cpuReserved, prio
     case qosClass == corev1.PodQOSBurstable:
-        return 0, fraction, false, cpuNormal, prio
+        return 0, fraction, limit, false, cpuNormal, prio
     case qosClass == corev1.PodQOSBestEffort:
-        return 0, 0, false, cpuNormal, prio
+        return 0, 0, 0, false, cpuNormal, prio
     }
 
     // complex case: Guaranteed QoS class containers
     cores := fraction / 1000
     fraction = fraction % 1000
+    limit = 1000*cores + fraction
     preferIsolated, isolPrefKind := isolatedCPUsPreference(pod, container)
     preferShared, sharedPrefKind := sharedCPUsPreference(pod, container)
     prio = cpuPrioPreference(pod, container, defaultPrio) // ignored for fractional allocations
 
     switch {
     case cores == 0: // sub-core CPU request
-        return 0, fraction, false, cpuNormal, prio
+        return 0, fraction, limit, false, cpuNormal, prio
     case cores < 2: // 1 <= CPU request < 2
         if preferShared {
-            return 0, 1000*cores + fraction, false, cpuNormal, prio
+            return 0, 1000*cores + fraction, limit, false, cpuNormal, prio
         }
         // potentially mixed allocation (1 core + some fraction)
-        return cores, fraction, preferIsolated, cpuNormal, prio
+        return cores, fraction, limit, preferIsolated, cpuNormal, prio
     default: // CPU request >= 2
         // fractional allocation, only mixed if explicitly annotated as unshared
         if fraction > 0 {
            if !preferShared && sharedPrefKind == prefAnnotated {
-                return cores, fraction, preferIsolated, cpuNormal, prio
+                return cores, fraction, limit, preferIsolated, cpuNormal, prio
            }
-            return 0, 1000*cores + fraction, false, cpuNormal, prio
+            return 0, 1000*cores + fraction, limit, false, cpuNormal, prio
         }
         // non-fractional allocation
         if preferShared {
-            return 0, 1000 * cores, false, cpuNormal, prio
+            return 0, 1000 * cores, limit, false, cpuNormal, prio
         }
         // for multiple cores, isolated preference must be explicitly annotated
-        return cores, 0, preferIsolated && isolPrefKind == prefAnnotated, cpuNormal, prio
+        return cores, 0, limit, preferIsolated && isolPrefKind == prefAnnotated, cpuNormal, prio
     }
 }
diff --git a/cmd/plugins/topology-aware/policy/pod-preferences_test.go b/cmd/plugins/topology-aware/policy/pod-preferences_test.go
index 5e03a5982..ca7933594 100644
--- a/cmd/plugins/topology-aware/policy/pod-preferences_test.go
+++ b/cmd/plugins/topology-aware/policy/pod-preferences_test.go
@@ -1052,7 +1052,7 @@ func TestCpuAllocationPreferences(t *testing.T) {
             }
             opt.PreferIsolated, opt.PreferShared = &tc.preferIsolated, &tc.preferShared
             opt.ReservedPoolNamespaces = tc.reservedPoolNamespaces
-            full, fraction, isolate, cpuType, _ := cpuAllocationPreferences(tc.pod, tc.container)
+            full, fraction, _, isolate, cpuType, _ := cpuAllocationPreferences(tc.pod, tc.container)
             require.Equal(t, tc.expectedFull, full, "full CPU cores")
             require.Equal(t, tc.expectedFraction, fraction, "CPU core fraction")
             require.Equal(t, tc.expectedIsolate, isolate, "isolation preference")
diff --git a/cmd/plugins/topology-aware/policy/pools.go b/cmd/plugins/topology-aware/policy/pools.go
index 60290b7b7..4009cfb26 100644
--- a/cmd/plugins/topology-aware/policy/pools.go
+++ b/cmd/plugins/topology-aware/policy/pools.go
@@ -20,6 +20,7 @@ import (
     "sort"
 
     "github.com/containers/nri-plugins/pkg/utils/cpuset"
+    corev1 "k8s.io/api/core/v1"
 
     "github.com/containers/nri-plugins/pkg/resmgr/cache"
     libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory"
@@ -673,7 +674,9 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
     //  - if we have topology hints
     //    * better hint score wins
     //    * for a tie, prefer the lower node then the smaller id
-    //  - if we have a better matching or tighter fitting memory offer, it wins
+    //  - if we have a better matching memory offer, it wins
+    //  - if we have a burstable container, sufficient capacity for the limit wins
+    //  - if we have a tighter fitting memory offer, it wins
     //  - if only one node matches the memory type request, it wins
     //  - for low-prio and high-prio CPU preference, if only one node has such CPUs, it wins
     //  - if a node is lower in the tree it wins
@@ -772,7 +775,7 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
         }
     }
 
-    // better matching or tighter memory offer wins
+    // better matching memory offer wins
     switch {
     case o1 != nil && o2 == nil:
         log.Debug("  => %s loses on memory offer (failed offer)", node2.Name())
@@ -809,22 +812,64 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
             }
             log.Debug("  - memory offers burstability are a TIE")
         }
+    }
 
-        if m1.Size() < m2.Size() {
-            log.Debug("  - %s loses on memory offer (%s less tight than %s)",
-                node2.Name(), m2, m1)
-            return true
-        }
-        if m2.Size() < m1.Size() {
-            log.Debug("  - %s loses on memory offer (%s less tight than %s)",
-                node1.Name(), m1, m2)
-            return false
-        }
-        if m2.Size() == m1.Size() {
-            log.Debug("  - memory offers are a TIE (%s vs. %s)", m1, m2)
+    if request.GetContainer().GetQOSClass() == corev1.PodQOSBurstable {
+        var (
+            limit = request.CPULimit()
+            b1    = score1.Supply().AllocatableSharedCPU()
+            b2    = score2.Supply().AllocatableSharedCPU()
+            r1    = b1 - limit
+            r2    = b2 - limit
+        )
+
+        log.Debug("  - CPU burstability %s=%d, %s=%d, limit=%d",
+            node1.Name(), b1, node2.Name(), b2, limit)
+
+        if limit != unlimitedCPU {
+            // prefer pool with enough burstable capacity
+            switch {
+            case r1 >= 0 && r2 < 0:
+                log.Debug("  - %s loses on insufficient CPU burstability (%d vs. %d for limit %d)",
+                    node2.Name(), b1, b2, limit)
+                return true
+            case r2 >= 0 && r1 < 0:
+                log.Debug("  - %s loses on insufficient CPU burstability", node1.Name())
+                return false
+            default:
+                log.Debug("  - CPU burstability is a TIE")
+            }
+        } else {
+            // prefer pool with more burstable capacity
+            switch {
+            case b1 > b2:
+                log.Debug("  - %s WINS on more CPU burstability", node1.Name())
+                return true
+            case b2 > b1:
+                log.Debug("  - %s WINS on more CPU burstability", node2.Name())
+                return false
+            default:
+                log.Debug("  - CPU burstability is a TIE")
+            }
         }
     }
 
+    // tighter memory offer wins
+    m1, m2 := o1.NodeMask(), o2.NodeMask()
+    if m1.Size() < m2.Size() {
+        log.Debug("  - %s loses on memory offer (%s less tight than %s)",
+            node2.Name(), m2, m1)
+        return true
+    }
+    if m2.Size() < m1.Size() {
+        log.Debug("  - %s loses on memory offer (%s less tight than %s)",
+            node1.Name(), m1, m2)
+        return false
+    }
+    if m2.Size() == m1.Size() {
+        log.Debug("  - memory offers are a TIE (%s vs. %s)", m1, m2)
+    }
+
     // matching memory type wins
     if reqType := request.MemoryType(); reqType != memoryUnspec && reqType != memoryPreserve {
         if node1.HasMemoryType(reqType) && !node2.HasMemoryType(reqType) {
diff --git a/cmd/plugins/topology-aware/policy/resources.go b/cmd/plugins/topology-aware/policy/resources.go
index 5ea4f94cf..ca234b04f 100644
--- a/cmd/plugins/topology-aware/policy/resources.go
+++ b/cmd/plugins/topology-aware/policy/resources.go
@@ -107,6 +107,8 @@ type Request interface {
     FullCPUs() int
     // CPUFraction returns the amount of fractional milli-CPU requested.
     CPUFraction() int
+    // CPULimit returns the CPU limit in milli-CPU.
+    CPULimit() int
     // Isolate returns whether isolated CPUs are preferred for this request.
     Isolate() bool
     // MemoryType returns the type(s) of requested memory.
@@ -223,6 +225,7 @@ type request struct {
     container cache.Container // container for this request
     full      int             // number of full CPUs requested
     fraction  int             // amount of fractional CPU requested
+    limit     int             // CPU limit, MaxInt for no limit
     isolate   bool            // prefer isolated exclusive CPUs
     cpuType   cpuClass        // preferred CPU type (normal, reserved)
     prio      cpuPrio         // CPU priority preference, ignored for fraction requests
@@ -715,7 +718,7 @@ func prettyMem(value int64) string {
 // newRequest creates a new request for the given container.
 func newRequest(container cache.Container, types libmem.TypeMask) Request {
     pod, _ := container.GetPod()
-    full, fraction, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
+    full, fraction, cpuLimit, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
     req, lim, mtype := memoryAllocationPreference(pod, container)
 
     coldStart := time.Duration(0)
@@ -752,6 +755,7 @@ func newRequest(container cache.Container, types libmem.TypeMask) Request {
         container: container,
         full:      full,
         fraction:  fraction,
+        limit:     cpuLimit,
         isolate:   isolate,
         cpuType:   cpuType,
         memReq:    req,
@@ -815,6 +819,11 @@ func (cr *request) CPUFraction() int {
     return cr.fraction
 }
 
+// CPULimit returns the CPU limit in milli-CPU.
+func (cr *request) CPULimit() int {
+    return cr.limit
+}
+
 // Isolate returns whether isolated CPUs are preferred for this request.
 func (cr *request) Isolate() bool {
     return cr.isolate

From 21943c5a4597385b0127c6b4ab911c988d0b0684 Mon Sep 17 00:00:00 2001
From: Krisztian Litkey
Date: Tue, 2 Sep 2025 15:58:41 +0300
Subject: [PATCH 2/2] e2e: update topology-aware test for burstable placement.

Update existing burstable placement test for the altered behavior.
Add a new test to verify CPU-limit-based placement of limited and
unlimited burstable containers.

Signed-off-by: Krisztian Litkey
---
 test/e2e/files/burstable.yaml.in              |  2 ++
 .../n4c16/test00-basic-placement/code.var.sh  |  2 +-
 .../n4c16/test14-burstable/code.var.sh        | 23 +++++++++++++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/test/e2e/files/burstable.yaml.in b/test/e2e/files/burstable.yaml.in
index c311fbb08..aaa509933 100644
--- a/test/e2e/files/burstable.yaml.in
+++ b/test/e2e/files/burstable.yaml.in
@@ -31,7 +31,9 @@ spec:
           cpu: ${CPUREQ}
           memory: ${MEMREQ}
         limits:
+          $( ( [ -n "$CPULIM" ] && [ "$CPULIM" != "0" ] ) && echo "
           cpu: ${CPULIM}
+          ")
           $( ( [ -n "$MEMLIM" ] && [ "$MEMLIM" != "0" ] ) && echo "
           memory: ${MEMLIM}
           ")
diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh b/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
index 4d5feef2b..9d68c3481 100644
--- a/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
+++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
@@ -54,7 +54,7 @@ vm-command "kubectl delete pods --all --now"
 
 # pod2: Test that 4 burstable containers not eligible for isolated/exclusive CPU allocation
 # gets evenly spread over NUMA nodes.
-CONTCOUNT=4 CPUREQ=2 CPULIM=4 create burstable
+CONTCOUNT=4 CPUREQ=2 CPULIM=2002m create burstable
 report allowed
 verify \
     'disjoint_sets(cpus["pod2c0"], cpus["pod2c1"], cpus["pod2c2"], cpus["pod2c3"])' \
diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test14-burstable/code.var.sh b/test/e2e/policies.test-suite/topology-aware/n4c16/test14-burstable/code.var.sh
index 8084435b6..a9845be01 100644
--- a/test/e2e/policies.test-suite/topology-aware/n4c16/test14-burstable/code.var.sh
+++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test14-burstable/code.var.sh
@@ -24,4 +24,27 @@ verify \
     'len(nodes["pod2c0"]) == 2' \
     'len(nodes["pod3c0"]) == 4'
 
+vm-command "kubectl delete pods --all --now"
+
+helm-terminate
+helm_config=$(COLOCATE_PODS=false instantiate helm-config.yaml) helm-launch topology-aware
+
+# Limited burstable containers get assigned to the lowest pool with
+# enough free capacity for their limit; in this case, a socket.
+CONTCOUNT=2 CPUREQ=2 CPULIM=5 MEMREQ=100M create burstable
+report allowed
+verify \
+    'nodes["pod4c0"] == { "node2", "node3" }' \
+    'nodes["pod4c1"] == { "node0", "node1" }'
+
+# Unlimited burstable containers get allocated to the root pool unless
+# affinity dictates otherwise.
+CONTCOUNT=2 CPUREQ=2 CPULIM=0 MEMREQ=100M create burstable
+report allowed
+verify \
+    'nodes["pod5c0"] == { "node0", "node1", "node2", "node3" }' \
+    'nodes["pod5c1"] == { "node0", "node1", "node2", "node3" }'
+
+vm-command "kubectl delete pods --all --now"
+
 helm-terminate
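
As a quick illustration of the pool-scoring rule that the first patch adds to compareScores(), here is a minimal standalone Go sketch of the burstable comparison. The function name, the decided/tie return convention, and the capacity numbers are illustrative assumptions rather than part of the patch; only the decision logic mirrors the patched code.

package main

import (
    "fmt"
    "math"
)

// unlimitedCPU marks a burstable container with no CPU limit, mirroring the
// sentinel introduced in pod-preferences.go. All quantities are in milli-CPU.
const unlimitedCPU = math.MaxInt

// preferFirst compares two pools by their free shared CPU capacity against a
// burstable container's CPU limit. It reports which pool wins, or decided=false
// when this rule is a tie and later scoring criteria must decide.
func preferFirst(free1, free2, limit int) (firstWins, decided bool) {
    if limit != unlimitedCPU {
        // Limited container: a pool with enough free capacity for the limit wins.
        r1, r2 := free1-limit, free2-limit
        switch {
        case r1 >= 0 && r2 < 0:
            return true, true
        case r2 >= 0 && r1 < 0:
            return false, true
        }
        return false, false
    }
    // Unlimited container: the pool with more free capacity wins, which in
    // practice tends to favor the root pool.
    switch {
    case free1 > free2:
        return true, true
    case free2 > free1:
        return false, true
    }
    return false, false
}

func main() {
    // Hypothetical capacities: a NUMA node with 3 free CPUs vs. the root pool with 14.
    fmt.Println(preferFirst(3000, 14000, 5000))         // limit 5 CPUs: only the second pool fits it
    fmt.Println(preferFirst(3000, 14000, 2000))         // limit 2 CPUs: both fit, tie
    fmt.Println(preferFirst(3000, 14000, unlimitedCPU)) // no limit: more free capacity wins
}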