diff --git a/cmd/plugins/topology-aware/policy/pod-preferences.go b/cmd/plugins/topology-aware/policy/pod-preferences.go
index a4d9f080d..37ce1fa77 100644
--- a/cmd/plugins/topology-aware/policy/pod-preferences.go
+++ b/cmd/plugins/topology-aware/policy/pod-preferences.go
@@ -17,6 +17,7 @@ package topologyaware
 import (
     "encoding/json"
     "fmt"
+    "math"
     "path/filepath"
     "strconv"
     "strings"
@@ -58,6 +59,8 @@ const (
     hideHyperthreadsKey = keyHideHyperthreads + "." + kubernetes.ResmgrKeyNamespace
     // effective annotation key for picking resources by topology hints
     pickResourcesByHints = keyPickResourcesByHints + "." + kubernetes.ResmgrKeyNamespace
+
+    unlimitedCPU = math.MaxInt // 'unlimited' burstable CPU limit
 )
 
 type prefKind int
@@ -308,10 +311,11 @@ func checkReservedCPUsAnnotations(c cache.Container) (bool, bool) {
 // Returned values:
 // 1. full: number of full CPUs
 // 2. fraction: amount of fractional CPU in milli-CPU
-// 3. isolate: (bool) whether to prefer isolated full CPUs
-// 4. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
-// 5. cpuPrio: preferred CPU allocator priority for CPU allocation.
-func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass, cpuPrio) {
+// 3. limit: CPU limit for this container
+// 4. isolate: (bool) whether to prefer isolated full CPUs
+// 5. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
+// 6. cpuPrio: preferred CPU allocator priority for CPU allocation.
+func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, int, bool, cpuClass, cpuPrio) {
     //
     // CPU allocation preferences for a container consist of
     //
@@ -381,52 +385,68 @@ func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, in
     qosClass := pod.GetQOSClass()
     fraction := int(request.MilliValue())
     prio := defaultPrio // ignored for fractional allocations
+    limit := 0
+
+    switch qosClass {
+    case corev1.PodQOSBestEffort:
+    case corev1.PodQOSBurstable:
+        if lim, ok := reqs.Limits[corev1.ResourceCPU]; ok {
+            limit = int(lim.MilliValue())
+        } else {
+            limit = unlimitedCPU
+        }
+    case corev1.PodQOSGuaranteed:
+        if lim, ok := reqs.Limits[corev1.ResourceCPU]; ok {
+            limit = int(lim.MilliValue())
+        }
+    }
 
     // easy cases: kube-system namespace, Burstable or BestEffort QoS class containers
     preferReserved, explicitReservation := checkReservedCPUsAnnotations(container)
     switch {
     case container.PreserveCpuResources():
-        return 0, fraction, false, cpuPreserve, prio
+        return 0, fraction, limit, false, cpuPreserve, prio
     case preferReserved:
-        return 0, fraction, false, cpuReserved, prio
+        return 0, fraction, limit, false, cpuReserved, prio
     case checkReservedPoolNamespaces(namespace) && !explicitReservation:
-        return 0, fraction, false, cpuReserved, prio
+        return 0, fraction, limit, false, cpuReserved, prio
     case qosClass == corev1.PodQOSBurstable:
-        return 0, fraction, false, cpuNormal, prio
+        return 0, fraction, limit, false, cpuNormal, prio
     case qosClass == corev1.PodQOSBestEffort:
-        return 0, 0, false, cpuNormal, prio
+        return 0, 0, 0, false, cpuNormal, prio
     }
 
     // complex case: Guaranteed QoS class containers
     cores := fraction / 1000
     fraction = fraction % 1000
+    limit = 1000*cores + fraction
     preferIsolated, isolPrefKind := isolatedCPUsPreference(pod, container)
     preferShared, sharedPrefKind := sharedCPUsPreference(pod, container)
     prio = cpuPrioPreference(pod, container, defaultPrio) // ignored for fractional allocations
 
     switch {
     case cores == 0: // sub-core CPU request
-        return 0, fraction, false, cpuNormal, prio
+        return 0, fraction, limit, false, cpuNormal, prio
     case cores < 2: // 1 <= CPU request < 2
         if preferShared {
-            return 0, 1000*cores + fraction, false, cpuNormal, prio
+            return 0, 1000*cores + fraction, limit, false, cpuNormal, prio
         }
         // potentially mixed allocation (1 core + some fraction)
-        return cores, fraction, preferIsolated, cpuNormal, prio
+        return cores, fraction, limit, preferIsolated, cpuNormal, prio
     default: // CPU request >= 2
         // fractional allocation, only mixed if explicitly annotated as unshared
         if fraction > 0 {
             if !preferShared && sharedPrefKind == prefAnnotated {
-                return cores, fraction, preferIsolated, cpuNormal, prio
+                return cores, fraction, limit, preferIsolated, cpuNormal, prio
             }
-            return 0, 1000*cores + fraction, false, cpuNormal, prio
+            return 0, 1000*cores + fraction, limit, false, cpuNormal, prio
         }
         // non-fractional allocation
         if preferShared {
-            return 0, 1000 * cores, false, cpuNormal, prio
+            return 0, 1000 * cores, limit, false, cpuNormal, prio
         }
         // for multiple cores, isolated preference must be explicitly annotated
-        return cores, 0, preferIsolated && isolPrefKind == prefAnnotated, cpuNormal, prio
+        return cores, 0, limit, preferIsolated && isolPrefKind == prefAnnotated, cpuNormal, prio
     }
 }
 
diff --git a/cmd/plugins/topology-aware/policy/pod-preferences_test.go b/cmd/plugins/topology-aware/policy/pod-preferences_test.go
index 5e03a5982..ca7933594 100644
--- a/cmd/plugins/topology-aware/policy/pod-preferences_test.go
+++ b/cmd/plugins/topology-aware/policy/pod-preferences_test.go
@@ -1052,7 +1052,7 @@ func TestCpuAllocationPreferences(t *testing.T) {
             }
             opt.PreferIsolated, opt.PreferShared = &tc.preferIsolated, &tc.preferShared
             opt.ReservedPoolNamespaces = tc.reservedPoolNamespaces
-            full, fraction, isolate, cpuType, _ := cpuAllocationPreferences(tc.pod, tc.container)
+            full, fraction, _, isolate, cpuType, _ := cpuAllocationPreferences(tc.pod, tc.container)
             require.Equal(t, tc.expectedFull, full, "full CPU cores")
             require.Equal(t, tc.expectedFraction, fraction, "CPU core fraction")
             require.Equal(t, tc.expectedIsolate, isolate, "isolation preference")
diff --git a/cmd/plugins/topology-aware/policy/pools.go b/cmd/plugins/topology-aware/policy/pools.go
index 60290b7b7..4009cfb26 100644
--- a/cmd/plugins/topology-aware/policy/pools.go
+++ b/cmd/plugins/topology-aware/policy/pools.go
@@ -20,6 +20,7 @@ import (
     "sort"
 
     "github.com/containers/nri-plugins/pkg/utils/cpuset"
+    corev1 "k8s.io/api/core/v1"
 
     "github.com/containers/nri-plugins/pkg/resmgr/cache"
     libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory"
@@ -673,7 +674,9 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
     //   - if we have topology hints
     //     * better hint score wins
     //     * for a tie, prefer the lower node then the smaller id
-    //   - if we have a better matching or tighter fitting memory offer, it wins
+    //   - if we have a better matching memory offer, it wins
+    //   - if we have a burstable container, sufficient capacity for the limit wins
+    //   - if we have a tighter fitting memory offer, it wins
     //   - if only one node matches the memory type request, it wins
     //   - for low-prio and high-prio CPU preference, if only one node has such CPUs, it wins
     //   - if a node is lower in the tree it wins
@@ -772,7 +775,7 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
         }
     }
 
-    // better matching or tighter memory offer wins
+    // better matching offer wins
     switch {
     case o1 != nil && o2 == nil:
         log.Debug(" => %s loses on memory offer (failed offer)", node2.Name())
(failed offer)", node2.Name()) @@ -809,22 +812,64 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco } log.Debug(" - memory offers burstability are a TIE") } + } - if m1.Size() < m2.Size() { - log.Debug(" - %s loses on memory offer (%s less tight than %s)", - node2.Name(), m2, m1) - return true - } - if m2.Size() < m1.Size() { - log.Debug(" - %s loses on memory offer (%s less tight than %s)", - node1.Name(), m1, m2) - return false - } - if m2.Size() == m1.Size() { - log.Debug(" - memory offers are a TIE (%s vs. %s)", m1, m2) + if request.GetContainer().GetQOSClass() == corev1.PodQOSBurstable { + var ( + limit = request.CPULimit() + b1 = score1.Supply().AllocatableSharedCPU() + b2 = score2.Supply().AllocatableSharedCPU() + r1 = b1 - limit + r2 = b2 - limit + ) + + log.Debug(" - CPU burstability %s=%d, %s=%d, limit=%d", + node1.Name(), b1, node2.Name(), b2, limit) + + if limit != unlimitedCPU { + // prefer pool with enough burstable capacity + switch { + case r1 >= 0 && r2 < 0: + log.Debug(" - %s loses on insufficient CPU burstability (%d vs. %d for limit %d)", + node2.Name(), b1, b2, limit) + return true + case r2 >= 0 && r1 < 0: + log.Debug(" - %s loses on insufficient CPU burstability", node1.Name()) + return false + default: + log.Debug(" - CPU burstability is a TIE") + } + } else { + // prefer pool with more burstable capacity + switch { + case b1 > b2: + log.Debug(" - %s WINS on more CPU burstability", node1.Name()) + return true + case b2 > b1: + log.Debug(" - %s WINS on more CPU burstability", node2.Name()) + return false + default: + log.Debug(" - CPU burstability is a TIE") + } } } + // tighter memory offer wins + m1, m2 := o1.NodeMask(), o2.NodeMask() + if m1.Size() < m2.Size() { + log.Debug(" - %s loses on memory offer (%s less tight than %s)", + node2.Name(), m2, m1) + return true + } + if m2.Size() < m1.Size() { + log.Debug(" - %s loses on memory offer (%s less tight than %s)", + node1.Name(), m1, m2) + return false + } + if m2.Size() == m1.Size() { + log.Debug(" - memory offers are a TIE (%s vs. %s)", m1, m2) + } + // matching memory type wins if reqType := request.MemoryType(); reqType != memoryUnspec && reqType != memoryPreserve { if node1.HasMemoryType(reqType) && !node2.HasMemoryType(reqType) { diff --git a/cmd/plugins/topology-aware/policy/resources.go b/cmd/plugins/topology-aware/policy/resources.go index 5ea4f94cf..ca234b04f 100644 --- a/cmd/plugins/topology-aware/policy/resources.go +++ b/cmd/plugins/topology-aware/policy/resources.go @@ -107,6 +107,8 @@ type Request interface { FullCPUs() int // CPUFraction returns the amount of fractional milli-CPU requested. CPUFraction() int + // CPULimit returns the amount of fractional CPU limit. + CPULimit() int // Isolate returns whether isolated CPUs are preferred for this request. Isolate() bool // MemoryType returns the type(s) of requested memory. @@ -223,6 +225,7 @@ type request struct { container cache.Container // container for this request full int // number of full CPUs requested fraction int // amount of fractional CPU requested + limit int // CPU limit, MaxInt for no limit isolate bool // prefer isolated exclusive CPUs cpuType cpuClass // preferred CPU type (normal, reserved) prio cpuPrio // CPU priority preference, ignored for fraction requests @@ -715,7 +718,7 @@ func prettyMem(value int64) string { // newRequest creates a new request for the given container. 
 func newRequest(container cache.Container, types libmem.TypeMask) Request {
     pod, _ := container.GetPod()
-    full, fraction, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
+    full, fraction, cpuLimit, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
     req, lim, mtype := memoryAllocationPreference(pod, container)
     coldStart := time.Duration(0)
 
@@ -752,6 +755,7 @@ func newRequest(container cache.Container, types libmem.TypeMask) Request {
         container: container,
         full:      full,
         fraction:  fraction,
+        limit:     cpuLimit,
         isolate:   isolate,
         cpuType:   cpuType,
         memReq:    req,
@@ -815,6 +819,11 @@ func (cr *request) CPUFraction() int {
     return cr.fraction
 }
 
+// CPULimit returns the amount of fractional milli-CPU limit.
+func (cr *request) CPULimit() int {
+    return cr.limit
+}
+
 // Isolate returns whether isolated CPUs are preferred for this request.
 func (cr *request) Isolate() bool {
     return cr.isolate
diff --git a/test/e2e/files/burstable.yaml.in b/test/e2e/files/burstable.yaml.in
index c311fbb08..aaa509933 100644
--- a/test/e2e/files/burstable.yaml.in
+++ b/test/e2e/files/burstable.yaml.in
@@ -31,7 +31,9 @@ spec:
           cpu: ${CPUREQ}
           memory: ${MEMREQ}
         limits:
+$( ( [ -n "$CPULIM" ] && [ "$CPULIM" != "0" ] ) && echo "
           cpu: ${CPULIM}
+")
 $( ( [ -n "$MEMLIM" ] && [ "$MEMLIM" != "0" ] ) && echo "
           memory: ${MEMLIM}
 ")
diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh b/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
index 4d5feef2b..9d68c3481 100644
--- a/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
+++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
@@ -54,7 +54,7 @@ vm-command "kubectl delete pods --all --now"
 
 # pod2: Test that 4 burstable containers not eligible for isolated/exclusive CPU allocation
 # gets evenly spread over NUMA nodes.
-CONTCOUNT=4 CPUREQ=2 CPULIM=4 create burstable
+CONTCOUNT=4 CPUREQ=2 CPULIM=2002m create burstable
 report allowed
 verify \
     'disjoint_sets(cpus["pod2c0"], cpus["pod2c1"], cpus["pod2c2"], cpus["pod2c3"])' \
diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test14-burstable/code.var.sh b/test/e2e/policies.test-suite/topology-aware/n4c16/test14-burstable/code.var.sh
index 8084435b6..a9845be01 100644
--- a/test/e2e/policies.test-suite/topology-aware/n4c16/test14-burstable/code.var.sh
+++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test14-burstable/code.var.sh
@@ -24,4 +24,27 @@ verify \
     'len(nodes["pod2c0"]) == 2' \
     'len(nodes["pod3c0"]) == 4'
 
+vm-command "kubectl delete pods --all --now"
+
+helm-terminate
+helm_config=$(COLOCATE_PODS=false instantiate helm-config.yaml) helm-launch topology-aware
+
+# Limited burstable containers get assigned to the lowest pool where
+# there is enough free capacity for their limit. In this case, a socket.
+CONTCOUNT=2 CPUREQ=2 CPULIM=5 MEMREQ=100M create burstable
+report allowed
+verify \
+    'nodes["pod4c0"] == { "node2", "node3" }' \
+    'nodes["pod4c1"] == { "node0", "node1" }'
+
+# Unlimited burstable containers get allocated to the root pool unless
+# affinity dictates otherwise.
+CONTCOUNT=2 CPUREQ=2 CPULIM=0 MEMREQ=100M create burstable
+report allowed
+verify \
+    'nodes["pod5c0"] == { "node0", "node1", "node2", "node3" }' \
+    'nodes["pod5c1"] == { "node0", "node1", "node2", "node3" }'
+
+vm-command "kubectl delete pods --all --now"
+
 helm-terminate
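
For review context, here is a minimal standalone sketch of the pool-scoring tie-break this patch adds to compareScores: with a finite CPU limit, a pool that can accommodate the limit beats one that cannot; with an unlimited burstable limit, the pool with more allocatable shared CPU wins. The poolScore type, the burstablePreference helper, and the capacity figures below are hypothetical illustrations, not plugin APIs; the real code compares Score.Supply().AllocatableSharedCPU() against Request.CPULimit().

```go
// Standalone sketch (not plugin code) of the burstable CPU tie-break.
package main

import (
	"fmt"
	"math"
)

const unlimitedCPU = math.MaxInt // mirrors the 'unlimited' sentinel added by the patch

// poolScore is a hypothetical stand-in for a pool and its allocatable shared CPU (milli-CPU).
type poolScore struct {
	name            string
	allocatableMCPU int
}

// burstablePreference returns +1 if p1 should win, -1 if p2 should win, 0 for a tie.
// With a finite limit, a pool that can fit the limit beats one that cannot;
// with an unlimited limit, more allocatable shared CPU wins.
func burstablePreference(p1, p2 poolScore, limitMCPU int) int {
	if limitMCPU != unlimitedCPU {
		fits1, fits2 := p1.allocatableMCPU >= limitMCPU, p2.allocatableMCPU >= limitMCPU
		switch {
		case fits1 && !fits2:
			return +1
		case fits2 && !fits1:
			return -1
		}
		return 0
	}
	switch {
	case p1.allocatableMCPU > p2.allocatableMCPU:
		return +1
	case p2.allocatableMCPU > p1.allocatableMCPU:
		return -1
	}
	return 0
}

func main() {
	numa := poolScore{name: "NUMA node #2", allocatableMCPU: 3500}
	socket := poolScore{name: "socket #0", allocatableMCPU: 7500}

	// CPULIM=5 (5000m): only the socket fits the limit, so it wins (-1).
	fmt.Println("limit 5000m:", burstablePreference(numa, socket, 5000))
	// No limit: the pool with more burstable headroom wins, which tends to pull
	// unlimited burstable containers toward the root pool.
	fmt.Println("unlimited:  ", burstablePreference(numa, socket, unlimitedCPU))
}
```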