52 changes: 36 additions & 16 deletions cmd/plugins/topology-aware/policy/pod-preferences.go
@@ -17,6 +17,7 @@ package topologyaware
import (
"encoding/json"
"fmt"
"math"
"path/filepath"
"strconv"
"strings"
@@ -58,6 +59,8 @@ const (
hideHyperthreadsKey = keyHideHyperthreads + "." + kubernetes.ResmgrKeyNamespace
// effective annotation key for picking resources by topology hints
pickResourcesByHints = keyPickResourcesByHints + "." + kubernetes.ResmgrKeyNamespace

unlimitedCPU = math.MaxInt // 'unlimited' burstable CPU limit
)

type prefKind int
@@ -308,10 +311,11 @@ func checkReservedCPUsAnnotations(c cache.Container) (bool, bool) {
// Returned values:
// 1. full: number of full CPUs
// 2. fraction: amount of fractional CPU in milli-CPU
// 3. isolate: (bool) whether to prefer isolated full CPUs
// 4. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
// 5. cpuPrio: preferred CPU allocator priority for CPU allocation.
func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass, cpuPrio) {
// 3. limit: CPU limit for this container in milli-CPU
// 4. isolate: (bool) whether to prefer isolated full CPUs
// 5. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
// 6. cpuPrio: preferred CPU allocator priority for CPU allocation.
func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, int, bool, cpuClass, cpuPrio) {
//
// CPU allocation preferences for a container consist of
//
@@ -381,52 +385,68 @@ func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, in
qosClass := pod.GetQOSClass()
fraction := int(request.MilliValue())
prio := defaultPrio // ignored for fractional allocations
limit := 0

switch qosClass {
case corev1.PodQOSBestEffort:
case corev1.PodQOSBurstable:
if lim, ok := reqs.Limits[corev1.ResourceCPU]; ok {
limit = int(lim.MilliValue())
} else {
limit = unlimitedCPU
}
case corev1.PodQOSGuaranteed:
if lim, ok := reqs.Limits[corev1.ResourceCPU]; ok {
limit = int(lim.MilliValue())
}
}

// easy cases: kube-system namespace, Burstable or BestEffort QoS class containers
preferReserved, explicitReservation := checkReservedCPUsAnnotations(container)
switch {
case container.PreserveCpuResources():
return 0, fraction, false, cpuPreserve, prio
return 0, fraction, limit, false, cpuPreserve, prio
case preferReserved:
return 0, fraction, false, cpuReserved, prio
return 0, fraction, limit, false, cpuReserved, prio
case checkReservedPoolNamespaces(namespace) && !explicitReservation:
return 0, fraction, false, cpuReserved, prio
return 0, fraction, limit, false, cpuReserved, prio
case qosClass == corev1.PodQOSBurstable:
return 0, fraction, false, cpuNormal, prio
return 0, fraction, limit, false, cpuNormal, prio
case qosClass == corev1.PodQOSBestEffort:
return 0, 0, false, cpuNormal, prio
return 0, 0, 0, false, cpuNormal, prio
}

// complex case: Guaranteed QoS class containers
cores := fraction / 1000
fraction = fraction % 1000
limit = 1000*cores + fraction
preferIsolated, isolPrefKind := isolatedCPUsPreference(pod, container)
preferShared, sharedPrefKind := sharedCPUsPreference(pod, container)
prio = cpuPrioPreference(pod, container, defaultPrio) // ignored for fractional allocations

switch {
case cores == 0: // sub-core CPU request
return 0, fraction, false, cpuNormal, prio
return 0, fraction, limit, false, cpuNormal, prio
case cores < 2: // 1 <= CPU request < 2
if preferShared {
return 0, 1000*cores + fraction, false, cpuNormal, prio
return 0, 1000*cores + fraction, limit, false, cpuNormal, prio
}
// potentially mixed allocation (1 core + some fraction)
return cores, fraction, preferIsolated, cpuNormal, prio
return cores, fraction, limit, preferIsolated, cpuNormal, prio
default: // CPU request >= 2
// fractional allocation, only mixed if explicitly annotated as unshared
if fraction > 0 {
if !preferShared && sharedPrefKind == prefAnnotated {
return cores, fraction, preferIsolated, cpuNormal, prio
return cores, fraction, limit, preferIsolated, cpuNormal, prio
}
return 0, 1000*cores + fraction, false, cpuNormal, prio
return 0, 1000*cores + fraction, limit, false, cpuNormal, prio
}
// non-fractional allocation
if preferShared {
return 0, 1000 * cores, false, cpuNormal, prio
return 0, 1000 * cores, limit, false, cpuNormal, prio
}
// for multiple cores, isolated preference must be explicitly annotated
return cores, 0, preferIsolated && isolPrefKind == prefAnnotated, cpuNormal, prio
return cores, 0, limit, preferIsolated && isolPrefKind == prefAnnotated, cpuNormal, prio
}
}
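(Aside, not part of the diff: a minimal standalone sketch of how the milli-CPU limit computed in the QoS switch above behaves, assuming the Kubernetes core/v1 and apimachinery resource packages; deriveCPULimit and the example quantities are illustrative only.)

package main

import (
	"fmt"
	"math"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// unlimitedCPU marks a Burstable container that has no CPU limit set.
const unlimitedCPU = math.MaxInt

// deriveCPULimit mirrors the QoS switch above: a Burstable container without
// a CPU limit is treated as unlimited, otherwise the limit is taken in
// milli-CPU; BestEffort containers end up with 0.
func deriveCPULimit(qos corev1.PodQOSClass, limits corev1.ResourceList) int {
	lim, ok := limits[corev1.ResourceCPU]
	switch qos {
	case corev1.PodQOSBurstable:
		if !ok {
			return unlimitedCPU
		}
		return int(lim.MilliValue())
	case corev1.PodQOSGuaranteed:
		if ok {
			return int(lim.MilliValue()) // for Guaranteed this equals the request
		}
	}
	return 0
}

func main() {
	limited := corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("2500m")}
	fmt.Println(deriveCPULimit(corev1.PodQOSBurstable, limited))                               // 2500
	fmt.Println(deriveCPULimit(corev1.PodQOSBurstable, corev1.ResourceList{}) == unlimitedCPU) // true
}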

2 changes: 1 addition & 1 deletion cmd/plugins/topology-aware/policy/pod-preferences_test.go
@@ -1052,7 +1052,7 @@ func TestCpuAllocationPreferences(t *testing.T) {
}
opt.PreferIsolated, opt.PreferShared = &tc.preferIsolated, &tc.preferShared
opt.ReservedPoolNamespaces = tc.reservedPoolNamespaces
full, fraction, isolate, cpuType, _ := cpuAllocationPreferences(tc.pod, tc.container)
full, fraction, _, isolate, cpuType, _ := cpuAllocationPreferences(tc.pod, tc.container)
require.Equal(t, tc.expectedFull, full, "full CPU cores")
require.Equal(t, tc.expectedFraction, fraction, "CPU core fraction")
require.Equal(t, tc.expectedIsolate, isolate, "isolation preference")
73 changes: 59 additions & 14 deletions cmd/plugins/topology-aware/policy/pools.go
@@ -20,6 +20,7 @@ import (
"sort"

"github.com/containers/nri-plugins/pkg/utils/cpuset"
corev1 "k8s.io/api/core/v1"

"github.com/containers/nri-plugins/pkg/resmgr/cache"
libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory"
@@ -673,7 +674,9 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
// - if we have topology hints
// * better hint score wins
// * for a tie, prefer the lower node then the smaller id
// - if we have a better matching or tighter fitting memory offer, it wins
// - if we have a better matching memory offer, it wins
// - if we have a burstable container, sufficient capacity for the limit wins
// - if we have a tighter fitting memory offer, it wins
// - if only one node matches the memory type request, it wins
// - for low-prio and high-prio CPU preference, if only one node has such CPUs, it wins
// - if a node is lower in the tree it wins
@@ -772,7 +775,7 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
}
}

// better matching or tighter memory offer wins
// better matching offer wins
switch {
case o1 != nil && o2 == nil:
log.Debug(" => %s loses on memory offer (failed offer)", node2.Name())
Expand Down Expand Up @@ -809,22 +812,64 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
}
log.Debug(" - memory offers burstability are a TIE")
}
}

if m1.Size() < m2.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node2.Name(), m2, m1)
return true
}
if m2.Size() < m1.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node1.Name(), m1, m2)
return false
}
if m2.Size() == m1.Size() {
log.Debug(" - memory offers are a TIE (%s vs. %s)", m1, m2)
if request.GetContainer().GetQOSClass() == corev1.PodQOSBurstable {
var (
limit = request.CPULimit()
b1 = score1.Supply().AllocatableSharedCPU()
b2 = score2.Supply().AllocatableSharedCPU()
r1 = b1 - limit
r2 = b2 - limit
)

log.Debug(" - CPU burstability %s=%d, %s=%d, limit=%d",
node1.Name(), b1, node2.Name(), b2, limit)

if limit != unlimitedCPU {
// prefer pool with enough burstable capacity
switch {
case r1 >= 0 && r2 < 0:
log.Debug(" - %s loses on insufficient CPU burstability (%d vs. %d for limit %d)",
node2.Name(), b1, b2, limit)
return true
case r2 >= 0 && r1 < 0:
log.Debug(" - %s loses on insufficient CPU burstability", node1.Name())
return false
default:
log.Debug(" - CPU burstability is a TIE")
}
} else {
// prefer pool with more burstable capacity
switch {
case b1 > b2:
log.Debug(" - %s WINS on more CPU burstability", node1.Name())
return true
case b2 > b1:
log.Debug(" - %s WINS on more CPU burstability", node2.Name())
return false
default:
log.Debug(" - CPU burstability is a TIE")
}
}
}

// tighter memory offer wins
m1, m2 := o1.NodeMask(), o2.NodeMask()
if m1.Size() < m2.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node2.Name(), m2, m1)
return true
}
if m2.Size() < m1.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node1.Name(), m1, m2)
return false
}
if m2.Size() == m1.Size() {
log.Debug(" - memory offers are a TIE (%s vs. %s)", m1, m2)
}

// matching memory type wins
if reqType := request.MemoryType(); reqType != memoryUnspec && reqType != memoryPreserve {
if node1.HasMemoryType(reqType) && !node2.HasMemoryType(reqType) {
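(Aside, not part of the diff: a minimal sketch of the pool-ordering rule the new burstability branch implements, assuming b1 and b2 are the allocatable shared milli-CPU of two candidate pools and limit is the container's milli-CPU limit; preferFirst is a hypothetical helper returning +1 if the first pool wins, -1 if the second wins, and 0 for a tie that falls through to the remaining criteria.)

package main

import (
	"fmt"
	"math"
)

const unlimitedCPU = math.MaxInt

// preferFirst orders two pools by CPU burstability: for a finite limit, a
// pool that can cover the whole limit beats one that cannot; for an
// unlimited container, more shared headroom wins; everything else is a tie.
func preferFirst(b1, b2, limit int) int {
	if limit != unlimitedCPU {
		switch {
		case b1 >= limit && b2 < limit:
			return +1
		case b2 >= limit && b1 < limit:
			return -1
		}
		return 0
	}
	switch {
	case b1 > b2:
		return +1
	case b2 > b1:
		return -1
	}
	return 0
}

func main() {
	fmt.Println(preferFirst(6000, 3000, 5000))         // +1: only the first pool fits a 5000m limit
	fmt.Println(preferFirst(6000, 3000, unlimitedCPU)) // +1: more headroom wins for an unlimited container
	fmt.Println(preferFirst(3000, 4000, 2000))         // 0: both fit the limit, later criteria decide
}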
11 changes: 10 additions & 1 deletion cmd/plugins/topology-aware/policy/resources.go
@@ -107,6 +107,8 @@ type Request interface {
FullCPUs() int
// CPUFraction returns the amount of fractional milli-CPU requested.
CPUFraction() int
// CPULimit returns the CPU limit in milli-CPU.
CPULimit() int
// Isolate returns whether isolated CPUs are preferred for this request.
Isolate() bool
// MemoryType returns the type(s) of requested memory.
@@ -223,6 +225,7 @@ type request struct {
container cache.Container // container for this request
full int // number of full CPUs requested
fraction int // amount of fractional CPU requested
limit int // CPU limit, MaxInt for no limit
isolate bool // prefer isolated exclusive CPUs
cpuType cpuClass // preferred CPU type (normal, reserved)
prio cpuPrio // CPU priority preference, ignored for fraction requests
@@ -715,7 +718,7 @@ func prettyMem(value int64) string {
// newRequest creates a new request for the given container.
func newRequest(container cache.Container, types libmem.TypeMask) Request {
pod, _ := container.GetPod()
full, fraction, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
full, fraction, cpuLimit, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
req, lim, mtype := memoryAllocationPreference(pod, container)
coldStart := time.Duration(0)

@@ -752,6 +755,7 @@ func newRequest(container cache.Container, types libmem.TypeMask) Request {
container: container,
full: full,
fraction: fraction,
limit: cpuLimit,
isolate: isolate,
cpuType: cpuType,
memReq: req,
@@ -815,6 +819,11 @@ func (cr *request) CPUFraction() int {
return cr.fraction
}

// CPULimit returns the CPU limit in milli-CPU.
func (cr *request) CPULimit() int {
return cr.limit
}

// Isolate returns whether isolated CPUs are preferred for this request.
func (cr *request) Isolate() bool {
return cr.isolate
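(Aside, not part of the diff: an illustrative consumer-side check on the new accessor, assuming it lives in the same topologyaware package so Request and unlimitedCPU are in scope; fitsPool and freeSharedMilliCPU are hypothetical names.)

// fitsPool sketches how a scoring path can gate on the CPU limit: an
// unlimited Burstable request fits any pool (bigger pools are merely
// preferred), while a finite limit needs enough free shared capacity.
func fitsPool(req Request, freeSharedMilliCPU int) bool {
	limit := req.CPULimit()
	if limit == unlimitedCPU {
		return true
	}
	return freeSharedMilliCPU >= limit
}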
2 changes: 2 additions & 0 deletions test/e2e/files/burstable.yaml.in
@@ -31,7 +31,9 @@ spec:
cpu: ${CPUREQ}
memory: ${MEMREQ}
limits:
$( ( [ -n "$CPULIM" ] && [ "$CPULIM" != "0" ] ) && echo "
cpu: ${CPULIM}
")
$( ( [ -n "$MEMLIM" ] && [ "$MEMLIM" != "0" ] ) && echo "
memory: ${MEMLIM}
")
Original file line number Diff line number Diff line change
@@ -54,7 +54,7 @@ vm-command "kubectl delete pods --all --now"

# pod2: Test that 4 burstable containers not eligible for isolated/exclusive CPU allocation
# get evenly spread over NUMA nodes.
CONTCOUNT=4 CPUREQ=2 CPULIM=4 create burstable
CONTCOUNT=4 CPUREQ=2 CPULIM=2002m create burstable
report allowed
verify \
'disjoint_sets(cpus["pod2c0"], cpus["pod2c1"], cpus["pod2c2"], cpus["pod2c3"])' \
Original file line number Diff line number Diff line change
@@ -24,4 +24,27 @@ verify \
'len(nodes["pod2c0"]) == 2' \
'len(nodes["pod3c0"]) == 4'

vm-command "kubectl delete pods --all --now"

helm-terminate
helm_config=$(COLOCATE_PODS=false instantiate helm-config.yaml) helm-launch topology-aware

# Limited burstable containers get assigned to the lowest pool where
# there is enough free capacity for their limit. In this case, a socket.
CONTCOUNT=2 CPUREQ=2 CPULIM=5 MEMREQ=100M create burstable
report allowed
verify \
'nodes["pod4c0"] == { "node2" ,"node3" }' \
'nodes["pod4c1"] == { "node0", "node1" }'

# Unlimited burstable containers get allocated to the root pool unless
# affinity dictates otherwise.
CONTCOUNT=2 CPUREQ=2 CPULIM=0 MEMREQ=100M create burstable
report allowed
verify \
'nodes["pod5c0"] == { "node0", "node1", "node2", "node3" }' \
'nodes["pod5c1"] == { "node0", "node1", "node2", "node3" }'

vm-command "kubectl delete pods --all --now"

helm-terminate