@@ -22,8 +22,8 @@ import (
2222 "github.com/palantir/k8s-spark-scheduler-lib/pkg/resources"
2323)
2424
25- // MinimalFragmentation is a SparkBinPackFunction that tries to put the driver pod on the first possible node with
26- // enough capacity, and then tries to pack executors onto as few nodes as possible .
25+ // MinimalFragmentation is a SparkBinPackFunction that tries to minimize spark app fragmentation across the cluster.
26+ // see minimalFragmentation for more details .
2727var MinimalFragmentation = SparkBinPackFunction (func (
2828 ctx context.Context ,
2929 driverResources , executorResources * resources.Resources ,
@@ -33,66 +33,104 @@ var MinimalFragmentation = SparkBinPackFunction(func(
 	return SparkBinPack(ctx, driverResources, executorResources, executorCount, driverNodePriorityOrder, executorNodePriorityOrder, nodesSchedulingMetadata, minimalFragmentation)
 })
 
-// minimalFragmentation attempts to pack executors onto as few nodes as possible, ideally a single one
+// minimalFragmentation attempts to pack executors onto as few nodes as possible, ideally a single one.
 // nodePriorityOrder is still used as a guideline, i.e. if an application can fit on multiple nodes, it will pick
-// the first eligible node according to nodePriorityOrder
+// the first eligible node according to nodePriorityOrder. additionally, minimalFragmentation will attempt to avoid
+// mostly empty nodes unless those are required for scheduling or they provide a perfect fit; see a couple of examples below.
 //
-// for instance if nodePriorityOrder = [a, b, c, d, e]
-// and we can fit 1 executor on a, 1 executor on b, 3 executors on c, 5 executors on d, 5 executors on e
+// 'mostly' empty nodes are currently defined as the ones having capacity >= (executor count + max capacity) / 2
+//
+// for instance if nodePriorityOrder = [a, b, c, d, e, f]
+// and we can fit 1 executor on a, 1 executor on b, 3 executors on c, 5 executors on d, 5 executors on e, 17 executors on f
 // and executorCount = 11, then we will return:
 // [d, d, d, d, d, e, e, e, e, e, a], true
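+// (here maxCapacity = 17, so targetCapacity = (11 + 17) / 2 = 14; only f has capacity >= 14, so it is
+// treated as 'mostly' empty and skipped, because the request fits on the remaining nodes)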
 //
 // if instead we have executorCount = 6, then we will return:
 // [d, d, d, d, d, a], true
+//
+// if instead we have executorCount = 15, then we will return:
+// [d, d, d, d, d, e, e, e, e, e, c, c, c, a, b], true
+//
+// if instead we have executorCount = 17, then we will return:
+// [f, f, ..., f], true
+//
+// if instead we have executorCount = 19, then we will return:
+// [f, f, ..., f, c, c], true
 func minimalFragmentation(
 	_ context.Context,
 	executorResources *resources.Resources,
 	executorCount int,
 	nodePriorityOrder []string,
 	nodeGroupSchedulingMetadata resources.NodeGroupSchedulingMetadata,
 	reservedResources resources.NodeGroupResources) ([]string, bool) {
-	executorNodes := make([]string, 0, executorCount)
 	if executorCount == 0 {
-		return executorNodes, true
+		return []string{}, true
 	}
 
 	nodeCapacities := capacity.GetNodeCapacities(nodePriorityOrder, nodeGroupSchedulingMetadata, reservedResources, executorResources)
 	nodeCapacities = capacity.FilterOutNodesWithoutCapacity(nodeCapacities)
+	if len(nodeCapacities) == 0 {
+		return nil, false
+	}
+
 	sort.SliceStable(nodeCapacities, func(i, j int) bool {
 		return nodeCapacities[i].Capacity < nodeCapacities[j].Capacity
 	})
+	maxCapacity := nodeCapacities[len(nodeCapacities)-1].Capacity
+	if executorCount < maxCapacity {
+		targetCapacity := (executorCount + maxCapacity) / 2
+		firstNodeWithAtLeastTargetCapacity := sort.Search(len(nodeCapacities), func(i int) bool {
+			return nodeCapacities[i].Capacity >= targetCapacity
+		})
+
+		// try scheduling on a subset of nodes that excludes the 'emptiest' nodes
+		if executorNodes, ok := internalMinimalFragmentation(executorCount, nodeCapacities[:firstNodeWithAtLeastTargetCapacity]); ok {
+			return executorNodes, ok
+		}
+	}
+
+	// fall back to using empty nodes
+	return internalMinimalFragmentation(executorCount, nodeCapacities)
+}
+
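+// internalMinimalFragmentation greedily assigns executorCount executors to the given candidate nodes,
+// preferring a single node that can fit all of them and otherwise filling the highest-capacity nodes first.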
+func internalMinimalFragmentation(
+	executorCount int,
+	nodeCapacities []capacity.NodeAndExecutorCapacity) ([]string, bool) {
+	nodeCapacitiesCopy := make([]capacity.NodeAndExecutorCapacity, 0, len(nodeCapacities))
+	nodeCapacitiesCopy = append(nodeCapacitiesCopy, nodeCapacities...)
+	executorNodes := make([]string, 0, executorCount)
 
 	// as long as we have nodes where we could schedule executors
-	for len(nodeCapacities) > 0 {
+	for len(nodeCapacitiesCopy) > 0 {
 		// pick the first node that could fit all the executors (if there's one)
-		position := sort.Search(len(nodeCapacities), func(i int) bool {
-			return nodeCapacities[i].Capacity >= executorCount
+		position := sort.Search(len(nodeCapacitiesCopy), func(i int) bool {
+			return nodeCapacitiesCopy[i].Capacity >= executorCount
 		})
 
-		if position != len(nodeCapacities) {
+		if position != len(nodeCapacitiesCopy) {
 			// we found a node that has the required capacity, schedule everything there and we're done
-			return append(executorNodes, repeat(nodeCapacities[position].NodeName, executorCount)...), true
+			return append(executorNodes, repeat(nodeCapacitiesCopy[position].NodeName, executorCount)...), true
 		}
 
 		// we will need multiple nodes for scheduling, thus we'll try to schedule executors on nodes with the most capacity
-		maxCapacity := nodeCapacities[len(nodeCapacities)-1].Capacity
-		firstNodeWithMaxCapacityIdx := sort.Search(len(nodeCapacities), func(i int) bool {
-			return nodeCapacities[i].Capacity >= maxCapacity
+		maxCapacity := nodeCapacitiesCopy[len(nodeCapacitiesCopy)-1].Capacity
+		firstNodeWithMaxCapacityIdx := sort.Search(len(nodeCapacitiesCopy), func(i int) bool {
+			return nodeCapacitiesCopy[i].Capacity >= maxCapacity
 		})
 
 		// the loop will exit because maxCapacity is always > 0
 		currentPos := firstNodeWithMaxCapacityIdx
-		for ; executorCount >= maxCapacity && currentPos < len(nodeCapacities); currentPos++ {
+		for ; executorCount >= maxCapacity && currentPos < len(nodeCapacitiesCopy); currentPos++ {
 			// we can skip the check on firstNodeWithMaxCapacityIdx since we know at least one node will be found
-			executorNodes = append(executorNodes, repeat(nodeCapacities[currentPos].NodeName, maxCapacity)...)
+			executorNodes = append(executorNodes, repeat(nodeCapacitiesCopy[currentPos].NodeName, maxCapacity)...)
 			executorCount -= maxCapacity
 		}
 
 		if executorCount == 0 {
 			return executorNodes, true
 		}
 
-		nodeCapacities = append(nodeCapacities[:firstNodeWithMaxCapacityIdx], nodeCapacities[currentPos:]...)
+		nodeCapacitiesCopy = append(nodeCapacitiesCopy[:firstNodeWithMaxCapacityIdx], nodeCapacitiesCopy[currentPos:]...)
 	}
 
 	return nil, false
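For reference, the heuristic above can be exercised end to end with a small standalone sketch. The snippet below re-implements the same two-phase logic on plain integer capacities and reproduces the examples from the doc comment; `nodeCap`, `pack`, `packGreedy`, and `repeat` are names invented for this illustration and are not part of the library's API.

```go
package main

import (
	"fmt"
	"sort"
)

// nodeCap is a stand-in for capacity.NodeAndExecutorCapacity: a node name plus how many executors fit on it.
type nodeCap struct {
	name     string
	capacity int
}

// repeat returns n copies of the node name, mirroring the repeat helper used by the real code.
func repeat(name string, n int) []string {
	out := make([]string, n)
	for i := range out {
		out[i] = name
	}
	return out
}

// pack mirrors minimalFragmentation: sort nodes by capacity and, when the request is smaller than the
// largest node, treat nodes with capacity >= (count + maxCapacity) / 2 as 'mostly' empty and only use
// them as a fallback.
func pack(count int, nodes []nodeCap) ([]string, bool) {
	if count == 0 {
		return []string{}, true
	}
	if len(nodes) == 0 {
		return nil, false
	}
	sorted := append([]nodeCap(nil), nodes...)
	sort.SliceStable(sorted, func(i, j int) bool { return sorted[i].capacity < sorted[j].capacity })
	maxCapacity := sorted[len(sorted)-1].capacity
	if count < maxCapacity {
		target := (count + maxCapacity) / 2
		cutoff := sort.Search(len(sorted), func(i int) bool { return sorted[i].capacity >= target })
		if placed, ok := packGreedy(count, sorted[:cutoff]); ok {
			return placed, ok
		}
	}
	return packGreedy(count, sorted)
}

// packGreedy mirrors internalMinimalFragmentation: use a single node when one fits everything,
// otherwise fill the highest-capacity nodes and retry with whatever is left.
func packGreedy(count int, nodes []nodeCap) ([]string, bool) {
	caps := append([]nodeCap(nil), nodes...)
	placed := make([]string, 0, count)
	for len(caps) > 0 {
		pos := sort.Search(len(caps), func(i int) bool { return caps[i].capacity >= count })
		if pos != len(caps) {
			return append(placed, repeat(caps[pos].name, count)...), true
		}
		maxCapacity := caps[len(caps)-1].capacity
		first := sort.Search(len(caps), func(i int) bool { return caps[i].capacity >= maxCapacity })
		cur := first
		for ; count >= maxCapacity && cur < len(caps); cur++ {
			placed = append(placed, repeat(caps[cur].name, maxCapacity)...)
			count -= maxCapacity
		}
		if count == 0 {
			return placed, true
		}
		caps = append(caps[:first], caps[cur:]...)
	}
	return nil, false
}

func main() {
	// the node capacities from the doc comment: a=1, b=1, c=3, d=5, e=5, f=17
	nodes := []nodeCap{{"a", 1}, {"b", 1}, {"c", 3}, {"d", 5}, {"e", 5}, {"f", 17}}
	for _, count := range []int{11, 6, 15, 17, 19} {
		placed, ok := pack(count, nodes)
		fmt.Println(count, ok, placed)
	}
}
```

Running it prints each example's placement, e.g. `11 true [d d d d d e e e e e a]`; dropping the `count < maxCapacity` branch would instead schedule the 11-executor request entirely on f, which is the kind of fragmentation of mostly empty nodes this change is trying to avoid.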