@@ -22,8 +22,8 @@ import (
2222 "github.com/palantir/k8s-spark-scheduler-lib/pkg/resources"
2323)
2424
25- // MinimalFragmentation is a SparkBinPackFunction that tries to put the driver pod on the first possible node with
26- // enough capacity, and then tries to pack executors onto as few nodes as possible .
25+ // MinimalFragmentation is a SparkBinPackFunction that tries to minimize spark app fragmentation across the cluster.
26+ // see minimalFragmentation for more details .
2727var MinimalFragmentation = SparkBinPackFunction (func (
2828 ctx context.Context ,
2929 driverResources , executorResources * resources.Resources ,
@@ -33,66 +33,104 @@ var MinimalFragmentation = SparkBinPackFunction(func(
 	return SparkBinPack(ctx, driverResources, executorResources, executorCount, driverNodePriorityOrder, executorNodePriorityOrder, nodesSchedulingMetadata, minimalFragmentation)
 })
 
-// minimalFragmentation attempts to pack executors onto as few nodes as possible, ideally a single one
+// minimalFragmentation attempts to pack executors onto as few nodes as possible, ideally a single one.
 // nodePriorityOrder is still used as a guideline, i.e. if an application can fit on multiple nodes, it will pick
-// the first eligible node according to nodePriorityOrder
+// the first eligible node according to nodePriorityOrder. additionally, minimalFragmentation will attempt to avoid
+// mostly empty nodes unless those are required for scheduling or they provide a perfect fit; see a couple of examples below.
 //
-// for instance if nodePriorityOrder = [a, b, c, d, e]
-// and we can fit 1 executor on a, 1 executor on b, 3 executors on c, 5 executors on d, 5 executors on e
+// 'mostly' empty nodes are currently defined as the ones having capacity >= (executor count + max capacity) / 2
+//
+// for instance if nodePriorityOrder = [a, b, c, d, e, f]
+// and we can fit 1 executor on a, 1 executor on b, 3 executors on c, 5 executors on d, 5 executors on e, 17 executors on f
 // and executorCount = 11, then we will return:
 // [d, d, d, d, d, e, e, e, e, e, a], true
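+// (here maxCapacity = 17, so targetCapacity = (11 + 17) / 2 = 14; only f has capacity >= 14, so it is
+// treated as 'mostly' empty and skipped, because the request fits on the remaining nodes)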
 //
 // if instead we have executorCount = 6, then we will return:
 // [d, d, d, d, d, a], true
+//
+// if instead we have executorCount = 15, then we will return:
+// [d, d, d, d, d, e, e, e, e, e, c, c, c, a, b], true
+//
+// if instead we have executorCount = 17, then we will return:
+// [f, f, ..., f], true
+//
+// if instead we have executorCount = 19, then we will return:
+// [f, f, ..., f, c, c], true
 func minimalFragmentation(
 	_ context.Context,
 	executorResources *resources.Resources,
 	executorCount int,
 	nodePriorityOrder []string,
 	nodeGroupSchedulingMetadata resources.NodeGroupSchedulingMetadata,
 	reservedResources resources.NodeGroupResources) ([]string, bool) {
-	executorNodes := make([]string, 0, executorCount)
 	if executorCount == 0 {
-		return executorNodes, true
+		return []string{}, true
 	}
 
 	nodeCapacities := capacity.GetNodeCapacities(nodePriorityOrder, nodeGroupSchedulingMetadata, reservedResources, executorResources)
 	nodeCapacities = capacity.FilterOutNodesWithoutCapacity(nodeCapacities)
+	if len(nodeCapacities) == 0 {
+		return nil, false
+	}
+
 	sort.SliceStable(nodeCapacities, func(i, j int) bool {
 		return nodeCapacities[i].Capacity < nodeCapacities[j].Capacity
 	})
+	maxCapacity := nodeCapacities[len(nodeCapacities)-1].Capacity
+	if executorCount < maxCapacity {
+		targetCapacity := (executorCount + maxCapacity) / 2
+		firstNodeWithAtLeastTargetCapacity := sort.Search(len(nodeCapacities), func(i int) bool {
+			return nodeCapacities[i].Capacity >= targetCapacity
+		})
+
+		// try scheduling on a subset of nodes that excludes the 'emptiest' nodes
+		if executorNodes, ok := internalMinimalFragmentation(executorCount, nodeCapacities[:firstNodeWithAtLeastTargetCapacity]); ok {
+			return executorNodes, ok
+		}
+	}
+
+	// fall back to using empty nodes
+	return internalMinimalFragmentation(executorCount, nodeCapacities)
+}
+
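+// internalMinimalFragmentation greedily assigns executorCount executors to the given candidate nodes,
+// preferring a single node that can fit all of them and otherwise filling the highest-capacity nodes first.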
+func internalMinimalFragmentation(
+	executorCount int,
+	nodeCapacities []capacity.NodeAndExecutorCapacity) ([]string, bool) {
+	nodeCapacitiesCopy := make([]capacity.NodeAndExecutorCapacity, 0, len(nodeCapacities))
+	nodeCapacitiesCopy = append(nodeCapacitiesCopy, nodeCapacities...)
+	executorNodes := make([]string, 0, executorCount)
 
 	// as long as we have nodes where we could schedule executors
-	for len(nodeCapacities) > 0 {
+	for len(nodeCapacitiesCopy) > 0 {
 		// pick the first node that could fit all the executors (if there's one)
-		position := sort.Search(len(nodeCapacities), func(i int) bool {
-			return nodeCapacities[i].Capacity >= executorCount
+		position := sort.Search(len(nodeCapacitiesCopy), func(i int) bool {
+			return nodeCapacitiesCopy[i].Capacity >= executorCount
 		})
 
-		if position != len(nodeCapacities) {
+		if position != len(nodeCapacitiesCopy) {
 			// we found a node that has the required capacity, schedule everything there and we're done
-			return append(executorNodes, repeat(nodeCapacities[position].NodeName, executorCount)...), true
+			return append(executorNodes, repeat(nodeCapacitiesCopy[position].NodeName, executorCount)...), true
 		}
 
 		// we will need multiple nodes for scheduling, thus we'll try to schedule executors on nodes with the most capacity
-		maxCapacity := nodeCapacities[len(nodeCapacities)-1].Capacity
-		firstNodeWithMaxCapacityIdx := sort.Search(len(nodeCapacities), func(i int) bool {
-			return nodeCapacities[i].Capacity >= maxCapacity
+		maxCapacity := nodeCapacitiesCopy[len(nodeCapacitiesCopy)-1].Capacity
+		firstNodeWithMaxCapacityIdx := sort.Search(len(nodeCapacitiesCopy), func(i int) bool {
+			return nodeCapacitiesCopy[i].Capacity >= maxCapacity
 		})
 
 		// the loop will exit because maxCapacity is always > 0
 		currentPos := firstNodeWithMaxCapacityIdx
-		for ; executorCount >= maxCapacity && currentPos < len(nodeCapacities); currentPos++ {
+		for ; executorCount >= maxCapacity && currentPos < len(nodeCapacitiesCopy); currentPos++ {
 			// we can skip the check on firstNodeWithMaxCapacityIdx since we know at least one node will be found
-			executorNodes = append(executorNodes, repeat(nodeCapacities[currentPos].NodeName, maxCapacity)...)
+			executorNodes = append(executorNodes, repeat(nodeCapacitiesCopy[currentPos].NodeName, maxCapacity)...)
 			executorCount -= maxCapacity
 		}
 
 		if executorCount == 0 {
 			return executorNodes, true
 		}
 
-		nodeCapacities = append(nodeCapacities[:firstNodeWithMaxCapacityIdx], nodeCapacities[currentPos:]...)
+		nodeCapacitiesCopy = append(nodeCapacitiesCopy[:firstNodeWithMaxCapacityIdx], nodeCapacitiesCopy[currentPos:]...)
 	}
 
 	return nil, false
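For reference, the heuristic above can be exercised end to end with a small standalone sketch. The snippet below re-implements the same two-phase logic on plain integer capacities and reproduces the examples from the doc comment; `nodeCap`, `pack`, `packGreedy`, and `repeat` are names invented for this illustration and are not part of the library's API.

```go
package main

import (
	"fmt"
	"sort"
)

// nodeCap is a stand-in for capacity.NodeAndExecutorCapacity: a node name plus how many executors fit on it.
type nodeCap struct {
	name     string
	capacity int
}

// repeat returns n copies of the node name, mirroring the repeat helper used by the real code.
func repeat(name string, n int) []string {
	out := make([]string, n)
	for i := range out {
		out[i] = name
	}
	return out
}

// pack mirrors minimalFragmentation: sort nodes by capacity and, when the request is smaller than the
// largest node, treat nodes with capacity >= (count + maxCapacity) / 2 as 'mostly' empty and only use
// them as a fallback.
func pack(count int, nodes []nodeCap) ([]string, bool) {
	if count == 0 {
		return []string{}, true
	}
	if len(nodes) == 0 {
		return nil, false
	}
	sorted := append([]nodeCap(nil), nodes...)
	sort.SliceStable(sorted, func(i, j int) bool { return sorted[i].capacity < sorted[j].capacity })
	maxCapacity := sorted[len(sorted)-1].capacity
	if count < maxCapacity {
		target := (count + maxCapacity) / 2
		cutoff := sort.Search(len(sorted), func(i int) bool { return sorted[i].capacity >= target })
		if placed, ok := packGreedy(count, sorted[:cutoff]); ok {
			return placed, ok
		}
	}
	return packGreedy(count, sorted)
}

// packGreedy mirrors internalMinimalFragmentation: use a single node when one fits everything,
// otherwise fill the highest-capacity nodes and retry with whatever is left.
func packGreedy(count int, nodes []nodeCap) ([]string, bool) {
	caps := append([]nodeCap(nil), nodes...)
	placed := make([]string, 0, count)
	for len(caps) > 0 {
		pos := sort.Search(len(caps), func(i int) bool { return caps[i].capacity >= count })
		if pos != len(caps) {
			return append(placed, repeat(caps[pos].name, count)...), true
		}
		maxCapacity := caps[len(caps)-1].capacity
		first := sort.Search(len(caps), func(i int) bool { return caps[i].capacity >= maxCapacity })
		cur := first
		for ; count >= maxCapacity && cur < len(caps); cur++ {
			placed = append(placed, repeat(caps[cur].name, maxCapacity)...)
			count -= maxCapacity
		}
		if count == 0 {
			return placed, true
		}
		caps = append(caps[:first], caps[cur:]...)
	}
	return nil, false
}

func main() {
	// the node capacities from the doc comment: a=1, b=1, c=3, d=5, e=5, f=17
	nodes := []nodeCap{{"a", 1}, {"b", 1}, {"c", 3}, {"d", 5}, {"e", 5}, {"f", 17}}
	for _, count := range []int{11, 6, 15, 17, 19} {
		placed, ok := pack(count, nodes)
		fmt.Println(count, ok, placed)
	}
}
```

Running it prints each example's placement, e.g. `11 true [d d d d d e e e e e a]`; dropping the `count < maxCapacity` branch would instead schedule the 11-executor request entirely on f, which is the kind of fragmentation of mostly empty nodes this change is trying to avoid.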