Skip to content

Commit 71509c8

Browse files
Kyrie336Lei Guo
andauthored
optimize schedule failure event (#1444)
1. Delete the duplicate event type definitions and use the common package instead. 2. Support vendor devices reporting custom events. Signed-off-by: Lei Guo <[email protected]> Co-authored-by: Lei Guo <[email protected]>
1 parent 97c0b17 commit 71509c8

File tree

4 files changed

+90
-62
lines changed

4 files changed

+90
-62
lines changed

pkg/device/common/common.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,20 @@ func GenReason(reasons map[string]int, cards int) string {
4646
}
4747
return strings.Join(reason, ", ")
4848
}
49+
50+
// ParseReason parses a comma-separated failure summary such as
// "3/8 CardInsufficientMemory, 2/8 CardInsufficientCore" into a map from
// reason name to its leading count (the number before the slash). This looks
// like the format produced by GenReason; confirm against that function.
//
// Entries that do not match "<count>/<total> <name>" are skipped on purpose,
// so a free-form reason yields no entry rather than an error. The returned map
// is never nil. If a name occurs more than once, the last count wins.
func ParseReason(reason string) map[string]int {
	// Split on the bare comma and trim each entry so that both "a, b" and
	// "a,b" are accepted; splitting on ", " would fold "a,b" into one entry.
	entries := strings.Split(reason, ",")
	reasonMap := make(map[string]int, len(entries))

	for _, entry := range entries {
		cnt, total, key := 0, 0, ""
		// total is scanned only to validate the entry's shape; it is not returned.
		if _, err := fmt.Sscanf(strings.TrimSpace(entry), "%d/%d %s", &cnt, &total, &key); err != nil {
			continue
		}

		reasonMap[key] = cnt
	}

	return reasonMap
}

pkg/device/common/common_test.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/*
2+
Copyright 2025 The HAMi Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package common
18+
19+
import (
20+
"reflect"
21+
"testing"
22+
)
23+
24+
func TestParseReason(t *testing.T) {
25+
for _, ts := range []struct {
26+
name string
27+
reason string
28+
29+
expectedReasonMap map[string]int
30+
}{
31+
{
32+
name: "base test",
33+
reason: "3/8 CardInsufficientMemory, 2/8 CardInsufficientCore, 3/8 CardNotHealth",
34+
35+
expectedReasonMap: map[string]int{
36+
"CardInsufficientMemory": 3,
37+
"CardInsufficientCore": 2,
38+
"CardNotHealth": 3,
39+
},
40+
},
41+
} {
42+
t.Run(ts.name, func(t *testing.T) {
43+
result := ParseReason(ts.reason)
44+
if !reflect.DeepEqual(result, ts.expectedReasonMap) {
45+
t.Errorf("ParseReason failed: result %v, expected %v",
46+
result, ts.expectedReasonMap)
47+
}
48+
})
49+
}
50+
}

pkg/scheduler/score.go

Lines changed: 8 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"k8s.io/klog/v2"
2727

2828
"github.com/Project-HAMi/HAMi/pkg/device"
29+
"github.com/Project-HAMi/HAMi/pkg/device/common"
2930
"github.com/Project-HAMi/HAMi/pkg/scheduler/config"
3031
"github.com/Project-HAMi/HAMi/pkg/scheduler/policy"
3132
)
@@ -37,36 +38,6 @@ func viewStatus(usage NodeUsage) {
3738
}
3839
}
3940

40-
const (
41-
cardTypeMismatch = "CardTypeMismatch"
42-
cardUUIDMismatch = "CardUuidMismatch"
43-
cardTimeSlicingExhausted = "CardTimeSlicingExhausted"
44-
cardComputeUnitsExhausted = "CardComputeUnitsExhausted"
45-
cardInsufficientMemory = "CardInsufficientMemory"
46-
cardInsufficientCore = "CardInsufficientCore"
47-
numaNotFit = "NumaNotFit"
48-
exclusiveDeviceAllocateConflict = "ExclusiveDeviceAllocateConflict"
49-
cardNotFoundCustomFilterRule = "CardNotFoundCustomFilterRule"
50-
nodeInsufficientDevice = "NodeInsufficientDevice"
51-
allocatedCardsInsufficientRequest = "AllocatedCardsInsufficientRequest"
52-
nodeUnfitPod = "NodeUnfitPod"
53-
nodeFitPod = "NodeFitPod"
54-
)
55-
56-
var scheduleFailureReasons = []string{
57-
cardTypeMismatch,
58-
cardUUIDMismatch,
59-
cardTimeSlicingExhausted,
60-
cardComputeUnitsExhausted,
61-
cardInsufficientMemory,
62-
cardInsufficientCore,
63-
numaNotFit,
64-
exclusiveDeviceAllocateConflict,
65-
cardNotFoundCustomFilterRule,
66-
nodeInsufficientDevice,
67-
allocatedCardsInsufficientRequest,
68-
}
69-
7041
func getNodeResources(list NodeUsage, t string) []*device.DeviceUsage {
7142
l := []*device.DeviceUsage{}
7243
for _, val := range list.Devices.DeviceLists {
@@ -91,16 +62,15 @@ func fitInDevices(node *NodeUsage, requests device.ContainerDeviceRequests, pod
9162
for _, k := range requests {
9263
sums += int(k.Nums)
9364
if int(k.Nums) > len(node.Devices.DeviceLists) {
94-
klog.V(5).InfoS(nodeInsufficientDevice, "pod", klog.KObj(pod), "request devices nums", k.Nums, "node device nums", len(node.Devices.DeviceLists))
95-
return false, nodeInsufficientDevice
65+
klog.V(5).InfoS(common.NodeInsufficientDevice, "pod", klog.KObj(pod), "request devices nums", k.Nums, "node device nums", len(node.Devices.DeviceLists))
66+
return false, common.NodeInsufficientDevice
9667
}
9768
sort.Sort(node.Devices)
9869
_, ok := device.GetDevices()[k.Type]
9970
if !ok {
10071
return false, "Device type not found"
10172
}
102-
fit, tmpDevs, devreason := device.GetDevices()[k.Type].Fit(getNodeResources(*node, k.Type), k, pod, nodeInfo, devinput)
103-
reason := "node:" + node.Node.Name + " " + "resaon:" + devreason
73+
fit, tmpDevs, reason := device.GetDevices()[k.Type].Fit(getNodeResources(*node, k.Type), k, pod, nodeInfo, devinput)
10474
if fit {
10575
for idx, val := range tmpDevs[k.Type] {
10676
for nidx, v := range node.Devices.DeviceLists {
@@ -193,10 +163,10 @@ func (s *Scheduler) calcScore(nodes *map[string]*NodeUsage, resourceReqs device.
193163
}
194164
ctrfit = fit
195165
if !fit {
196-
klog.V(4).InfoS(nodeUnfitPod, "pod", klog.KObj(task), "node", nodeID, "reason", reason)
166+
klog.V(4).InfoS(common.NodeUnfitPod, "pod", klog.KObj(task), "node", nodeID, "reason", reason)
197167
failedNodesMutex.Lock()
198-
failedNodes[nodeID] = nodeUnfitPod
199-
for _, reasonType := range parseNodeReason(reason) {
168+
failedNodes[nodeID] = common.NodeUnfitPod
169+
for reasonType := range common.ParseReason(reason) {
200170
failureReason[reasonType] = append(failureReason[reasonType], nodeID)
201171
}
202172
failedNodesMutex.Unlock()
@@ -209,7 +179,7 @@ func (s *Scheduler) calcScore(nodes *map[string]*NodeUsage, resourceReqs device.
209179
res.NodeList = append(res.NodeList, &score)
210180
fitNodesMutex.Unlock()
211181
score.OverrideScore(snapshot, userNodePolicy)
212-
klog.V(4).InfoS(nodeFitPod, "pod", klog.KObj(task), "node", nodeID, "score", score.Score)
182+
klog.V(4).InfoS(common.NodeFitPod, "pod", klog.KObj(task), "node", nodeID, "score", score.Score)
213183
}
214184
}(nodeID, node)
215185
}
@@ -231,13 +201,3 @@ func (s *Scheduler) calcScore(nodes *map[string]*NodeUsage, resourceReqs device.
231201
}
232202
return &res, utilerrors.NewAggregate(errorsSlice)
233203
}
234-
235-
func parseNodeReason(nodeReason string) []string {
236-
var res []string
237-
for _, reason := range scheduleFailureReasons {
238-
if strings.Contains(nodeReason, reason) {
239-
res = append(res, reason)
240-
}
241-
}
242-
return res
243-
}

pkg/scheduler/score_test.go

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"k8s.io/klog/v2"
2929

3030
"github.com/Project-HAMi/HAMi/pkg/device"
31+
"github.com/Project-HAMi/HAMi/pkg/device/common"
3132
"github.com/Project-HAMi/HAMi/pkg/device/hygon"
3233
"github.com/Project-HAMi/HAMi/pkg/device/kunlun"
3334
"github.com/Project-HAMi/HAMi/pkg/device/metax"
@@ -1573,7 +1574,7 @@ func Test_calcScore(t *testing.T) {
15731574
NodeList: []*policy.NodeScore{},
15741575
},
15751576
failedNodes: map[string]string{
1576-
"node1": nodeUnfitPod,
1577+
"node1": common.NodeUnfitPod,
15771578
},
15781579
err: nil,
15791580
},
@@ -1679,8 +1680,8 @@ func Test_calcScore(t *testing.T) {
16791680
NodeList: []*policy.NodeScore{},
16801681
},
16811682
failedNodes: map[string]string{
1682-
"node1": nodeUnfitPod,
1683-
"node2": nodeUnfitPod,
1683+
"node1": common.NodeUnfitPod,
1684+
"node2": common.NodeUnfitPod,
16841685
},
16851686
err: nil,
16861687
},
@@ -2762,7 +2763,7 @@ func Test_fitInCertainDevice(t *testing.T) {
27622763
},
27632764
want1: false,
27642765
want2: map[string]device.ContainerDevices{},
2765-
want3: map[string]int{cardTypeMismatch: 1},
2766+
want3: map[string]int{common.CardTypeMismatch: 1},
27662767
},
27672768
{
27682769
name: "device count less than device used",
@@ -2805,7 +2806,7 @@ func Test_fitInCertainDevice(t *testing.T) {
28052806
},
28062807
want1: false,
28072808
want2: map[string]device.ContainerDevices{},
2808-
want3: map[string]int{cardTimeSlicingExhausted: 1},
2809+
want3: map[string]int{common.CardTimeSlicingExhausted: 1},
28092810
},
28102811
{
28112812
name: "core limit exceed 100",
@@ -2848,7 +2849,7 @@ func Test_fitInCertainDevice(t *testing.T) {
28482849
},
28492850
want1: false,
28502851
want2: map[string]device.ContainerDevices{},
2851-
want3: map[string]int{cardInsufficientCore: 1},
2852+
want3: map[string]int{common.CardInsufficientCore: 1},
28522853
},
28532854
{
28542855
name: "card insufficient remaining memory",
@@ -2891,7 +2892,7 @@ func Test_fitInCertainDevice(t *testing.T) {
28912892
},
28922893
want1: false,
28932894
want2: map[string]device.ContainerDevices{},
2894-
want3: map[string]int{cardInsufficientMemory: 1},
2895+
want3: map[string]int{common.CardInsufficientMemory: 1},
28952896
},
28962897
{
28972898
name: "the container wants exclusive access to an entire card, but the card is already in use",
@@ -2934,7 +2935,7 @@ func Test_fitInCertainDevice(t *testing.T) {
29342935
},
29352936
want1: false,
29362937
want2: map[string]device.ContainerDevices{},
2937-
want3: map[string]int{exclusiveDeviceAllocateConflict: 1},
2938+
want3: map[string]int{common.ExclusiveDeviceAllocateConflict: 1},
29382939
},
29392940
{
29402941
name: "can't allocate core=0 job to an already full GPU",
@@ -2977,7 +2978,7 @@ func Test_fitInCertainDevice(t *testing.T) {
29772978
},
29782979
want1: false,
29792980
want2: map[string]device.ContainerDevices{},
2980-
want3: map[string]int{cardComputeUnitsExhausted: 1},
2981+
want3: map[string]int{common.CardComputeUnitsExhausted: 1},
29812982
},
29822983
{
29832984
name: "mode is mig",
@@ -3040,7 +3041,7 @@ func Test_fitInCertainDevice(t *testing.T) {
30403041
},
30413042
},
30423043
},
3043-
want3: map[string]int{cardNotFoundCustomFilterRule: 1, allocatedCardsInsufficientRequest: 1},
3044+
want3: map[string]int{common.CardNotFoundCustomFilterRule: 1, common.AllocatedCardsInsufficientRequest: 1},
30443045
},
30453046
{
30463047
name: "card uuid don't match",
@@ -3089,7 +3090,7 @@ func Test_fitInCertainDevice(t *testing.T) {
30893090
},
30903091
want1: false,
30913092
want2: map[string]device.ContainerDevices{},
3092-
want3: map[string]int{cardUUIDMismatch: 1},
3093+
want3: map[string]int{common.CardUUIDMismatch: 1},
30933094
},
30943095
{
30953096
name: "numa not fit",
@@ -3136,7 +3137,7 @@ func Test_fitInCertainDevice(t *testing.T) {
31363137
},
31373138
},
31383139
},
3139-
want3: map[string]int{numaNotFit: 1, allocatedCardsInsufficientRequest: 1},
3140+
want3: map[string]int{common.NumaNotFit: 1, common.AllocatedCardsInsufficientRequest: 1},
31403141
},
31413142
{
31423143
name: "test device kind of not fit reason",
@@ -3186,8 +3187,8 @@ func Test_fitInCertainDevice(t *testing.T) {
31863187
},
31873188
want1: false,
31883189
want2: map[string]device.ContainerDevices{},
3189-
want3: map[string]int{cardUUIDMismatch: 3, cardTimeSlicingExhausted: 4,
3190-
cardInsufficientMemory: 2, cardInsufficientCore: 1},
3190+
want3: map[string]int{common.CardUUIDMismatch: 3, common.CardTimeSlicingExhausted: 4,
3191+
common.CardInsufficientMemory: 2, common.CardInsufficientCore: 1},
31913192
},
31923193
}
31933194
for _, test := range tests {

0 commit comments

Comments
 (0)