Skip to content

Commit 4fbe7be

Browse files
committed
chore: review comment changes
1 parent bdcf2e4 commit 4fbe7be

File tree

12 files changed

+87
-111
lines changed

12 files changed

+87
-111
lines changed

distros/kubernetes/nvsentinel/charts/health-events-analyzer/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ config: |
5151
errorCount = 5
5252
5353
[[rules]]
54-
name = "RepeatedXid13"
54+
name = "RepeatedXidError"
5555
description = "Detect occurrence of fatal XIDs 5 times within 24 hours"
5656
time_window = "24h"
5757
recommended_action = "REPORT_ISSUE"

health-events-analyzer/go.mod

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ require (
1616
google.golang.org/grpc v1.76.0
1717
google.golang.org/protobuf v1.36.10
1818
k8s.io/apimachinery v0.34.1
19-
k8s.io/klog v1.0.0
2019
)
2120

2221
require (

health-events-analyzer/go.sum

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bF
1111
github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
1212
github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM=
1313
github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ=
14-
github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas=
1514
github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
1615
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
1716
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
@@ -187,8 +186,6 @@ k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4=
187186
k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
188187
k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY=
189188
k8s.io/client-go v0.34.1/go.mod h1:kA8v0FP+tk6sZA0yKLRG67LWjqufAoSHA2xVGKw9Of8=
190-
k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8=
191-
k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I=
192189
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
193190
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
194191
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE=

health-events-analyzer/pkg/config/rules.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import (
2121
)
2222

2323
type SequenceStep struct {
24-
Criteria map[string]interface{} `toml:"criteria,omitempty"`
24+
Criteria map[string]interface{} `toml:"criteria"`
2525
ErrorCount int `toml:"errorCount"`
2626
}
2727

health-events-analyzer/pkg/publisher/publisher.go

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323

2424
"google.golang.org/grpc/codes"
2525
"google.golang.org/grpc/status"
26+
"google.golang.org/protobuf/proto"
2627
"k8s.io/apimachinery/pkg/util/wait"
2728
)
2829

@@ -95,21 +96,14 @@ func NewPublisher(platformConnectorClient pb.PlatformConnectorClient) *Publisher
9596

9697
func (p *PublisherConfig) Publish(ctx context.Context, event *pb.HealthEvent,
9798
recommendedAction pb.RecommenedAction, ruleName string) error {
98-
newEvent := &pb.HealthEvent{
99-
Version: event.Version,
100-
Agent: "health-events-analyzer",
101-
CheckName: ruleName, // change the check name to HealthEventsAnalyzer
102-
ComponentClass: event.ComponentClass,
103-
Message: event.Message,
104-
RecommendedAction: recommendedAction, // set the rule's recommended action
105-
ErrorCode: event.ErrorCode,
106-
IsHealthy: false, // mark event as unhealthy
107-
IsFatal: true, // mark event as fatal
108-
EntitiesImpacted: event.EntitiesImpacted,
109-
Metadata: event.Metadata,
110-
GeneratedTimestamp: event.GeneratedTimestamp,
111-
NodeName: event.NodeName,
112-
}
99+
newEvent := proto.Clone(event).(*pb.HealthEvent)
100+
101+
// Override fields with new values
102+
newEvent.Agent = "health-events-analyzer"
103+
newEvent.CheckName = ruleName // change the check name to HealthEventsAnalyzer
104+
newEvent.RecommendedAction = recommendedAction // set the rule's recommended action
105+
newEvent.IsHealthy = false // mark event as unhealthy
106+
newEvent.IsFatal = true // mark event as fatal
113107

114108
req := &pb.HealthEvents{
115109
Version: 1,

health-events-analyzer/pkg/reconciler/parser.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ func getValueFromErrorCode(event *platform_connectors.HealthEvent, parts []strin
136136
}
137137

138138
idx, err := strconv.Atoi(parts[0])
139-
if err != nil || idx >= len(event.ErrorCode) {
139+
if err != nil || idx >= len(event.ErrorCode) || idx < 0 {
140140
return nil
141141
}
142142

@@ -162,7 +162,7 @@ func getValueFromEntitiesImpacted(event *platform_connectors.HealthEvent, parts
162162
return nil
163163
}
164164

165-
if idx, err := strconv.Atoi(parts[0]); err == nil && idx < len(event.EntitiesImpacted) {
165+
if idx, err := strconv.Atoi(parts[0]); err == nil && idx >= 0 && idx < len(event.EntitiesImpacted) {
166166
entity := event.EntitiesImpacted[idx]
167167
subField := strings.ToLower(parts[1])
168168

@@ -193,6 +193,10 @@ func getValueFromGeneratedTimestamp(event *platform_connectors.HealthEvent, part
193193
}
194194

195195
func getValueFromHealthEventStatus(event storeconnector.HealthEventStatus, parts []string) any {
196+
if len(parts) == 0 {
197+
return nil
198+
}
199+
196200
rootField := strings.ToLower(parts[0])
197201

198202
if len(parts) == 1 {

health-events-analyzer/pkg/reconciler/reconciler.go

Lines changed: 11 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,6 @@ import (
3333
"go.mongodb.org/mongo-driver/mongo/options"
3434
)
3535

36-
const (
37-
maxRetries int = 5
38-
delay time.Duration = 10 * time.Second
39-
)
40-
4136
type CollectionInterface interface {
4237
Aggregate(ctx context.Context, pipeline interface{}, opts ...*options.AggregateOptions) (*mongo.Cursor, error)
4338
}
@@ -129,32 +124,20 @@ func (r *Reconciler) processEvent(ctx context.Context, event bson.M) error {
129124

130125
var publishedNewEvent bool
131126

132-
for i := 1; i <= maxRetries; i++ {
133-
slog.Debug("Handling event", "attempt", i, "event", healthEventWithStatus)
134-
135-
publishedNewEvent, err = r.handleEvent(ctx, &healthEventWithStatus)
136-
if err == nil {
137-
totalEventsSuccessfullyProcessed.Inc()
138-
139-
if publishedNewEvent {
140-
slog.Info("New event successfully published.")
141-
newEventsPublishedTotal.WithLabelValues(healthEventWithStatus.HealthEvent.NodeName).Inc()
142-
} else {
143-
slog.Info("New event is not published, rule set criteria didn't match.")
144-
}
145-
146-
break
147-
}
148-
127+
publishedNewEvent, err = r.handleEvent(ctx, &healthEventWithStatus)
128+
if err != nil {
149129
slog.Error("Error in handling the event", "event", healthEventWithStatus, "error", err)
150130

151131
totalEventProcessingError.WithLabelValues("handle_event_error").Inc()
152-
153-
time.Sleep(delay)
154-
}
155-
156-
if err != nil {
157-
slog.Error("Max attempt reached, error in handling the event", "event", healthEventWithStatus, "error", err)
132+
} else {
133+
totalEventsSuccessfullyProcessed.Inc()
134+
135+
if publishedNewEvent {
136+
slog.Info("New event successfully published.")
137+
newEventsPublishedTotal.WithLabelValues(healthEventWithStatus.HealthEvent.NodeName).Inc()
138+
} else {
139+
slog.Info("New event is not published, rule set criteria didn't match.")
140+
}
158141
}
159142

160143
duration := time.Since(startTime).Seconds()

health-events-analyzer/pkg/reconciler/reconciler_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ var (
9696
TimeWindow: "2m",
9797
Sequence: []config.SequenceStep{{
9898
Criteria: map[string]interface{}{
99-
"healthevent.entitiesimpacted.0.entitytype": "GPU ",
99+
"healthevent.entitiesimpacted.0.entitytype": "GPU",
100100
"healthevent.entitiesimpacted.0.entityvalue": "this.healthevent.entitiesimpacted[0].entityvalue",
101101
"healthevent.errorcode.0": "13",
102102
"healthevent.nodename": "this.healthevent.nodename",

tests/helpers/healthevent.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,12 +198,13 @@ func TestCleanUp(ctx context.Context, GpuNodeName string, nodeCondition string,
198198
client, err := c.NewClient()
199199
if err != nil {
200200
klog.Errorf("failed to create client during cleanup: %v", err)
201-
return err
201+
return fmt.Errorf("failed to create client during cleanup: %w", err)
202202
}
203203

204204
err = CleanupNodeConditionAndUncordon(ctx, client, GpuNodeName, nodeCondition)
205205
if err != nil {
206206
klog.Errorf("failed to cleanup node condition and uncordon node %s: %v", GpuNodeName, err)
207+
return fmt.Errorf("failed to cleanup node condition and uncordon node %s: %w", GpuNodeName, err)
207208
}
208209

209210
klog.Infof("Successfully cleaned up node condition and uncordoned node %s", GpuNodeName)

tests/multiple_fatal_event_test.go

Lines changed: 32 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,18 @@ import (
2222

2323
"github.com/stretchr/testify/assert"
2424
"go.mongodb.org/mongo-driver/bson"
25-
"k8s.io/klog/v2"
2625
"sigs.k8s.io/e2e-framework/pkg/envconf"
2726
"sigs.k8s.io/e2e-framework/pkg/features"
2827
)
2928

3029
func TestMultipleFatalEventRule(t *testing.T) {
3130
type contextKey int
32-
var GpuNodeName string
3331

3432
const (
35-
keyGpuNodes contextKey = iota
36-
ERRORCODE_13 = "13"
37-
ERRORCODE_48 = "48"
33+
keyGpuNodes contextKey = iota
34+
keyGpuNodeName
35+
ERRORCODE_13 = "13"
36+
ERRORCODE_48 = "48"
3837
)
3938

4039
feature := features.New("TestMultipleFatalEventRule").
@@ -55,82 +54,89 @@ func TestMultipleFatalEventRule(t *testing.T) {
5554
feature.Assess("Inject multiple fatal errors", func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context {
5655
gpuNodes := ctx.Value(keyGpuNodes).([]string)
5756
assert.True(t, len(gpuNodes) > 0, "no gpu nodes found")
58-
GpuNodeName = gpuNodes[rand.Intn(len(gpuNodes))]
59-
t.Logf("Injecting fatal events to node %s", GpuNodeName)
57+
gpuNodeName := gpuNodes[rand.Intn(len(gpuNodes))]
58+
ctx = context.WithValue(ctx, keyGpuNodeName, gpuNodeName)
59+
t.Logf("Injecting fatal events to node %s", gpuNodeName)
6060

6161
// inject 5 fatal errors and let the remediation cycle finish
6262

6363
// inject XID 13 error
64-
err := helpers.SendHealthEventsToNodes([]string{GpuNodeName}, ERRORCODE_13, "data/fatal-health-event.json")
64+
err := helpers.SendHealthEventsToNodes([]string{gpuNodeName}, ERRORCODE_13, "data/fatal-health-event.json")
6565
assert.NoError(t, err, "failed to send fatal events")
6666
time.Sleep(10 * time.Second)
6767

68-
err = helpers.SendHealthEventsToNodes([]string{GpuNodeName}, ERRORCODE_13, "data/healthy-event.json")
68+
err = helpers.SendHealthEventsToNodes([]string{gpuNodeName}, ERRORCODE_13, "data/healthy-event.json")
6969
assert.NoError(t, err, "failed to send healthy events")
7070
time.Sleep(5 * time.Second)
7171

7272
// inject XID 48 error
73-
err = helpers.SendHealthEventsToNodes([]string{GpuNodeName}, ERRORCODE_48, "data/fatal-health-event.json")
73+
err = helpers.SendHealthEventsToNodes([]string{gpuNodeName}, ERRORCODE_48, "data/fatal-health-event.json")
7474
assert.NoError(t, err, "failed to send fatal events")
7575
time.Sleep(10 * time.Second)
7676

77-
err = helpers.SendHealthEventsToNodes([]string{GpuNodeName}, ERRORCODE_48, "data/healthy-event.json")
77+
err = helpers.SendHealthEventsToNodes([]string{gpuNodeName}, ERRORCODE_48, "data/healthy-event.json")
7878
assert.NoError(t, err, "failed to send healthy events")
7979
time.Sleep(5 * time.Second)
8080

8181
// inject XID 13 error
82-
err = helpers.SendHealthEventsToNodes([]string{GpuNodeName}, ERRORCODE_13, "data/fatal-health-event.json")
82+
err = helpers.SendHealthEventsToNodes([]string{gpuNodeName}, ERRORCODE_13, "data/fatal-health-event.json")
8383
assert.NoError(t, err, "failed to send fatal events")
8484
time.Sleep(10 * time.Second)
8585

86-
err = helpers.SendHealthEventsToNodes([]string{GpuNodeName}, ERRORCODE_13, "data/healthy-event.json")
86+
err = helpers.SendHealthEventsToNodes([]string{gpuNodeName}, ERRORCODE_13, "data/healthy-event.json")
8787
assert.NoError(t, err, "failed to send healthy events")
8888
time.Sleep(5 * time.Second)
8989

9090
// inject XID 48 error
91-
err = helpers.SendHealthEventsToNodes([]string{GpuNodeName}, ERRORCODE_48, "data/fatal-health-event.json")
91+
err = helpers.SendHealthEventsToNodes([]string{gpuNodeName}, ERRORCODE_48, "data/fatal-health-event.json")
9292
assert.NoError(t, err, "failed to send fatal events")
9393
time.Sleep(10 * time.Second)
9494

95-
err = helpers.SendHealthEventsToNodes([]string{GpuNodeName}, ERRORCODE_48, "data/healthy-event.json")
95+
err = helpers.SendHealthEventsToNodes([]string{gpuNodeName}, ERRORCODE_48, "data/healthy-event.json")
9696
assert.NoError(t, err, "failed to send healthy events")
9797
time.Sleep(5 * time.Second)
9898

9999
// inject XID 13 error
100-
err = helpers.SendHealthEventsToNodes([]string{GpuNodeName}, ERRORCODE_13, "data/fatal-health-event.json")
100+
err = helpers.SendHealthEventsToNodes([]string{gpuNodeName}, ERRORCODE_13, "data/fatal-health-event.json")
101101
assert.NoError(t, err, "failed to send fatal events")
102102
time.Sleep(10 * time.Second)
103103

104-
err = helpers.SendHealthEventsToNodes([]string{GpuNodeName}, ERRORCODE_13, "data/healthy-event.json")
104+
err = helpers.SendHealthEventsToNodes([]string{gpuNodeName}, ERRORCODE_13, "data/healthy-event.json")
105105
assert.NoError(t, err, "failed to send healthy events")
106106
time.Sleep(5 * time.Second)
107107

108108
return ctx
109109
})
110110

111111
feature.Assess("Check if health event analyzer published a new fatal event", func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context {
112+
// Get GPU node name from context
113+
gpuNodeName, ok := ctx.Value(keyGpuNodeName).(string)
114+
if !ok || gpuNodeName == "" {
115+
t.Fatal("GPU node name not found in context - previous assess step may have failed")
116+
}
117+
112118
// Ensure cleanup at the end of the test
113119
defer func() {
114-
t.Logf("Starting cleanup for node %s", GpuNodeName)
120+
t.Logf("Starting cleanup for node %s", gpuNodeName)
115121

116-
err := helpers.TestCleanUp(ctx, GpuNodeName, "MultipleFatalError", "31", c)
117-
assert.NoError(t, err, "failed to cleanup node condition and uncordon node %s", GpuNodeName)
118-
t.Logf("Successfully cleaned up node condition and uncordoned node %s", GpuNodeName)
122+
err := helpers.TestCleanUp(ctx, gpuNodeName, "MultipleFatalError", "31", c)
123+
assert.NoError(t, err, "failed to cleanup node condition and uncordon node %s", gpuNodeName)
124+
t.Logf("Successfully cleaned up node condition and uncordoned node %s", gpuNodeName)
119125
}()
120126

121-
// inject XID 48 error to trigger the rule
122-
err := helpers.SendHealthEventsToNodes([]string{GpuNodeName}, "31", "data/fatal-health-event.json")
127+
// inject XID 31 error to trigger the rule
128+
err := helpers.SendHealthEventsToNodes([]string{gpuNodeName}, "31", "data/fatal-health-event.json")
123129
assert.NoError(t, err, "failed to send fatal events")
124130
time.Sleep(10 * time.Second)
125131

126132
client, err := c.NewClient()
127133
assert.NoError(t, err, "failed to create client")
128134

129135
// Check node condition for matched ruleset
130-
helpers.WaitForNodeConditionWithCheckName(ctx, t, client, GpuNodeName, "MultipleFatalError")
136+
helpers.WaitForNodeConditionWithCheckName(ctx, t, client, gpuNodeName, "MultipleFatalError")
131137
// Check MongoDB for health event with checkName = "MultipleFatalError"
132138
filter := bson.M{
133-
"healthevent.nodename": GpuNodeName,
139+
"healthevent.nodename": gpuNodeName,
134140
"healthevent.checkname": "MultipleFatalError",
135141
}
136142

@@ -142,7 +148,7 @@ func TestMultipleFatalEventRule(t *testing.T) {
142148
if healthEvent, ok := event["healthevent"].(map[string]interface{}); ok {
143149
nodeName, ok := healthEvent["nodename"].(string)
144150
assert.True(t, ok, "nodename should be a string")
145-
assert.Equal(t, GpuNodeName, nodeName, "nodeName should be the same as the node name")
151+
assert.Equal(t, gpuNodeName, nodeName, "nodeName should be the same as the node name")
146152

147153
checkName, ok := healthEvent["checkname"].(string)
148154
assert.True(t, ok, "checkname should be a string")

0 commit comments

Comments
 (0)