@@ -22,19 +22,18 @@ import (
2222
2323	"github.com/stretchr/testify/assert" 
2424	"go.mongodb.org/mongo-driver/bson" 
25- 	"k8s.io/klog/v2" 
2625	"sigs.k8s.io/e2e-framework/pkg/envconf" 
2726	"sigs.k8s.io/e2e-framework/pkg/features" 
2827)
2928
3029func  TestMultipleFatalEventRule (t  * testing.T ) {
3130	type  contextKey  int 
32- 	var  GpuNodeName  string 
3331
3432	const  (
35- 		keyGpuNodes   contextKey  =  iota 
36- 		ERRORCODE_13             =  "13" 
37- 		ERRORCODE_48             =  "48" 
33+ 		keyGpuNodes  contextKey  =  iota 
34+ 		keyGpuNodeName 
35+ 		ERRORCODE_13  =  "13" 
36+ 		ERRORCODE_48  =  "48" 
3837	)
3938
4039	feature  :=  features .New ("TestMultipleFatalEventRule" ).
@@ -55,82 +54,89 @@ func TestMultipleFatalEventRule(t *testing.T) {
5554	feature .Assess ("Inject multiple fatal errors" , func (ctx  context.Context , t  * testing.T , c  * envconf.Config ) context.Context  {
5655		gpuNodes  :=  ctx .Value (keyGpuNodes ).([]string )
5756		assert .True (t , len (gpuNodes ) >  0 , "no gpu nodes found" )
58- 		GpuNodeName  =  gpuNodes [rand .Intn (len (gpuNodes ))]
59- 		t .Logf ("Injecting fatal events to node %s" , GpuNodeName )
57+ 		gpuNodeName  :=  gpuNodes [rand .Intn (len (gpuNodes ))]
58+ 		ctx  =  context .WithValue (ctx , keyGpuNodeName , gpuNodeName )
59+ 		t .Logf ("Injecting fatal events to node %s" , gpuNodeName )
6060
6161		// inject 5 fatal errors and let the remediation cycle finish 
6262
6363		// inject XID 13 error 
64- 		err  :=  helpers .SendHealthEventsToNodes ([]string {GpuNodeName }, ERRORCODE_13 , "data/fatal-health-event.json" )
64+ 		err  :=  helpers .SendHealthEventsToNodes ([]string {gpuNodeName }, ERRORCODE_13 , "data/fatal-health-event.json" )
6565		assert .NoError (t , err , "failed to send fatal events" )
6666		time .Sleep (10  *  time .Second )
6767
68- 		err  =  helpers .SendHealthEventsToNodes ([]string {GpuNodeName }, ERRORCODE_13 , "data/healthy-event.json" )
68+ 		err  =  helpers .SendHealthEventsToNodes ([]string {gpuNodeName }, ERRORCODE_13 , "data/healthy-event.json" )
6969		assert .NoError (t , err , "failed to send healthy events" )
7070		time .Sleep (5  *  time .Second )
7171
7272		// inject XID 48 error 
73- 		err  =  helpers .SendHealthEventsToNodes ([]string {GpuNodeName }, ERRORCODE_48 , "data/fatal-health-event.json" )
73+ 		err  =  helpers .SendHealthEventsToNodes ([]string {gpuNodeName }, ERRORCODE_48 , "data/fatal-health-event.json" )
7474		assert .NoError (t , err , "failed to send fatal events" )
7575		time .Sleep (10  *  time .Second )
7676
77- 		err  =  helpers .SendHealthEventsToNodes ([]string {GpuNodeName }, ERRORCODE_48 , "data/healthy-event.json" )
77+ 		err  =  helpers .SendHealthEventsToNodes ([]string {gpuNodeName }, ERRORCODE_48 , "data/healthy-event.json" )
7878		assert .NoError (t , err , "failed to send healthy events" )
7979		time .Sleep (5  *  time .Second )
8080
8181		// inject XID 13 error 
82- 		err  =  helpers .SendHealthEventsToNodes ([]string {GpuNodeName }, ERRORCODE_13 , "data/fatal-health-event.json" )
82+ 		err  =  helpers .SendHealthEventsToNodes ([]string {gpuNodeName }, ERRORCODE_13 , "data/fatal-health-event.json" )
8383		assert .NoError (t , err , "failed to send fatal events" )
8484		time .Sleep (10  *  time .Second )
8585
86- 		err  =  helpers .SendHealthEventsToNodes ([]string {GpuNodeName }, ERRORCODE_13 , "data/healthy-event.json" )
86+ 		err  =  helpers .SendHealthEventsToNodes ([]string {gpuNodeName }, ERRORCODE_13 , "data/healthy-event.json" )
8787		assert .NoError (t , err , "failed to send healthy events" )
8888		time .Sleep (5  *  time .Second )
8989
9090		// inject XID 48 error 
91- 		err  =  helpers .SendHealthEventsToNodes ([]string {GpuNodeName }, ERRORCODE_48 , "data/fatal-health-event.json" )
91+ 		err  =  helpers .SendHealthEventsToNodes ([]string {gpuNodeName }, ERRORCODE_48 , "data/fatal-health-event.json" )
9292		assert .NoError (t , err , "failed to send fatal events" )
9393		time .Sleep (10  *  time .Second )
9494
95- 		err  =  helpers .SendHealthEventsToNodes ([]string {GpuNodeName }, ERRORCODE_48 , "data/healthy-event.json" )
95+ 		err  =  helpers .SendHealthEventsToNodes ([]string {gpuNodeName }, ERRORCODE_48 , "data/healthy-event.json" )
9696		assert .NoError (t , err , "failed to send healthy events" )
9797		time .Sleep (5  *  time .Second )
9898
9999		// inject XID 13 error 
100- 		err  =  helpers .SendHealthEventsToNodes ([]string {GpuNodeName }, ERRORCODE_13 , "data/fatal-health-event.json" )
100+ 		err  =  helpers .SendHealthEventsToNodes ([]string {gpuNodeName }, ERRORCODE_13 , "data/fatal-health-event.json" )
101101		assert .NoError (t , err , "failed to send fatal events" )
102102		time .Sleep (10  *  time .Second )
103103
104- 		err  =  helpers .SendHealthEventsToNodes ([]string {GpuNodeName }, ERRORCODE_13 , "data/healthy-event.json" )
104+ 		err  =  helpers .SendHealthEventsToNodes ([]string {gpuNodeName }, ERRORCODE_13 , "data/healthy-event.json" )
105105		assert .NoError (t , err , "failed to send healthy events" )
106106		time .Sleep (5  *  time .Second )
107107
108108		return  ctx 
109109	})
110110
111111	feature .Assess ("Check if health event analyzer published a new fatal event" , func (ctx  context.Context , t  * testing.T , c  * envconf.Config ) context.Context  {
112+ 		// Get GPU node name from context 
113+ 		gpuNodeName , ok  :=  ctx .Value (keyGpuNodeName ).(string )
114+ 		if  ! ok  ||  gpuNodeName  ==  ""  {
115+ 			t .Fatal ("GPU node name not found in context - previous assess step may have failed" )
116+ 		}
117+ 
112118		// Ensure cleanup at the end of the test 
113119		defer  func () {
114- 			t .Logf ("Starting cleanup for node %s" , GpuNodeName )
120+ 			t .Logf ("Starting cleanup for node %s" , gpuNodeName )
115121
116- 			err  :=  helpers .TestCleanUp (ctx , GpuNodeName , "MultipleFatalError" , "31" , c )
117- 			assert .NoError (t , err , "failed to cleanup node condition and uncordon node %s" , GpuNodeName )
118- 			t .Logf ("Successfully cleaned up node condition and uncordoned node %s" , GpuNodeName )
122+ 			err  :=  helpers .TestCleanUp (ctx , gpuNodeName , "MultipleFatalError" , "31" , c )
123+ 			assert .NoError (t , err , "failed to cleanup node condition and uncordon node %s" , gpuNodeName )
124+ 			t .Logf ("Successfully cleaned up node condition and uncordoned node %s" , gpuNodeName )
119125		}()
120126
121- 		// inject XID 48  error to trigger the rule 
122- 		err  :=  helpers .SendHealthEventsToNodes ([]string {GpuNodeName }, "31" , "data/fatal-health-event.json" )
127+ 		// inject XID 31  error to trigger the rule 
128+ 		err  :=  helpers .SendHealthEventsToNodes ([]string {gpuNodeName }, "31" , "data/fatal-health-event.json" )
123129		assert .NoError (t , err , "failed to send fatal events" )
124130		time .Sleep (10  *  time .Second )
125131
126132		client , err  :=  c .NewClient ()
127133		assert .NoError (t , err , "failed to create client" )
128134
129135		// Check node condition for matched ruleset 
130- 		helpers .WaitForNodeConditionWithCheckName (ctx , t , client , GpuNodeName , "MultipleFatalError" )
136+ 		helpers .WaitForNodeConditionWithCheckName (ctx , t , client , gpuNodeName , "MultipleFatalError" )
131137		// Check MongoDB for health event with checkName = "MultipleFatalError" 
132138		filter  :=  bson.M {
133- 			"healthevent.nodename" :  GpuNodeName ,
139+ 			"healthevent.nodename" :  gpuNodeName ,
134140			"healthevent.checkname" : "MultipleFatalError" ,
135141		}
136142
@@ -142,7 +148,7 @@ func TestMultipleFatalEventRule(t *testing.T) {
142148		if  healthEvent , ok  :=  event ["healthevent" ].(map [string ]interface {}); ok  {
143149			nodeName , ok  :=  healthEvent ["nodename" ].(string )
144150			assert .True (t , ok , "nodename should be a string" )
145- 			assert .Equal (t , GpuNodeName , nodeName , "nodeName should be the same as the node name" )
151+ 			assert .Equal (t , gpuNodeName , nodeName , "nodeName should be the same as the node name" )
146152
147153			checkName , ok  :=  healthEvent ["checkname" ].(string )
148154			assert .True (t , ok , "checkname should be a string" )
0 commit comments