Skip to content

Commit b44a45c

Browse files
authored
Add Health Checks Framework (#252)
Just adding in the machinery to allow generic health checks of resources. The interfaces we have in place act to constrain the types that can be health checked and also provide the functions to grab applications from the CD layer and finally update the status conditions.
1 parent 56d674c commit b44a45c

File tree

7 files changed

+234
-0
lines changed

7 files changed

+234
-0
lines changed

charts/kubernetes/crds/unikorn-cloud.org_clustermanagers.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,7 @@ spec:
292292
- Errored
293293
- Deprovisioning
294294
- Deprovisioned
295+
- Unknown
295296
- Healthy
296297
- Degraded
297298
type: string

charts/kubernetes/crds/unikorn-cloud.org_kubernetesclusters.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,7 @@ spec:
497497
- Errored
498498
- Deprovisioning
499499
- Deprovisioned
500+
- Unknown
500501
- Healthy
501502
- Degraded
502503
type: string

charts/kubernetes/crds/unikorn-cloud.org_virtualkubernetesclusters.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,7 @@ spec:
320320
- Errored
321321
- Deprovisioning
322322
- Deprovisioned
323+
- Unknown
323324
- Healthy
324325
- Degraded
325326
type: string

charts/kubernetes/templates/monitor/clusterrole.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,20 @@ rules:
1111
resources:
1212
- clustermanagers
1313
- kubernetesclusters
14+
- virtualkubernetesclusters
1415
verbs:
1516
- list
1617
- watch
1718
- update
19+
# Update status conditions
20+
- apiGroups:
21+
- unikorn-cloud.org
22+
resources:
23+
- clustermanagers/status
24+
- kubernetesclusters/status
25+
- virtualkubernetesclusters/status
26+
verbs:
27+
- patch
1828
# Get application bundles
1929
- apiGroups:
2030
- unikorn-cloud.org
@@ -24,3 +34,11 @@ rules:
2434
verbs:
2535
- list
2636
- watch
37+
# ArgoCD application access for health monitoring.
38+
- apiGroups:
39+
- argoproj.io
40+
resources:
41+
- applications
42+
verbs:
43+
- list
44+
- watch

pkg/apis/unikorn/v1alpha1/helpers.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ package v1alpha1
2020
import (
2121
"errors"
2222
"fmt"
23+
"iter"
2324
"strings"
2425
"time"
2526

@@ -40,6 +41,39 @@ var (
4041
ErrApplicationLookup = errors.New("failed to lookup an application")
4142
)
4243

44+
// All implements generic iteration over list items.
45+
func (l *ClusterManagerList) All() iter.Seq[*ClusterManager] {
46+
return func(yield func(t *ClusterManager) bool) {
47+
for i := range l.Items {
48+
if !yield(&l.Items[i]) {
49+
return
50+
}
51+
}
52+
}
53+
}
54+
55+
// All implements generic iteration over list items.
56+
func (l *KubernetesClusterList) All() iter.Seq[*KubernetesCluster] {
57+
return func(yield func(t *KubernetesCluster) bool) {
58+
for i := range l.Items {
59+
if !yield(&l.Items[i]) {
60+
return
61+
}
62+
}
63+
}
64+
}
65+
66+
// All implements generic iteration over list items.
67+
func (l *VirtualKubernetesClusterList) All() iter.Seq[*VirtualKubernetesCluster] {
68+
return func(yield func(t *VirtualKubernetesCluster) bool) {
69+
for i := range l.Items {
70+
if !yield(&l.Items[i]) {
71+
return
72+
}
73+
}
74+
}
75+
}
76+
4377
// Paused implements the ReconcilePauser interface.
4478
func (c *ClusterManager) Paused() bool {
4579
return c.Spec.Pause

pkg/monitor/health/cheker.go

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
/*
2+
Copyright 2025 the Unikorn Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package health
18+
19+
import (
20+
"context"
21+
"errors"
22+
"fmt"
23+
"iter"
24+
25+
unikornv1 "github.com/unikorn-cloud/core/pkg/apis/unikorn/v1alpha1"
26+
"github.com/unikorn-cloud/core/pkg/cd"
27+
28+
corev1 "k8s.io/api/core/v1"
29+
30+
"sigs.k8s.io/controller-runtime/pkg/client"
31+
)
32+
33+
// ErrTypeConversion is the sentinel error wrapped when a value cannot be
// converted to the type the health checker expects.
var ErrTypeConversion = errors.New("type conversion error")
36+
37+
// Lister is a generic inteface for operating on and iterating over resource
38+
// lists (as no such interface is required by apimachinery.
39+
type Lister[T unikornv1.ManagableResourceInterface] interface {
40+
client.ObjectList
41+
All() iter.Seq[T]
42+
}
43+
44+
// Checker lists all resources of the specified type and does a health check on it.
45+
// The type itself is constrained to a manageable resource so we can get the label selector
46+
// to pass to the CD layer to get all applications for the resource, then once we have
47+
// checked the status of those applications we can set the condition generically, again
48+
// as provided by the manageable resource interface.
49+
type Checker[T unikornv1.ManagableResourceInterface, L Lister[T]] struct {
50+
// client allows access to Kubernetes resources.
51+
client client.Client
52+
// driver is the CD driver.
53+
driver cd.Driver
54+
// l is storage for the manageable resource list.
55+
l L
56+
}
57+
58+
// New creates a new checker. All types can be inferred, the template parameters
59+
// are purely for type constraints.
60+
func New[T unikornv1.ManagableResourceInterface, L Lister[T]](client client.Client, driver cd.Driver, l L) *Checker[T, L] {
61+
return &Checker[T, L]{
62+
client: client,
63+
driver: driver,
64+
l: l,
65+
}
66+
}
67+
68+
// resourceIdentifierFromResource takes our manageable resource type and returns
69+
// a CD resource ID to identify its applications.
70+
func resourceIdentifierFromResource(r unikornv1.ManagableResourceInterface) (*cd.ResourceIdentifier, error) {
71+
labels, err := r.ResourceLabels()
72+
if err != nil {
73+
return nil, err
74+
}
75+
76+
id := &cd.ResourceIdentifier{
77+
Labels: make([]cd.ResourceIdentifierLabel, 0, len(labels)),
78+
}
79+
80+
for k, v := range labels {
81+
id.Labels = append(id.Labels, cd.ResourceIdentifierLabel{
82+
Name: k,
83+
Value: v,
84+
})
85+
}
86+
87+
return id, nil
88+
}
89+
90+
// convertHealthStatus translates from the CD interface to the Kubernetes API.
91+
func convertHealthStatus(status cd.HealthStatus) (corev1.ConditionStatus, unikornv1.ConditionReason, string) {
92+
switch status {
93+
case cd.HealthStatusUnknown:
94+
return corev1.ConditionUnknown, unikornv1.ConditionReasonUnknown, "unable to poll application status"
95+
case cd.HealthStatusHealthy:
96+
return corev1.ConditionTrue, unikornv1.ConditionReasonHealthy, "resource applications healthy"
97+
case cd.HealthStatusDegraded:
98+
return corev1.ConditionFalse, unikornv1.ConditionReasonDegraded, "one or more resource applications are degraded"
99+
}
100+
101+
// NOTE: the linter will warn about non-exhaustive switches.
102+
return corev1.ConditionUnknown, unikornv1.ConditionReasonUnknown, "unreachable code reached"
103+
}
104+
105+
// check does the actual check for a resource and updates its status.
106+
func (c *Checker[T, L]) check(ctx context.Context, r unikornv1.ManagableResourceInterface) error {
107+
// Grab the overall health status.
108+
id, err := resourceIdentifierFromResource(r)
109+
if err != nil {
110+
return err
111+
}
112+
113+
// TODO: we only support argo now, but will need an abstraction down the line.
114+
// There is precedent in the main controllers.
115+
healthStatus, err := c.driver.GetHealthStatus(ctx, id)
116+
if err != nil {
117+
return err
118+
}
119+
120+
updated, ok := r.DeepCopyObject().(unikornv1.ManagableResourceInterface)
121+
if !ok {
122+
return fmt.Errorf("%w: unable to deep copy manageable resource", ErrTypeConversion)
123+
}
124+
125+
// And finally set the status condition.
126+
status, reason, message := convertHealthStatus(healthStatus)
127+
128+
updated.StatusConditionWrite(unikornv1.ConditionHealthy, status, reason, message)
129+
130+
if err := c.client.Status().Patch(ctx, updated, client.MergeFrom(r)); err != nil {
131+
return err
132+
}
133+
134+
return nil
135+
}
136+
137+
// Check does the actual check as described for the health checker type.
138+
func (c *Checker[T, L]) Check(ctx context.Context) error {
139+
// NOTE: This looks expensive, but it's all cached by controller-runtime.
140+
if err := c.client.List(ctx, c.l, &client.ListOptions{}); err != nil {
141+
return err
142+
}
143+
144+
for o := range c.l.All() {
145+
if err := c.check(ctx, o); err != nil {
146+
return err
147+
}
148+
}
149+
150+
return nil
151+
}

pkg/monitor/monitor.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ import (
2323

2424
"github.com/spf13/pflag"
2525

26+
"github.com/unikorn-cloud/core/pkg/cd"
27+
"github.com/unikorn-cloud/core/pkg/cd/argocd"
28+
coreerrors "github.com/unikorn-cloud/core/pkg/errors"
29+
unikornv1 "github.com/unikorn-cloud/kubernetes/pkg/apis/unikorn/v1alpha1"
30+
"github.com/unikorn-cloud/kubernetes/pkg/monitor/health"
2631
upgradecluster "github.com/unikorn-cloud/kubernetes/pkg/monitor/upgrade/cluster"
2732
upgradeclustermanager "github.com/unikorn-cloud/kubernetes/pkg/monitor/upgrade/clustermanager"
2833

@@ -36,11 +41,26 @@ type Options struct {
3641
// run with high frequency, reads are all cached. It's mostly down to
3742
// burning CPU unnecessarily.
3843
pollPeriod time.Duration
44+
45+
// cdDriver defines the continuous-delivery backend driver to use
46+
// to manage applications.
47+
cdDriver cd.DriverKindFlag
3948
}
4049

4150
// AddFlags registers option flags with pflag.
4251
func (o *Options) AddFlags(flags *pflag.FlagSet) {
52+
o.cdDriver.Kind = cd.DriverKindArgoCD
53+
4354
flags.DurationVar(&o.pollPeriod, "poll-period", time.Minute, "Period to poll for updates")
55+
flags.Var(&o.cdDriver, "cd-driver", "CD backend driver to use from [argocd]")
56+
}
57+
58+
func (o *Options) getDriver(client client.Client) (cd.Driver, error) {
59+
if o.cdDriver.Kind != cd.DriverKindArgoCD {
60+
return nil, coreerrors.ErrCDDriver
61+
}
62+
63+
return argocd.New(client, argocd.Options{}), nil
4464
}
4565

4666
// Checker is an interface that monitors must implement.
@@ -56,9 +76,17 @@ func Run(ctx context.Context, c client.Client, o *Options) {
5676
ticker := time.NewTicker(o.pollPeriod)
5777
defer ticker.Stop()
5878

79+
driver, err := o.getDriver(c)
80+
if err != nil {
81+
panic(err)
82+
}
83+
5984
checkers := []Checker{
6085
upgradecluster.New(c),
6186
upgradeclustermanager.New(c),
87+
health.New(c, driver, &unikornv1.ClusterManagerList{}),
88+
health.New(c, driver, &unikornv1.KubernetesClusterList{}),
89+
health.New(c, driver, &unikornv1.VirtualKubernetesClusterList{}),
6290
}
6391

6492
for {

0 commit comments

Comments
 (0)