added EtcdClusterStatus, modified reconciliation loop for status tracking

ballista01 · ballista01 · commit 86e9968d6d6c · 2025-05-05T00:30:49.000-04:00
Signed-off-by: Wenxue Zhao <ballista01@outlook.com>
diff --git a/api/v1alpha1/etcdcluster_types.go b/api/v1alpha1/etcdcluster_types.go
@@ -62,6 +62,21 @@ type ProviderCertManagerConfig struct {
 type EtcdClusterStatus struct {
 	// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
 	// Important: Run "make" to regenerate code after modifying this file
+
+	// ReadyReplicas is the number of pods targeted by this EtcdCluster with a Ready condition.
+	ReadyReplicas int32 `json:"readyReplicas,omitempty"`
+	// Members is the number of etcd members in the cluster reported by etcd API.
+	Members int32 `json:"members,omitempty"`
+	// CurrentVersion is the version of the etcd cluster.
+	CurrentVersion string `json:"currentVersion,omitempty"`
+	// Phase indicates the state of the EtcdCluster.
+	Phase string `json:"phase,omitempty"`
+	// Conditions represent the latest available observations of the EtcdCluster's state.
+	// +optional
+	// +patchMergeKey=type
+	// +patchStrategy=merge
+	// +listType=atomic
+	Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
 }
 
 // +kubebuilder:object:root=true
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
diff --git a/config/crd/bases/operator.etcd.io_etcdclusters.yaml b/config/crd/bases/operator.etcd.io_etcdclusters.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
 kind: CustomResourceDefinition
 metadata:
   annotations:
-    controller-gen.kubebuilder.io/version: v0.17.2
+    controller-gen.kubebuilder.io/version: v0.17.3
   name: etcdclusters.operator.etcd.io
 spec:
   group: operator.etcd.io
@@ -99,6 +99,82 @@ spec:
             type: object
           status:
             description: EtcdClusterStatus defines the observed state of EtcdCluster.
+            properties:
+              conditions:
+                description: Conditions represent the latest available observations
+                  of the EtcdCluster's state.
+                items:
+                  description: Condition contains details for one aspect of the current
+                    state of this API Resource.
+                  properties:
+                    lastTransitionTime:
+                      description: |-
+                        lastTransitionTime is the last time the condition transitioned from one status to another.
+                        This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
+                      format: date-time
+                      type: string
+                    message:
+                      description: |-
+                        message is a human readable message indicating details about the transition.
+                        This may be an empty string.
+                      maxLength: 32768
+                      type: string
+                    observedGeneration:
+                      description: |-
+                        observedGeneration represents the .metadata.generation that the condition was set based upon.
+                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
+                        with respect to the current state of the instance.
+                      format: int64
+                      minimum: 0
+                      type: integer
+                    reason:
+                      description: |-
+                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
+                        Producers of specific condition types may define expected values and meanings for this field,
+                        and whether the values are considered a guaranteed API.
+                        The value should be a CamelCase string.
+                        This field may not be empty.
+                      maxLength: 1024
+                      minLength: 1
+                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
+                      type: string
+                    status:
+                      description: status of the condition, one of True, False, Unknown.
+                      enum:
+                      - "True"
+                      - "False"
+                      - Unknown
+                      type: string
+                    type:
+                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
+                      maxLength: 316
+                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
+                      type: string
+                  required:
+                  - lastTransitionTime
+                  - message
+                  - reason
+                  - status
+                  - type
+                  type: object
+                type: array
+                x-kubernetes-list-type: atomic
+              currentVersion:
+                description: CurrentVersion is the version of the etcd cluster.
+                type: string
+              members:
+                description: Members is the number of etcd members in the cluster
+                  reported by etcd API.
+                format: int32
+                type: integer
+              phase:
+                description: Phase indicates the state of the EtcdCluster.
+                type: string
+              readyReplicas:
+                description: ReadyReplicas is the number of pods targeted by this
+                  EtcdCluster with a Ready condition.
+                format: int32
+                type: integer
             type: object
         type: object
     served: true
diff --git a/internal/controller/etcdcluster_controller.go b/internal/controller/etcdcluster_controller.go
@@ -23,6 +23,7 @@ import (
 
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/equality"
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/client-go/tools/record"
@@ -78,8 +79,24 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		return ctrl.Result{}, err
 	}
 
+	// Keep a copy of the old status for patching later
+	oldEtcdCluster := etcdCluster.DeepCopy()
+
+	// --- Defer the status update ---
+	// Use a closure to capture the logger and handle potential errors from updateStatusIfNeeded
+	defer func() {
+		if err := r.updateStatusIfNeeded(ctx, etcdCluster, oldEtcdCluster); err != nil {
+			// Only log the error from the status update; the Reconcile return value is left unchanged here.
+			// A failed status patch therefore does not by itself trigger a requeue; the status is recomputed on the next reconcile.
+			logger.Error(err, "Deferred status update failed")
+		}
+	}()
+
 	if etcdCluster.Spec.Size == 0 {
 		logger.Info("EtcdCluster size is 0..Skipping next steps")
+		etcdCluster.Status.Phase = "Idle" // Example: Set a phase even for size 0
+		etcdCluster.Status.ReadyReplicas = 0
+		etcdCluster.Status.Members = 0
 		return ctrl.Result{}, nil
 	}
 
@@ -93,53 +110,81 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		if errors.IsNotFound(err) {
 			logger.Info("Creating StatefulSet with 0 replica", "expectedSize", etcdCluster.Spec.Size)
 			// Create a new StatefulSet
-
 			sts, err = reconcileStatefulSet(ctx, logger, etcdCluster, r.Client, 0, r.Scheme)
 			if err != nil {
+				logger.Error(err, "Failed to create StatefulSet")
+				etcdCluster.Status.Phase = "Failed"
 				return ctrl.Result{}, err
 			}
 		} else {
 			// If an error occurs during Get/Create, we'll requeue the item so we can
 			// attempt processing again later. This could have been caused by a
 			// temporary network failure, or any other transient reason.
 			logger.Error(err, "Failed to get StatefulSet. Requesting requeue")
+			etcdCluster.Status.Phase = "Failed"
 			return ctrl.Result{RequeueAfter: requeueDuration}, nil
 		}
 	}
 
+	// At this point, sts should exist (either found or created)
+	if sts == nil {
+		// This case should ideally not happen if error handling above is correct
+		err := fmt.Errorf("statefulSet is unexpectedly nil after get/create")
+		logger.Error(err, "Internal error")
+		etcdCluster.Status.Phase = "Failed"
+		return ctrl.Result{}, err // Return error, defer will update status
+	}
+
+	// Update status based on STS before proceeding
+	etcdCluster.Status.ReadyReplicas = sts.Status.ReadyReplicas
+
 	// If the Statefulsets is not controlled by this EtcdCluster resource, we should log
 	// a warning to the event recorder and return error msg.
 	err = checkStatefulSetControlledByEtcdOperator(etcdCluster, sts)
 	if err != nil {
 		logger.Error(err, "StatefulSet is not controlled by this EtcdCluster resource")
+		etcdCluster.Status.Phase = "Failed"
 		return ctrl.Result{}, err
 	}
 
 	// If statefulset size is 0. try to instantiate the cluster with 1 member
 	if sts.Spec.Replicas != nil && *sts.Spec.Replicas == 0 {
 		logger.Info("StatefulSet has 0 replicas. Trying to create a new cluster with 1 member")
-
 		sts, err = reconcileStatefulSet(ctx, logger, etcdCluster, r.Client, 1, r.Scheme)
 		if err != nil {
+			logger.Error(err, "Failed to scale StatefulSet to 1 replica")
+			etcdCluster.Status.Phase = "Failed"
 			return ctrl.Result{}, err
 		}
+		// Successfully scaled to 1, update status fields and requeue to wait for pod readiness
+		etcdCluster.Status.ReadyReplicas = sts.Status.ReadyReplicas
+		etcdCluster.Status.Phase = "Initializing"
+		// return ctrl.Result{RequeueAfter: requeueDuration}, nil // Requeue to check readiness, should we do it?
 	}
 
 	err = createHeadlessServiceIfNotExist(ctx, logger, r.Client, etcdCluster, r.Scheme)
 	if err != nil {
+		logger.Error(err, "Failed to create Headless Service")
+		etcdCluster.Status.Phase = "Failed"
 		return ctrl.Result{}, err
 	}
 
 	logger.Info("Now checking health of the cluster members")
 	memberListResp, healthInfos, err := healthCheck(sts, logger)
 	if err != nil {
+		logger.Error(err, "Health check failed")
+		etcdCluster.Status.Phase = "Degraded" // Or "Unavailable"?
 		return ctrl.Result{}, fmt.Errorf("health check failed: %w", err)
 	}
 
 	memberCnt := 0
 	if memberListResp != nil {
 		memberCnt = len(memberListResp.Members)
 	}
+	etcdCluster.Status.Members = int32(memberCnt)
+	// TODO: Update CurrentVersion from healthInfos
+	// TODO: Update Conditions based on healthInfos
+
 	targetReplica := *sts.Spec.Replicas // Start with the current size of the stateful set
 
 	// The number of replicas in the StatefulSet doesn't match the number of etcd members in the cluster.
@@ -151,6 +196,8 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 			logger.Info("Increasing StatefulSet replicas to match the etcd cluster member count", "oldReplicaCount", targetReplica, "newReplicaCount", newReplicaCount)
 			_, err = reconcileStatefulSet(ctx, logger, etcdCluster, r.Client, newReplicaCount, r.Scheme)
 			if err != nil {
+				logger.Error(err, "Failed to adjust StatefulSet replicas to match member count")
+				etcdCluster.Status.Phase = "Failed"
 				return ctrl.Result{}, err
 			}
 		} else {
@@ -159,9 +206,14 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 			logger.Info("Decreasing StatefulSet replicas to remove the unneeded Pod.", "oldReplicaCount", targetReplica, "newReplicaCount", newReplicaCount)
 			_, err = reconcileStatefulSet(ctx, logger, etcdCluster, r.Client, newReplicaCount, r.Scheme)
 			if err != nil {
+				logger.Error(err, "Failed to adjust StatefulSet replicas to match member count")
+				etcdCluster.Status.Phase = "Failed"
 				return ctrl.Result{}, err
 			}
 		}
+		// Successfully adjusted STS, update status and requeue
+		etcdCluster.Status.ReadyReplicas = sts.Status.ReadyReplicas
+		etcdCluster.Status.Phase = "Scaling"
 		return ctrl.Result{RequeueAfter: requeueDuration}, nil
 	}
 
@@ -175,6 +227,10 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		// Find the leader status
 		_, leaderStatus = etcdutils.FindLeaderStatus(healthInfos, logger)
 		if leaderStatus == nil {
+			err := fmt.Errorf("couldn't find leader, memberCnt: %d", memberCnt)
+			logger.Error(err, "Leader election might be in progress or cluster unhealthy")
+			etcdCluster.Status.Phase = "Degraded" // Or Unavailable
+			// TODO: Add Condition
 			// If the leader is not available, let's wait for the leader to be elected
 			return ctrl.Result{}, fmt.Errorf("couldn't find leader, memberCnt: %d", memberCnt)
 		}
@@ -191,13 +247,18 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 				eps = eps[:(len(eps) - 1)]
 				err = etcdutils.PromoteLearner(eps, learner)
 				if err != nil {
+					logger.Error(err, "Failed to promote learner")
+					etcdCluster.Status.Phase = "Failed" // Promotion failed
+					// TODO: Add Condition
 					// The member is not promoted yet, so we error out
 					return ctrl.Result{}, err
 				}
+				etcdCluster.Status.Phase = "PromotingLearner" // Indicate promotion happened
 			} else {
 				// Learner is not yet ready. We can't add another learner or proceed further until this one is promoted
 				// So let's requeue
 				logger.Info("The learner member isn't ready to be promoted yet", "learnerID", learner)
+				etcdCluster.Status.Phase = "PromotingLearner" // Still trying to promote
 				return ctrl.Result{RequeueAfter: requeueDuration}, nil
 			}
 		}
@@ -218,6 +279,9 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		targetReplica++
 		logger.Info("[Scale out] adding a new learner member to etcd cluster", "peerURLs", peerURL)
 		if _, err := etcdutils.AddMember(eps, []string{peerURL}, true); err != nil {
+			logger.Error(err, "Failed to add learner member")
+			etcdCluster.Status.Phase = "Failed" // Scaling failed
+			// TODO: Add Condition
 			return ctrl.Result{}, err
 		}
 
@@ -232,31 +296,61 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		logger.Info("[Scale in] removing one member", "memberID", memberID)
 		eps = eps[:targetReplica]
 		if err := etcdutils.RemoveMember(eps, memberID); err != nil {
+			logger.Error(err, "Failed to remove member")
+			etcdCluster.Status.Phase = "Failed" // Scaling failed
+			// TODO: Add Condition
 			return ctrl.Result{}, err
 		}
 	}
 
 	sts, err = reconcileStatefulSet(ctx, logger, etcdCluster, r.Client, targetReplica, r.Scheme)
 	if err != nil {
+		logger.Error(err, "Failed to update StatefulSet during scaling")
+		etcdCluster.Status.Phase = "Failed"
 		return ctrl.Result{}, err
 	}
 
 	allMembersHealthy, err := areAllMembersHealthy(sts, logger)
 	if err != nil {
+		logger.Error(err, "Final health check failed")
+		etcdCluster.Status.Phase = "Degraded"
+		// TODO: Add Condition
 		return ctrl.Result{}, err
 	}
 
 	if *sts.Spec.Replicas != int32(etcdCluster.Spec.Size) || !allMembersHealthy {
 		// Requeue if the statefulset size is not equal to the expected size of ETCD cluster
 		// Or if all members of the cluster are not healthy
+		etcdCluster.Status.Phase = "Degraded"
+		// TODO: Add Condition
 		return ctrl.Result{RequeueAfter: requeueDuration}, nil
 	}
 
+	etcdCluster.Status.Phase = "Running" // Final healthy state
+	// TODO: Set Available Condition to True
 	logger.Info("EtcdCluster reconciled successfully")
 	return ctrl.Result{}, nil
 
 }
 
+// updateStatusIfNeeded patches the EtcdCluster status via r.Status() when it differs from oldEtcdCluster's status, and returns any patch error.
+func (r *EtcdClusterReconciler) updateStatusIfNeeded(ctx context.Context, etcdCluster *ecv1alpha1.EtcdCluster, oldEtcdCluster *ecv1alpha1.EtcdCluster) error {
+	logger := log.FromContext(ctx)
+	// Only issue a patch when the status is semantically different from the snapshot in oldEtcdCluster
+	if !equality.Semantic.DeepEqual(oldEtcdCluster.Status, etcdCluster.Status) {
+		logger.Info("Updating EtcdCluster status", "namespace", etcdCluster.Namespace, "name", etcdCluster.Name)
+		err := r.Status().Patch(ctx, etcdCluster, client.MergeFrom(oldEtcdCluster))
+		if err != nil {
+			logger.Error(err, "Failed to update EtcdCluster status", "namespace", etcdCluster.Namespace, "name", etcdCluster.Name)
+			return err // Propagate the patch error; the caller decides whether it leads to a retry
+		}
+		logger.Info("Successfully updated EtcdCluster status", "namespace", etcdCluster.Namespace, "name", etcdCluster.Name)
+	} else {
+		logger.V(1).Info("EtcdCluster status is already up-to-date", "namespace", etcdCluster.Namespace, "name", etcdCluster.Name) // Logged at V(1) only: nothing to patch
+	}
+	return nil // Either the status was unchanged or the patch succeeded
+}
+
 // SetupWithManager sets up the controller with the Manager.
 func (r *EtcdClusterReconciler) SetupWithManager(mgr ctrl.Manager) error {
 	r.Recorder = mgr.GetEventRecorderFor("etcdcluster-controller")