diff --git a/pkg/testsuites/standard_suites.go b/pkg/testsuites/standard_suites.go
index 59d27f9fa9e9..00e16ffdcb10 100644
--- a/pkg/testsuites/standard_suites.go
+++ b/pkg/testsuites/standard_suites.go
@@ -462,4 +462,17 @@ var staticSuites = []ginkgo.TestSuite{
 		},
 		TestTimeout: 120 * time.Minute,
 	},
+	{
+		Name: "openshift/two-node",
+		Description: templates.LongDesc(`
+		This test suite runs tests to validate two-node cluster topologies.
+		`),
+		Matches: func(name string) bool {
+			if isDisabled(name) {
+				return false
+			}
+			return strings.Contains(name, "[Suite:openshift/two-node") || strings.Contains(name, "[FeatureGate:DualReplica]") || strings.Contains(name, "[FeatureGate:HighlyAvailableArbiter]")
+		},
+		TestTimeout: 60 * time.Minute,
+	},
 }
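Illustration (not part of the patch): the new suite selects specs purely by substring match on the full test name, so a spec is picked up when it carries the `[Suite:openshift/two-node` tag or either of the two feature-gate tags. A minimal standalone sketch of that rule, using a hypothetical matchesTwoNodeSuite helper in place of the inline closure above:

package main

import (
	"fmt"
	"strings"
)

// matchesTwoNodeSuite mirrors the Matches closure registered above
// (minus the isDisabled check), for illustration only.
func matchesTwoNodeSuite(name string) bool {
	return strings.Contains(name, "[Suite:openshift/two-node") ||
		strings.Contains(name, "[FeatureGate:DualReplica]") ||
		strings.Contains(name, "[FeatureGate:HighlyAvailableArbiter]")
}

func main() {
	// Matched via the [Suite:openshift/two-node] tag.
	fmt.Println(matchesTwoNodeSuite("[sig-etcd][OCPFeatureGate:DualReplica][Suite:openshift/two-node] Two Node with Fencing etcd recovery"))
	// Not matched: carries neither the suite tag nor a two-node feature-gate tag.
	fmt.Println(matchesTwoNodeSuite("[sig-node] unrelated conformance test"))
}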
diff --git a/test/extended/two_node/arbiter_topology.go b/test/extended/two_node/arbiter_topology.go
index 87f23d0c4837..ee5eb4966480 100644
--- a/test/extended/two_node/arbiter_topology.go
+++ b/test/extended/two_node/arbiter_topology.go
@@ -1,4 +1,4 @@
-package arbiter_topology
+package two_node
 
 import (
 	"context"
@@ -20,11 +20,6 @@ import (
 	"k8s.io/apimachinery/pkg/util/wait"
 )
 
-const (
-	labelNodeRoleMaster  = "node-role.kubernetes.io/master"
-	labelNodeRoleArbiter = "node-role.kubernetes.io/arbiter"
-)
-
 var (
 	defaultExpectedMaxPodCount      = 30
 	expectedMaxPodCountsPerPlatform = map[v1.PlatformType]int{
@@ -33,15 +28,12 @@ var (
 	}
 )
 
-var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter] expected Master and Arbiter node counts", func() {
+var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node] expected Master and Arbiter node counts", func() {
 	defer g.GinkgoRecover()
 	oc := exutil.NewCLIWithoutNamespace("")
 
 	g.BeforeEach(func() {
-		infraStatus := getInfraStatus(oc)
-		if infraStatus.ControlPlaneTopology != v1.HighlyAvailableArbiterMode {
-			g.Skip("Cluster is not in HighlyAvailableArbiterMode skipping test")
-		}
+		skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode)
 	})
 
 	g.It("Should validate that there are Master and Arbiter nodes as specified in the cluster", func() {
@@ -66,7 +58,7 @@ var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:High
 	})
 })
 
-var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter] required pods on the Arbiter node", func() {
+var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node] required pods on the Arbiter node", func() {
 	defer g.GinkgoRecover()
 
 	var (
@@ -75,10 +67,7 @@ var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:High
 	)
 
 	g.BeforeEach(func() {
-		infraStatus = getInfraStatus(oc)
-		if infraStatus.ControlPlaneTopology != v1.HighlyAvailableArbiterMode {
-			g.Skip("Cluster is not in HighlyAvailableArbiterMode skipping test")
-		}
+		skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode)
 	})
 	g.It("Should verify that the correct number of pods are running on the Arbiter node", func() {
 		g.By("inferring platform type")
@@ -110,12 +99,12 @@ var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:High
 	})
 })
 
-var _ = g.Describe("[sig-apps][apigroup:apps.openshift.io][OCPFeatureGate:HighlyAvailableArbiter] Deployments on HighlyAvailableArbiterMode topology", func() {
+var _ = g.Describe("[sig-apps][apigroup:apps.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node] Deployments on HighlyAvailableArbiterMode topology", func() {
 	defer g.GinkgoRecover()
 	oc := exutil.NewCLI("arbiter-pod-validation").SetManagedNamespace().AsAdmin()
 
 	g.BeforeEach(func() {
-		skipNonArbiterCluster(oc)
+		skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode)
 	})
 
 	g.It("should be created on arbiter nodes when arbiter node is selected", func() {
@@ -202,12 +191,12 @@ var _ = g.Describe("[sig-apps][apigroup:apps.openshift.io][OCPFeatureGate:Highly
 	})
 })
 
-var _ = g.Describe("[sig-apps][apigroup:apps.openshift.io][OCPFeatureGate:HighlyAvailableArbiter] Evaluate DaemonSet placement in HighlyAvailableArbiterMode topology", func() {
+var _ = g.Describe("[sig-apps][apigroup:apps.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node] Evaluate DaemonSet placement in HighlyAvailableArbiterMode topology", func() {
 	defer g.GinkgoRecover()
 	oc := exutil.NewCLI("daemonset-pod-validation").SetManagedNamespace().AsAdmin()
 
 	g.BeforeEach(func() {
-		skipNonArbiterCluster(oc)
+		skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode)
 	})
 
 	g.It("should not create a DaemonSet on the Arbiter node", func() {
@@ -252,12 +241,12 @@ var _ = g.Describe("[sig-apps][apigroup:apps.openshift.io][OCPFeatureGate:Highly
 	})
 })
 
-var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter] Ensure etcd health and quorum in HighlyAvailableArbiterMode", func() {
+var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node] Ensure etcd health and quorum in HighlyAvailableArbiterMode", func() {
 	defer g.GinkgoRecover()
 	oc := exutil.NewCLIWithoutNamespace("").AsAdmin()
 
 	g.BeforeEach(func() {
-		skipNonArbiterCluster(oc)
+		skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode)
 	})
 
 	g.It("should have all etcd pods running and quorum met", func() {
@@ -454,17 +443,3 @@ func isClusterOperatorDegraded(operator *v1.ClusterOperator) bool {
 	}
 	return false
 }
-
-func skipNonArbiterCluster(oc *exutil.CLI) {
-	infraStatus := getInfraStatus(oc)
-	if infraStatus.ControlPlaneTopology != v1.HighlyAvailableArbiterMode {
-		g.Skip("Cluster is not in HighlyAvailableArbiterMode, skipping test")
-	}
-}
-
-func getInfraStatus(oc *exutil.CLI) v1.InfrastructureStatus {
-	infra, err := oc.AdminConfigClient().ConfigV1().Infrastructures().Get(context.Background(),
-		"cluster", metav1.GetOptions{})
-	o.Expect(err).NotTo(o.HaveOccurred())
-	return infra.Status
-}
diff --git a/test/extended/two_node/common.go b/test/extended/two_node/common.go
new file mode 100644
index 000000000000..07c8bf364d2e
--- /dev/null
+++ b/test/extended/two_node/common.go
@@ -0,0 +1,26 @@
+package two_node
+
+import (
+	"fmt"
+
+	v1 "github.com/openshift/api/config/v1"
+	exutil "github.com/openshift/origin/test/extended/util"
+	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
+)
+
+const (
+	labelNodeRoleMaster       = "node-role.kubernetes.io/master"
+	labelNodeRoleControlPlane = "node-role.kubernetes.io/control-plane"
+	labelNodeRoleWorker       = "node-role.kubernetes.io/worker"
+	labelNodeRoleArbiter      = "node-role.kubernetes.io/arbiter"
+)
+
+func skipIfNotTopology(oc *exutil.CLI, wanted v1.TopologyMode) {
+	current, err := exutil.GetControlPlaneTopology(oc)
+	if err != nil {
+		e2eskipper.Skip(fmt.Sprintf("Could not get current topology, skipping test: error %v", err))
+	}
+	if *current != wanted {
+		e2eskipper.Skip(fmt.Sprintf("Cluster is not in %v topology, skipping test", wanted))
+	}
+}
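Illustration (not part of the patch): any future spec added to the two_node package is expected to gate itself on the control-plane topology in a BeforeEach, exactly as the suites in this diff do. A minimal hypothetical placeholder spec showing the pattern with the new skipIfNotTopology helper:

package two_node

import (
	g "github.com/onsi/ginkgo/v2"
	v1 "github.com/openshift/api/config/v1"
	exutil "github.com/openshift/origin/test/extended/util"
)

// Hypothetical placeholder spec: it is skipped automatically on clusters that
// are not running the DualReplica (two-node with fencing) topology.
var _ = g.Describe("[sig-node][OCPFeatureGate:DualReplica][Suite:openshift/two-node] example placeholder", func() {
	defer g.GinkgoRecover()
	oc := exutil.NewCLIWithoutNamespace("")

	g.BeforeEach(func() {
		skipIfNotTopology(oc, v1.DualReplicaTopologyMode)
	})

	g.It("runs only on DualReplica clusters", func() {
		// Real assertions would go here.
	})
})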
diff --git a/test/extended/two_node/tnf_recovery.go b/test/extended/two_node/tnf_recovery.go
new file mode 100644
index 000000000000..ca4095b8003d
--- /dev/null
+++ b/test/extended/two_node/tnf_recovery.go
@@ -0,0 +1,196 @@
+package two_node
+
+import (
+	"context"
+	"fmt"
+	"math/rand"
+	"slices"
+	"time"
+
+	g "github.com/onsi/ginkgo/v2"
+	o "github.com/onsi/gomega"
+	v1 "github.com/openshift/api/config/v1"
+	"github.com/openshift/origin/test/extended/etcd/helpers"
+	exutil "github.com/openshift/origin/test/extended/util"
+	"github.com/pkg/errors"
+	"go.etcd.io/etcd/api/v3/etcdserverpb"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node] Two Node with Fencing etcd recovery", func() {
+	defer g.GinkgoRecover()
+
+	var (
+		oc                = exutil.NewCLIWithoutNamespace("").AsAdmin()
+		etcdClientFactory *helpers.EtcdClientFactoryImpl
+		nodeA, nodeB      corev1.Node
+	)
+
+	g.BeforeEach(func() {
+		skipIfNotTopology(oc, v1.DualReplicaTopologyMode)
+
+		nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
+		o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
+		o.Expect(len(nodes.Items)).To(o.BeNumerically("==", 2), "Expected to find 2 Nodes only")
+
+		// Select the first index randomly
+		randomIndex := rand.Intn(len(nodes.Items))
+		nodeA = nodes.Items[randomIndex]
+		// Select the remaining index
+		nodeB = nodes.Items[(randomIndex+1)%len(nodes.Items)]
+		g.GinkgoT().Printf("Randomly selected %s (%s) to be gracefully shut down and %s (%s) to take the lead\n", nodeB.Name, nodeB.Status.Addresses[0].Address, nodeA.Name, nodeA.Status.Addresses[0].Address)
+
+		kubeClient := oc.KubeClient()
+		etcdClientFactory = helpers.NewEtcdClientFactory(kubeClient)
+
+		g.GinkgoT().Printf("Ensure both nodes are healthy before starting the test\n")
+		o.Eventually(func() error {
+			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, nodeA.Name)
+		}, time.Minute, 5*time.Second).ShouldNot(o.HaveOccurred(), "expect to ensure Node A healthy without error")
+
+		o.Eventually(func() error {
+			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, nodeB.Name)
+		}, time.Minute, 5*time.Second).ShouldNot(o.HaveOccurred(), "expect to ensure Node B healthy without error")
+	})
+
+	g.It("Should support a graceful node shutdown", func() {
+		msg := fmt.Sprintf("Shutting down %s gracefully in 1 minute", nodeB.Name)
+		g.By(msg)
+		// NOTE: Using `shutdown` alone would cause the node to be permanently removed from the cluster.
+		// To prevent this, we use the `--reboot` flag, which ensures a graceful shutdown and allows the
+		// node to rejoin the cluster upon restart. A one-minute delay is added to give the debug node
+		// sufficient time to cleanly exit before the shutdown process completes.
+		_, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, nodeB.Name, "openshift-etcd", "shutdown", "--reboot", "+1")
+		o.Expect(err).To(o.BeNil(), "Expected to gracefully shutdown the node without errors")
+		time.Sleep(time.Minute)
+
+		msg = fmt.Sprintf("Ensuring %s leaves the member list", nodeB.Name)
+		g.By(msg)
+		o.Eventually(func() error {
+			return helpers.EnsureMemberRemoved(g.GinkgoT(), etcdClientFactory, nodeB.Name)
+		}, 5*time.Minute, 30*time.Second).ShouldNot(o.HaveOccurred())
+
+		msg = fmt.Sprintf("Ensuring that %s is a healthy voting member and adds %s back as learner", nodeA.Name, nodeB.Name)
+		g.By(msg)
+		o.Eventually(func() error {
+			members, err := getMembers(etcdClientFactory)
+			if err != nil {
+				return err
+			}
+			if len(members) != 2 {
+				return fmt.Errorf("Not enough members")
+			}
+
+			if started, learner, err := getMemberState(&nodeA, members); err != nil {
+				return err
+			} else if !started || learner {
+				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
+			}
+
+			// Ensure the gracefully shut down node has been re-added as an unstarted learner member
+			if started, learner, err := getMemberState(&nodeB, members); err != nil {
+				return err
+			} else if started || !learner {
+				return fmt.Errorf("Expected node: %s to be an unstarted learner member. Membership: %+v", nodeB.Name, members)
+			}
+
+			g.GinkgoT().Logf("membership: %+v", members)
+			return nil
+		}, 2*time.Minute, 15*time.Second).ShouldNot(o.HaveOccurred())
+
+		msg = fmt.Sprintf("Ensuring %s rejoins as learner", nodeB.Name)
+		g.By(msg)
+		o.Eventually(func() error {
+			members, err := getMembers(etcdClientFactory)
+			if err != nil {
+				return err
+			}
+			if len(members) != 2 {
+				return fmt.Errorf("Not enough members")
+			}
+
+			if started, learner, err := getMemberState(&nodeA, members); err != nil {
+				return err
+			} else if !started || learner {
+				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
+			}
+
+			if started, learner, err := getMemberState(&nodeB, members); err != nil {
+				return err
+			} else if !started || !learner {
+				return fmt.Errorf("Expected node: %s to be a started learner member. Membership: %+v", nodeB.Name, members)
+			}
+
+			g.GinkgoT().Logf("membership: %+v", members)
+			return nil
+		}, 10*time.Minute, 15*time.Second).ShouldNot(o.HaveOccurred())
+
+		msg = fmt.Sprintf("Ensuring %s node is promoted back as voting member", nodeB.Name)
+		g.By(msg)
+		o.Eventually(func() error {
+			members, err := getMembers(etcdClientFactory)
+			if err != nil {
+				return err
+			}
+			if len(members) != 2 {
+				return fmt.Errorf("Not enough members")
+			}
+
+			if started, learner, err := getMemberState(&nodeA, members); err != nil {
+				return err
+			} else if !started || learner {
+				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
+			}
+
+			if started, learner, err := getMemberState(&nodeB, members); err != nil {
+				return err
+			} else if !started || learner {
+				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeB.Name, members)
+			}
+
+			g.GinkgoT().Logf("membership: %+v", members)
+			return nil
+		}, 10*time.Minute, 15*time.Second).ShouldNot(o.HaveOccurred())
+	})
+})
+
+func getMembers(etcdClientFactory helpers.EtcdClientCreator) ([]*etcdserverpb.Member, error) {
+	etcdClient, closeFn, err := etcdClientFactory.NewEtcdClient()
+	if err != nil {
+		return []*etcdserverpb.Member{}, errors.Wrap(err, "could not get an etcd client")
+	}
+	defer closeFn()
+
+	ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second)
+	defer cancel()
+	m, err := etcdClient.MemberList(ctx)
+	if err != nil {
+		return []*etcdserverpb.Member{}, errors.Wrap(err, "could not get the member list")
+	}
+	return m.Members, nil
+}
+
+func getMemberState(node *corev1.Node, members []*etcdserverpb.Member) (started, learner bool, err error) {
+	// Etcd members that have been added to the member list but haven't
+	// joined yet will have an empty Name field. We can match them via Peer URL.
+	peerURL := fmt.Sprintf("https://%s:2380", node.Status.Addresses[0].Address)
+	var found bool
+	for _, m := range members {
+		if m.Name == node.Name {
+			found = true
+			started = true
+			learner = m.IsLearner
+			break
+		}
+		if slices.Contains(m.PeerURLs, peerURL) {
+			found = true
+			learner = m.IsLearner
+			break
+		}
+	}
+	if !found {
+		return false, false, fmt.Errorf("could not find node %v", node.Name)
+	}
+	return started, learner, nil
+}
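Illustration (not part of the patch): getMemberState matches a started member by its Name and a freshly added, unstarted member — whose Name is still empty — by its peer URL. A hypothetical unit-test sketch of those two cases (the node name and IP below are made up):

package two_node

import (
	"testing"

	"go.etcd.io/etcd/api/v3/etcdserverpb"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestGetMemberState(t *testing.T) {
	node := &corev1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: "master-0"},
		Status: corev1.NodeStatus{
			Addresses: []corev1.NodeAddress{{Type: corev1.NodeInternalIP, Address: "192.0.2.10"}},
		},
	}

	// Started voting member: matched by name.
	started, learner, err := getMemberState(node, []*etcdserverpb.Member{
		{Name: "master-0", PeerURLs: []string{"https://192.0.2.10:2380"}},
	})
	if err != nil || !started || learner {
		t.Fatalf("expected started voting member, got started=%v learner=%v err=%v", started, learner, err)
	}

	// Unstarted learner: empty Name, matched by peer URL.
	started, learner, err = getMemberState(node, []*etcdserverpb.Member{
		{PeerURLs: []string{"https://192.0.2.10:2380"}, IsLearner: true},
	})
	if err != nil || started || !learner {
		t.Fatalf("expected unstarted learner, got started=%v learner=%v err=%v", started, learner, err)
	}
}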
diff --git a/test/extended/two_node/tnf_topology.go b/test/extended/two_node/tnf_topology.go
new file mode 100644
index 000000000000..ae62834cf656
--- /dev/null
+++ b/test/extended/two_node/tnf_topology.go
@@ -0,0 +1,117 @@
+package two_node
+
+import (
+	"context"
+	"fmt"
+
+	g "github.com/onsi/ginkgo/v2"
+	o "github.com/onsi/gomega"
+	v1 "github.com/openshift/api/config/v1"
+	exutil "github.com/openshift/origin/test/extended/util"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+const ensurePodmanEtcdContainerIsRunning = "podman inspect --format '{{.State.Running}}' etcd"
+
+var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node] Two Node with Fencing topology", func() {
+	defer g.GinkgoRecover()
+	var (
+		oc = exutil.NewCLIWithoutNamespace("")
+	)
+
+	g.BeforeEach(func() {
+		skipIfNotTopology(oc, v1.DualReplicaTopologyMode)
+	})
+
+	g.It("Should validate the number of control-planes, workers and arbiters as configured", func() {
+		const (
+			expectedTotalNodes    = 2
+			expectedControlPlanes = 2
+			expectedWorkers       = 2 // CPs will also have the Workers label
+			expectedArbiters      = 0
+		)
+
+		g.By(fmt.Sprintf("Ensuring only %d nodes in the cluster: %d CP/Workers and %d Arbiter nodes", expectedTotalNodes, expectedControlPlanes, expectedArbiters))
+		nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
+		o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
+		o.Expect(len(nodes.Items)).To(o.Equal(expectedTotalNodes), fmt.Sprintf("Expected %d Nodes, found %d", expectedTotalNodes, len(nodes.Items)))
+
+		controlPlaneNodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
+			LabelSelector: labelNodeRoleControlPlane,
+		})
+		o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve control-plane nodes without error")
+		o.Expect(len(controlPlaneNodes.Items)).To(o.Equal(expectedControlPlanes), fmt.Sprintf("Expected %d Control-plane Nodes, found %d", expectedControlPlanes, len(controlPlaneNodes.Items)))
+
+		workerNodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
+			LabelSelector: labelNodeRoleWorker,
+		})
+		o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve worker nodes without error")
+		o.Expect(len(workerNodes.Items)).To(o.Equal(expectedWorkers), fmt.Sprintf("Expected %d Worker Nodes, found %d", expectedWorkers, len(workerNodes.Items)))
+
+		arbiterNodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
+			LabelSelector: labelNodeRoleArbiter,
+		})
+		o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve arbiter nodes without error")
+		o.Expect(len(arbiterNodes.Items)).To(o.Equal(expectedArbiters), fmt.Sprintf("Expected %d Arbiter Nodes, found %d", expectedArbiters, len(arbiterNodes.Items)))
+	})
+})
+
+var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node] Two Node with Fencing pods and podman containers", func() {
+	defer g.GinkgoRecover()
+	var (
+		oc    = exutil.NewCLIWithoutNamespace("")
+		nodes *corev1.NodeList
+	)
+
+	g.BeforeEach(func() {
+		skipIfNotTopology(oc, v1.DualReplicaTopologyMode)
+
+		var err error
+		nodes, err = oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
+		o.Expect(err).To(o.BeNil(), "Expected to retrieve all nodes without error")
+	})
+	g.It("Should validate the number of etcd pods and containers as configured", func() {
+		const (
+			expectedEtcdPod           = 2
+			expectedEtcdCtlContainers = 2
+			expectedEtcdContainers    = 0
+		)
+
+		nodeNameA := nodes.Items[0].Name
+		nodeNameB := nodes.Items[1].Name
+
+		g.By("Ensuring 2 etcd pods with 2 etcdctl containers and 0 etcd containers are running in the cluster")
+		pods, err := oc.AdminKubeClient().CoreV1().Pods("openshift-etcd").List(context.Background(), metav1.ListOptions{})
+		o.Expect(err).To(o.BeNil(), "Expected to retrieve etcd pods in openshift-etcd namespace without error")
+
+		etcdPodCount := 0
+		etcdContainerCount := 0
+		etcdctlContainerCount := 0
+		for _, pod := range pods.Items {
+			if pod.Name == "etcd-"+nodeNameA || pod.Name == "etcd-"+nodeNameB {
+				etcdPodCount += 1
+				for _, container := range pod.Spec.Containers {
+					if container.Name == "etcd" {
+						etcdContainerCount += 1
+					}
+					if container.Name == "etcdctl" {
+						etcdctlContainerCount += 1
+					}
+				}
+			}
+		}
+		o.Expect(etcdPodCount).To(o.Equal(expectedEtcdPod))
+		o.Expect(etcdctlContainerCount).To(o.Equal(expectedEtcdCtlContainers))
+		o.Expect(etcdContainerCount).To(o.Equal(expectedEtcdContainers))
+	})
+
+	g.It("Should verify the number of podman-etcd containers as configured", func() {
+		g.By("Ensuring one podman etcd container is running on each node")
+		for _, node := range nodes.Items {
+			got, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, node.Name, "openshift-etcd", "podman", "inspect", "--format", "'{{.State.Running}}'", "etcd")
+			o.Expect(err).To(o.BeNil(), fmt.Sprintf("expected to call podman without errors on Node %s: error %v", node.Name, err))
+			o.Expect(got).To(o.Equal("'true'"), fmt.Sprintf("expected a podman etcd container running on Node %s: got running %s", node.Name, got))
+		}
+	})
+})
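Possible follow-up, sketched only (not part of the patch): the three list-by-label-and-count blocks in the topology spec above could be collapsed into one helper; expectNodeCountWithLabel below is a hypothetical name.

package two_node

import (
	"context"
	"fmt"

	o "github.com/onsi/gomega"
	exutil "github.com/openshift/origin/test/extended/util"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// expectNodeCountWithLabel lists nodes by label selector and asserts the
// expected count; it would replace the near-identical control-plane, worker
// and arbiter blocks above.
func expectNodeCountWithLabel(oc *exutil.CLI, selector string, expected int) {
	nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
		LabelSelector: selector,
	})
	o.Expect(err).ShouldNot(o.HaveOccurred(), fmt.Sprintf("Expected to retrieve nodes with label %q without error", selector))
	o.Expect(len(nodes.Items)).To(o.Equal(expected), fmt.Sprintf("Expected %d nodes with label %q, found %d", expected, selector, len(nodes.Items)))
}

The role checks would then read, for example, expectNodeCountWithLabel(oc, labelNodeRoleControlPlane, expectedControlPlanes).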