Initial tests for Two Nodes OCP with Fencing (TNF) cluster #29833

Draft
wants to merge 4 commits into base: main

13 changes: 13 additions & 0 deletions pkg/testsuites/standard_suites.go
@@ -462,4 +462,17 @@ var staticSuites = []ginkgo.TestSuite{
},
TestTimeout: 120 * time.Minute,
},
{
Name: "openshift/two-node",
Description: templates.LongDesc(`
This test suite runs tests to validate two-node topologies.
`),
Matches: func(name string) bool {
if isDisabled(name) {
return false
}
return strings.Contains(name, "[Suite:openshift/two-node") || strings.Contains(name, "[FeatureGate:DualReplica]") || strings.Contains(name, "[FeatureGate:HighlyAvailableArbiter]")
},
TestTimeout: 60 * time.Minute,
},
}
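Assuming the usual openshift-tests entry point for the suites defined in staticSuites (a workflow assumption, not something stated in this PR), the new suite would be selected by name, for example:

    openshift-tests run openshift/two-node

The Matches function above then pulls in any test tagged [Suite:openshift/two-node], [FeatureGate:DualReplica], or [FeatureGate:HighlyAvailableArbiter] that is not explicitly disabled.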
47 changes: 11 additions & 36 deletions test/extended/two_node/arbiter_topology.go
@@ -1,4 +1,4 @@
package arbiter_topology
package two_node

import (
"context"
@@ -20,11 +20,6 @@ import (
"k8s.io/apimachinery/pkg/util/wait"
)

const (
labelNodeRoleMaster = "node-role.kubernetes.io/master"
labelNodeRoleArbiter = "node-role.kubernetes.io/arbiter"
)

var (
defaultExpectedMaxPodCount = 30
expectedMaxPodCountsPerPlatform = map[v1.PlatformType]int{
@@ -33,15 +28,12 @@
}
)

var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter] expected Master and Arbiter node counts", func() {
var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node] expected Master and Arbiter node counts", func() {
defer g.GinkgoRecover()
oc := exutil.NewCLIWithoutNamespace("")

g.BeforeEach(func() {
infraStatus := getInfraStatus(oc)
if infraStatus.ControlPlaneTopology != v1.HighlyAvailableArbiterMode {
g.Skip("Cluster is not in HighlyAvailableArbiterMode skipping test")
}
skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode)
})

g.It("Should validate that there are Master and Arbiter nodes as specified in the cluster", func() {
@@ -66,7 +58,7 @@
})
})

var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter] required pods on the Arbiter node", func() {
var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node] required pods on the Arbiter node", func() {
defer g.GinkgoRecover()

var (
@@ -75,10 +67,7 @@
)

g.BeforeEach(func() {
infraStatus = getInfraStatus(oc)
if infraStatus.ControlPlaneTopology != v1.HighlyAvailableArbiterMode {
g.Skip("Cluster is not in HighlyAvailableArbiterMode skipping test")
}
skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode)
})
g.It("Should verify that the correct number of pods are running on the Arbiter node", func() {
g.By("inferring platform type")
@@ -110,12 +99,12 @@
})
})

var _ = g.Describe("[sig-apps][apigroup:apps.openshift.io][OCPFeatureGate:HighlyAvailableArbiter] Deployments on HighlyAvailableArbiterMode topology", func() {
var _ = g.Describe("[sig-apps][apigroup:apps.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node] Deployments on HighlyAvailableArbiterMode topology", func() {
defer g.GinkgoRecover()

oc := exutil.NewCLI("arbiter-pod-validation").SetManagedNamespace().AsAdmin()
g.BeforeEach(func() {
skipNonArbiterCluster(oc)
skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode)
})

g.It("should be created on arbiter nodes when arbiter node is selected", func() {
@@ -202,12 +191,12 @@
})
})

var _ = g.Describe("[sig-apps][apigroup:apps.openshift.io][OCPFeatureGate:HighlyAvailableArbiter] Evaluate DaemonSet placement in HighlyAvailableArbiterMode topology", func() {
var _ = g.Describe("[sig-apps][apigroup:apps.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node] Evaluate DaemonSet placement in HighlyAvailableArbiterMode topology", func() {
defer g.GinkgoRecover()
oc := exutil.NewCLI("daemonset-pod-validation").SetManagedNamespace().AsAdmin()

g.BeforeEach(func() {
skipNonArbiterCluster(oc)
skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode)
})

g.It("should not create a DaemonSet on the Arbiter node", func() {
@@ -252,12 +241,12 @@
})
})

var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter] Ensure etcd health and quorum in HighlyAvailableArbiterMode", func() {
var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node] Ensure etcd health and quorum in HighlyAvailableArbiterMode", func() {
defer g.GinkgoRecover()
oc := exutil.NewCLIWithoutNamespace("").AsAdmin()

g.BeforeEach(func() {
skipNonArbiterCluster(oc)
skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode)
})

g.It("should have all etcd pods running and quorum met", func() {
@@ -454,17 +443,3 @@ func isClusterOperatorDegraded(operator *v1.ClusterOperator) bool {
}
return false
}

func skipNonArbiterCluster(oc *exutil.CLI) {
infraStatus := getInfraStatus(oc)
if infraStatus.ControlPlaneTopology != v1.HighlyAvailableArbiterMode {
g.Skip("Cluster is not in HighlyAvailableArbiterMode, skipping test")
}
}

func getInfraStatus(oc *exutil.CLI) v1.InfrastructureStatus {
infra, err := oc.AdminConfigClient().ConfigV1().Infrastructures().Get(context.Background(),
"cluster", metav1.GetOptions{})
o.Expect(err).NotTo(o.HaveOccurred())
return infra.Status
}
26 changes: 26 additions & 0 deletions test/extended/two_node/common.go
@@ -0,0 +1,26 @@
package two_node

import (
"fmt"

v1 "github.com/openshift/api/config/v1"
exutil "github.com/openshift/origin/test/extended/util"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
)

const (
labelNodeRoleMaster = "node-role.kubernetes.io/master"
labelNodeRoleControlPlane = "node-role.kubernetes.io/control-plane"
labelNodeRoleWorker = "node-role.kubernetes.io/worker"
labelNodeRoleArbiter = "node-role.kubernetes.io/arbiter"
)

func skipIfNotTopology(oc *exutil.CLI, wanted v1.TopologyMode) {
current, err := exutil.GetControlPlaneTopology(oc)
if err != nil {
e2eskipper.Skip(fmt.Sprintf("Could not get current topology, skipping test: error %v", err))

Review comment (Contributor): This may be a little strange, but I think we should default to running the test when we don't know the topology. The reason is - this will likely lead to the tests running and failing, which will help us identify misconfigured clusters. We want to avoid failing silently. (A minimal sketch of that fallback follows this file's diff.)

}
if *current != wanted {
e2eskipper.Skip(fmt.Sprintf("Cluster is not in %v topology, skipping test", wanted))
}
}
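
A minimal sketch of the fallback the reviewer describes above, shown as an assumption rather than part of this PR: on a topology-detection error the helper logs and lets the test run (so a misconfigured cluster fails loudly) instead of skipping. The helper name is hypothetical; framework.Logf would need the additional k8s.io/kubernetes/test/e2e/framework import.

func skipIfNotTopologyOrRunAnyway(oc *exutil.CLI, wanted v1.TopologyMode) {
	current, err := exutil.GetControlPlaneTopology(oc)
	if err != nil {
		// Hypothetical fallback: do not skip on an unknown topology so that a
		// misconfigured cluster surfaces as a test failure, not a silent skip.
		framework.Logf("could not determine control-plane topology, running the test anyway: %v", err)
		return
	}
	if *current != wanted {
		e2eskipper.Skip(fmt.Sprintf("Cluster is not in %v topology, skipping test", wanted))
	}
}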
196 changes: 196 additions & 0 deletions test/extended/two_node/tnf_recovery.go
@@ -0,0 +1,196 @@
package two_node

import (
"context"
"fmt"
"math/rand"
"slices"
"time"

g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"
v1 "github.com/openshift/api/config/v1"
"github.com/openshift/origin/test/extended/etcd/helpers"
exutil "github.com/openshift/origin/test/extended/util"
"github.com/pkg/errors"
"go.etcd.io/etcd/api/v3/etcdserverpb"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node] Two Node with Fencing etcd recovery", func() {
defer g.GinkgoRecover()

var (
oc = exutil.NewCLIWithoutNamespace("").AsAdmin()
etcdClientFactory *helpers.EtcdClientFactoryImpl
nodeA, nodeB corev1.Node
)

g.BeforeEach(func() {
skipIfNotTopology(oc, v1.DualReplicaTopologyMode)

nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
o.Expect(len(nodes.Items)).To(o.BeNumerically("==", 2), "Expected to find 2 Nodes only")

Review comment (Contributor): Should we add a filter for control-plane nodes? I'm imagining a future where we're asked to support/test compute nodes.

Review comment (Contributor): It might also be good to verify that there are 0 arbiter nodes.

Review comment (Contributor): Looking below, we have another test for that. :) So I think just verifying 2 control-plane nodes should be sufficient. (A sketch of the control-plane filter follows this BeforeEach block.)

// Select the first index randomly
randomIndex := rand.Intn(len(nodes.Items))
nodeA = nodes.Items[randomIndex]
// Select the remaining index
nodeB = nodes.Items[(randomIndex+1)%len(nodes.Items)]
g.GinkgoT().Printf("Randomly selected %s (%s) to be gracefully shut down and %s (%s) to take the lead\n", nodeB.Name, nodeB.Status.Addresses[0].Address, nodeA.Name, nodeA.Status.Addresses[0].Address)

kubeClient := oc.KubeClient()
etcdClientFactory = helpers.NewEtcdClientFactory(kubeClient)

g.GinkgoT().Printf("Ensure both nodes are healthy before starting the test\n")
o.Eventually(func() error {
return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, nodeA.Name)
}, time.Minute, 5*time.Second).ShouldNot(o.HaveOccurred(), "expect to ensure Node A healthy without error")

o.Eventually(func() error {
return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, nodeB.Name)
}, time.Minute, 5*time.Second).ShouldNot(o.HaveOccurred(), "expect to ensure Node B healthy without error")
})
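
A minimal sketch of the control-plane filter suggested in the review thread above, reusing the labelNodeRoleControlPlane constant from common.go; the exact selector and assertion wording are assumptions, not part of this PR:

	// List only nodes that carry the control-plane role label (existence selector).
	nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
		LabelSelector: labelNodeRoleControlPlane,
	})
	o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve control-plane nodes without error")
	o.Expect(len(nodes.Items)).To(o.BeNumerically("==", 2), "Expected to find exactly 2 control-plane Nodes")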

g.It("Should support a graceful node shutdown", func() {
msg := fmt.Sprintf("Shutting down %s gracefully in 1 minute", nodeB.Name)
g.By(msg)
// NOTE: Using `shutdown` alone would cause the node to be permanently removed from the cluster.
// To prevent this, we use the `--reboot` flag, which ensures a graceful shutdown and allows the
// node to rejoin the cluster upon restart. A one-minute delay is added to give the debug node
// sufficient time to cleanly exit before the shutdown process completes.
_, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, nodeB.Name, "openshift-etcd", "shutdown", "--reboot", "+1")
o.Expect(err).To(o.BeNil(), "Expected to gracefully shutdown the node without errors")
time.Sleep(time.Minute)

msg = fmt.Sprintf("Ensuring %s leaves the member list", nodeB.Name)
g.By(msg)
o.Eventually(func() error {
return helpers.EnsureMemberRemoved(g.GinkgoT(), etcdClientFactory, nodeB.Name)
}, 5*time.Minute, 30*time.Second).ShouldNot(o.HaveOccurred())

msg = fmt.Sprintf("Ensuring that %s is a healthy voting member and adds %s back as learner", nodeA.Name, nodeB.Name)
g.By(msg)
o.Eventually(func() error {
members, err := getMembers(etcdClientFactory)
if err != nil {
return err
}
if len(members) != 2 {
return fmt.Errorf("Not enough members")
}

if started, learner, err := getMemberState(&nodeA, members); err != nil {
return err
} else if !started || learner {
return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
}

// Ensure the gracefully shut down node is unstarted and has been re-added as a learner (i.e. !started && learner)

Review comment (Contributor): I am confused by this comment. Wouldn't !learner mean that it's not a learner?

if started, learner, err := getMemberState(&nodeB, members); err != nil {
return err
} else if started || !learner {
return fmt.Errorf("Expected node: %s to be a unstarted and learning member. Membership: %+v", nodeB.Name, members)
}

g.GinkgoT().Logf("membership: %+v", members)
return nil
}, 2*time.Minute, 15*time.Second).ShouldNot(o.HaveOccurred())

Review comment (Contributor): We may want to pull these timing values out to a shared place in the file to keep them consistent with our non-graceful shutdown test - or even just to quickly be able to adjust timeouts and check frequency across the test suite. (A sketch of such shared constants follows this Describe block.)


msg = fmt.Sprintf("Ensuring %s rejoins as learner", nodeB.Name)
g.By(msg)
o.Eventually(func() error {
members, err := getMembers(etcdClientFactory)
if err != nil {
return err
}
if len(members) != 2 {
return fmt.Errorf("Not enough members")
}

if started, learner, err := getMemberState(&nodeA, members); err != nil {
return err
} else if !started || learner {
return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
}

if started, learner, err := getMemberState(&nodeB, members); err != nil {
return err
} else if !started || !learner {
return fmt.Errorf("Expected node: %s to be a started and learner member. Membership: %+v", nodeB.Name, members)
}

g.GinkgoT().Logf("membership: %+v", members)
return nil
}, 10*time.Minute, 15*time.Second).ShouldNot(o.HaveOccurred())

msg = fmt.Sprintf("Ensuring %s node is promoted back as voting member", nodeB.Name)
g.By(msg)
o.Eventually(func() error {
members, err := getMembers(etcdClientFactory)
if err != nil {
return err
}
if len(members) != 2 {
return fmt.Errorf("Not enough members")
}

if started, learner, err := getMemberState(&nodeA, members); err != nil {
return err
} else if !started || learner {
return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
}

if started, learner, err := getMemberState(&nodeB, members); err != nil {
return err
} else if !started || learner {
return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeB.Name, members)
}

g.GinkgoT().Logf("membership: %+v", members)
return nil
}, 10*time.Minute, 15*time.Second).ShouldNot(o.HaveOccurred())
})
})
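
A minimal sketch of the shared timing values suggested in the review comment above; the constant names are hypothetical and the durations simply mirror the literals used in the Eventually calls in this test:

const (
	// Hypothetical shared polling values for the two-node recovery tests.
	memberRemovalTimeout = 5 * time.Minute
	memberRemovalPoll    = 30 * time.Second
	membershipTimeout    = 10 * time.Minute
	membershipPoll       = 15 * time.Second
)

Each o.Eventually call could then take, for example, membershipTimeout and membershipPoll instead of repeating literal durations.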

func getMembers(etcdClientFactory helpers.EtcdClientCreator) ([]*etcdserverpb.Member, error) {
etcdClient, closeFn, err := etcdClientFactory.NewEtcdClient()
if err != nil {
return []*etcdserverpb.Member{}, errors.Wrap(err, "could not get a etcd client")
}
defer closeFn()

ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second)
defer cancel()
m, err := etcdClient.MemberList(ctx)
if err != nil {
return []*etcdserverpb.Member{}, errors.Wrap(err, "could not get the member list")
}
return m.Members, nil
}

func getMemberState(node *corev1.Node, members []*etcdserverpb.Member) (started, learner bool, err error) {
// Etcd members that have been added to the member list but haven't
// joined yet will have an empty Name field. We can match them via Peer URL.
peerURL := fmt.Sprintf("https://%s:2380", node.Status.Addresses[0].Address)
var found bool
for _, m := range members {
if m.Name == node.Name {
found = true
started = true
learner = m.IsLearner
break
}
if slices.Contains(m.PeerURLs, peerURL) {
found = true
learner = m.IsLearner
break
}
}
if !found {
return false, false, fmt.Errorf("could not find node %v", node.Name)
}
return started, learner, nil
}