Skip to content

OCPBUGS-56736: Improve error messages for project Delete errors #520

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 58 additions & 1 deletion pkg/project/apiserver/registry/project/proxy/proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@ package proxy
import (
"context"
"fmt"
"time"

kerrors "k8s.io/apimachinery/pkg/api/errors"
metainternal "k8s.io/apimachinery/pkg/apis/meta/internalversion"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apimachinery/pkg/watch"
apirequest "k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/apiserver/pkg/registry/rest"
Expand Down Expand Up @@ -208,11 +210,66 @@ func (s *REST) Update(ctx context.Context, name string, objInfo rest.UpdatedObje

var _ = rest.GracefulDeleter(&REST{})

// maxRetriesOnConflict is the maximum retry count for Delete calls which
// result in resource conflicts.
const maxRetriesOnConflict = 10

// maxDuration set max duration of delete retries. Deleting a project affects apiserver latency,
// so this should be kept as small as possible
const maxDuration = time.Second

// Delete deletes a Project specified by its name
func (s *REST) Delete(ctx context.Context, name string, objectFunc rest.ValidateObjectFunc, options *metav1.DeleteOptions) (runtime.Object, bool, error) {
var opts metav1.DeleteOptions
if options != nil {
opts = *options
}
return &metav1.Status{Status: metav1.StatusSuccess}, false, s.client.Delete(ctx, name, opts)
var lastErr error
err := wait.ExponentialBackoffWithContext(ctx, wait.Backoff{Steps: maxRetriesOnConflict, Duration: maxDuration}, func(ctx context.Context) (bool, error) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I'm understanding the Godoc for Duration correctly, this means that we will sleep for one second between retries. That seems high to me. I bet it is a lot longer than a typical total latency of both namespace requests combined.

We can configure the other fields for exponential backoff so that initial retry is fairly fast.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, right, somehow I though "Duration" is max duration we're allowed to spend. I think wait.Backoff{Steps: maxRetriesOnConflict, Factor: 1/maxRetriesOnConflict, Cap: maxDuration, Duration: maxDuration/maxRetriesOnConflict} would make it "up to 1 second" and ensure it has several retries

var err error
if objectFunc != nil {
var obj runtime.Object
obj, err = s.Get(ctx, name, &metav1.GetOptions{})
if err != nil {
lastErr = fmt.Errorf("unable to get project: %w", err)
return false, nil
}
projectObj, ok := obj.(*projectapi.Project)
if !ok || projectObj == nil {
lastErr = fmt.Errorf("not a project: %#v", obj)
return false, nil
}

// Make sure the object hasn't changed between Get and Delete - pass UID and RV to delete options
// unless Precondition is already set
if opts.Preconditions == nil {
opts.Preconditions = &metav1.Preconditions{}
}
if opts.Preconditions.UID == nil {
opts.Preconditions.UID = &projectObj.UID
}
if opts.Preconditions.ResourceVersion == nil {
opts.Preconditions.ResourceVersion = &projectObj.ResourceVersion
}
Comment on lines +248 to +253
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't want to retry conflicts that are caused by client-provided preconditions (they are probably doomed unless the request changes).

If we might have propagated one precondition from the request, and added a second precondition here, it becomes hard to robustly determine which precondition caused a conflict. One way to solve this might be to inspect the fresh namespace returned from Get and enforce any client-provided preconditions immediately. After that, we know that both preconditions passed to the namespace Delete came from this code and that a retry might succeed with a newer UID/RV.


if err := objectFunc(ctx, obj); err != nil {
lastErr = fmt.Errorf("validation func failed: %w", err)
return false, nil
}
}
err = s.client.Delete(ctx, name, opts)
switch {
case err == nil:
return true, nil
case kerrors.IsConflict(err):
lastErr = err
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add tests showing that retry happens on conflict, no retry happens on non-conflict, and one where retries are exhausted please?

return false, nil
default:
return false, err
}
})
if err != nil && wait.Interrupted(err) {
return &metav1.Status{Status: metav1.StatusFailure}, false, lastErr
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this will plumb non-conflict errors to the client. For example, if the namespace is not found then we should return project not found -- is there a test for that?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a test for that

}
return &metav1.Status{Status: metav1.StatusSuccess}, false, nil
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens when err != nil && !wait.Interrupted(err)?

}
218 changes: 214 additions & 4 deletions pkg/project/apiserver/registry/project/proxy/proxy_test.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
package proxy

import (
"context"
"errors"
"fmt"
"strings"
"testing"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
kerrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
ktypes "k8s.io/apimachinery/pkg/types"
"k8s.io/apiserver/pkg/authentication/user"
apirequest "k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/apiserver/pkg/registry/rest"
"k8s.io/client-go/kubernetes/fake"
ktesting "k8s.io/client-go/testing"

oapi "github.com/openshift/openshift-apiserver/pkg/api"
projectapi "github.com/openshift/openshift-apiserver/pkg/project/apis/project"
Expand Down Expand Up @@ -79,7 +85,7 @@ func TestCreateInvalidProject(t *testing.T) {
Annotations: map[string]string{oapi.OpenShiftDisplayName: "h\t\ni"},
},
}, rest.ValidateAllObjectFunc, &metav1.CreateOptions{})
if !errors.IsInvalid(err) {
if !kerrors.IsInvalid(err) {
t.Errorf("Expected 'invalid' error, got %v", err)
}
}
Expand All @@ -101,6 +107,27 @@ func TestCreateProjectOK(t *testing.T) {
}
}

func TestCreateProjectValidation(t *testing.T) {
mockClient := &fake.Clientset{}
storage := NewREST(mockClient.CoreV1().Namespaces(), &mockLister{}, nil, nil)

validationCalled := false
validationFunc := func(ctx context.Context, obj runtime.Object) error {
validationCalled = true
return nil
}

_, err := storage.Create(apirequest.NewContext(), &projectapi.Project{
ObjectMeta: metav1.ObjectMeta{Name: "foo"},
}, validationFunc, &metav1.CreateOptions{})
if err != nil {
t.Errorf("Unexpected non-nil error: %#v", err)
}
if !validationCalled {
t.Errorf("Expected validation function to be called")
}
}

func TestGetProjectOK(t *testing.T) {
mockClient := fake.NewSimpleClientset(&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "foo"}})
storage := NewREST(mockClient.CoreV1().Namespaces(), &mockLister{}, nil, nil)
Expand Down Expand Up @@ -136,9 +163,192 @@ func TestDeleteProject(t *testing.T) {
t.Errorf("Expected status=success, got: %#v", status)
}
if len(mockClient.Actions()) != 1 {
t.Errorf("Expected client action for delete")
t.Errorf("Expected client action for delete, got %v", mockClient.Actions())
}
if !mockClient.Actions()[0].Matches("delete", "namespaces") {
t.Errorf("Expected call to delete-namespace")
t.Errorf("Expected call to delete-namespace, got %#v", mockClient.Actions()[0])
}
}

func TestDeleteProjectValidation(t *testing.T) {
mockClient := &fake.Clientset{}
storage := REST{
client: mockClient.CoreV1().Namespaces(),
}
validationCalled := false
validationFunc := func(ctx context.Context, obj runtime.Object) error {
validationCalled = true
return nil
}

storage.Delete(apirequest.NewContext(), "foo", validationFunc, &metav1.DeleteOptions{})
if !validationCalled {
t.Errorf("Expected validation function to be called")
}
}

func TestDeleteProjectValidationRetries(t *testing.T) {
mockClient := &fake.Clientset{}
storage := REST{
client: mockClient.CoreV1().Namespaces(),
}
maxRetries := 3
validationRetries := 0
validationFunc := func(ctx context.Context, obj runtime.Object) error {
if validationRetries < maxRetries {
validationRetries++
return fmt.Errorf("not there yet")
}
return nil
}
obj, _, err := storage.Delete(apirequest.NewContext(), "foo", validationFunc, &metav1.DeleteOptions{})
if obj == nil {
t.Error("Unexpected nil obj")
}
if err != nil {
t.Errorf("Unexpected non-nil error: %#v", err)
}
status, ok := obj.(*metav1.Status)
if !ok {
t.Errorf("Expected status type, got: %#v", obj)
}
if status.Status != metav1.StatusSuccess {
t.Errorf("Expected status=success, got: %#v", status)
}
if len(mockClient.Actions()) != maxRetries+2 {
t.Errorf("Expected client action for get, got %v", mockClient.Actions())
}
for i := range len(mockClient.Actions()) - 1 {
if !mockClient.Actions()[i].Matches("get", "namespaces") {
t.Errorf("Expected call #%d to get-namespace, got %#v", i, mockClient.Actions()[i])
}
}
if !mockClient.Actions()[len(mockClient.Actions())-1].Matches("delete", "namespaces") {
t.Errorf("Expected call #%d to delete-namespace, got %#v", len(mockClient.Actions())-1, mockClient.Actions()[len(mockClient.Actions())-1])
}
if validationRetries != maxRetries {
t.Errorf("Expected validation function to be retried %d times, got %d", maxRetries, validationRetries)
}
}

func TestDeleteProjectValidationError(t *testing.T) {
mockClient := &fake.Clientset{}
storage := REST{
client: mockClient.CoreV1().Namespaces(),
}
validationError := fmt.Errorf("faulty function")

validationFunc := func(ctx context.Context, obj runtime.Object) error {
return validationError
}
obj, _, err := storage.Delete(apirequest.NewContext(), "foo", validationFunc, &metav1.DeleteOptions{})
if obj == nil {
t.Error("Unexpected nil obj")
}
status, ok := obj.(*metav1.Status)
if !ok {
t.Errorf("Expected status type, got: %#v", obj)
}
if status.Status != metav1.StatusFailure {
t.Errorf("Expected status=failure, got: %#v", status)
}
if err == nil {
t.Errorf("Expected error but got nil")
}
if errors.Unwrap(err) != validationError {
t.Errorf("Unexpected error: %#v", errors.Unwrap(err))
}
}

func TestDeleteProjectValidationPreconditionUID(t *testing.T) {
uid := ktypes.UID("first-uid")
resourceVersion := "10"
meta := metav1.ObjectMeta{Name: "foo", UID: uid, ResourceVersion: resourceVersion}
namespaceList := corev1.NamespaceList{
Items: []corev1.Namespace{
{
ObjectMeta: meta,
},
},
}

tests := []struct {
testName string
uid, resourceVersion string
}{
{
testName: "all unset",
},
{
testName: "uid set",
uid: "first-uid",
},
{
testName: "resourceVersion set",
resourceVersion: "10",
},
{
testName: "both set",
uid: "first-uid",
resourceVersion: "10",
},
}
for _, test := range tests {
t.Run(test.testName, func(t *testing.T) {
mockClient := fake.NewSimpleClientset(&namespaceList)
storage := REST{
client: mockClient.CoreV1().Namespaces(),
}
validationFunc := func(ctx context.Context, obj runtime.Object) error {
return nil
}

expectedUID := uid
expectedRV := resourceVersion
deleteOpts := &metav1.DeleteOptions{}
if len(test.uid) > 0 {
expectedUID = ktypes.UID(test.uid)
if deleteOpts.Preconditions == nil {
deleteOpts.Preconditions = &metav1.Preconditions{}
}
deleteOpts.Preconditions.UID = &expectedUID
}
if len(test.resourceVersion) > 0 {
expectedRV = test.resourceVersion
if deleteOpts.Preconditions == nil {
deleteOpts.Preconditions = &metav1.Preconditions{}
}
deleteOpts.Preconditions.ResourceVersion = &expectedRV
}

storage.Delete(apirequest.NewContext(), "foo", validationFunc, deleteOpts)
if len(mockClient.Actions()) != 2 {
t.Errorf("Expected client action for get and delete, got %v", mockClient.Actions())
}
if !mockClient.Actions()[0].Matches("get", "namespaces") {
t.Errorf("Expected call to get-namespace, got %#v", mockClient.Actions()[0])
}
lastAction := mockClient.Actions()[1]
if !lastAction.Matches("delete", "namespaces") {
t.Errorf("Expected call to delete-namespace, got %#v", mockClient.Actions()[1])
}
deleteAction := lastAction.(ktesting.DeleteActionImpl)
preconditions := deleteAction.DeleteOptions.Preconditions
if preconditions == nil {
t.Fatalf("Expected DeleteOptions precondition to be non-nil")
}
if preconditions.UID == nil {
t.Fatalf("Expected DeleteOptions precondition UID to be non-nil")
}
if *preconditions.UID != expectedUID {
t.Errorf("Expected DeleteOptions precondition UID to %#v, got %#v", expectedUID, *preconditions.UID)
}
if preconditions.ResourceVersion == nil {
t.Fatalf("Expected DeleteOptions precondition ResourceVersion to be non-nil")
}
if *preconditions.ResourceVersion != expectedRV {
t.Errorf("Expected DeleteOptions precondition ResourceVersion to %#v, got %#v", expectedRV, *preconditions.ResourceVersion)
}
})
}
}