-
Notifications
You must be signed in to change notification settings - Fork 98
OCPBUGS-56736: Improve error messages for project Delete errors #520
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,11 +3,13 @@ package proxy | |
import ( | ||
"context" | ||
"fmt" | ||
"time" | ||
|
||
kerrors "k8s.io/apimachinery/pkg/api/errors" | ||
metainternal "k8s.io/apimachinery/pkg/apis/meta/internalversion" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/apimachinery/pkg/runtime" | ||
"k8s.io/apimachinery/pkg/util/wait" | ||
"k8s.io/apimachinery/pkg/watch" | ||
apirequest "k8s.io/apiserver/pkg/endpoints/request" | ||
"k8s.io/apiserver/pkg/registry/rest" | ||
|
@@ -208,11 +210,66 @@ func (s *REST) Update(ctx context.Context, name string, objInfo rest.UpdatedObje | |
|
||
var _ = rest.GracefulDeleter(&REST{}) | ||
|
||
// maxRetriesOnConflict is the maximum retry count for Delete calls which | ||
// result in resource conflicts. | ||
const maxRetriesOnConflict = 10 | ||
|
||
// maxDuration sets the max duration of delete retries. Deleting a project affects apiserver latency, | ||
// so this should be kept as small as possible | ||
const maxDuration = time.Second | ||
|
||
// Delete deletes a Project specified by its name | ||
func (s *REST) Delete(ctx context.Context, name string, objectFunc rest.ValidateObjectFunc, options *metav1.DeleteOptions) (runtime.Object, bool, error) { | ||
var opts metav1.DeleteOptions | ||
if options != nil { | ||
opts = *options | ||
} | ||
return &metav1.Status{Status: metav1.StatusSuccess}, false, s.client.Delete(ctx, name, opts) | ||
var lastErr error | ||
err := wait.ExponentialBackoffWithContext(ctx, wait.Backoff{Steps: maxRetriesOnConflict, Duration: maxDuration}, func(ctx context.Context) (bool, error) { | ||
var err error | ||
if objectFunc != nil { | ||
var obj runtime.Object | ||
obj, err = s.Get(ctx, name, &metav1.GetOptions{}) | ||
if err != nil { | ||
lastErr = fmt.Errorf("unable to get project: %w", err) | ||
return false, nil | ||
} | ||
projectObj, ok := obj.(*projectapi.Project) | ||
if !ok || projectObj == nil { | ||
lastErr = fmt.Errorf("not a project: %#v", obj) | ||
return false, nil | ||
} | ||
|
||
// Make sure the object hasn't changed between Get and Delete - pass UID and RV to delete options | ||
// unless Precondition is already set | ||
if opts.Preconditions == nil { | ||
opts.Preconditions = &metav1.Preconditions{} | ||
} | ||
if opts.Preconditions.UID == nil { | ||
opts.Preconditions.UID = &projectObj.UID | ||
} | ||
if opts.Preconditions.ResourceVersion == nil { | ||
opts.Preconditions.ResourceVersion = &projectObj.ResourceVersion | ||
} | ||
Comment on lines +248 to +253
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't want to retry conflicts that are caused by client-provided preconditions (they are probably doomed unless the request changes). If we might have propagated one precondition from the request, and added a second precondition here, it becomes hard to robustly determine which precondition caused a conflict. One way to solve this might be to inspect the fresh namespace returned from Get and enforce any client-provided preconditions immediately. After that, we know that both preconditions passed to the namespace Delete came from this code and that a retry might succeed with a newer UID/RV. |
||
|
||
if err := objectFunc(ctx, obj); err != nil { | ||
lastErr = fmt.Errorf("validation func failed: %w", err) | ||
return false, nil | ||
} | ||
} | ||
err = s.client.Delete(ctx, name, opts) | ||
switch { | ||
case err == nil: | ||
return true, nil | ||
case kerrors.IsConflict(err): | ||
lastErr = err | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add tests showing that retry happens on conflict, no retry happens on non-conflict, and one where retries are exhausted please? |
||
return false, nil | ||
default: | ||
return false, err | ||
} | ||
}) | ||
if err != nil && wait.Interrupted(err) { | ||
return &metav1.Status{Status: metav1.StatusFailure}, false, lastErr | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think this will plumb non-conflict errors to the client. For example, if the namespace is not found then we should return project not found -- is there a test for that? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added a test for that |
||
} | ||
return &metav1.Status{Status: metav1.StatusSuccess}, false, nil | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What happens when |
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If I'm understanding the Godoc for Duration correctly, this means that we will sleep for one second between retries. That seems high to me. I bet it is a lot longer than a typical total latency of both namespace requests combined.
We can configure the other fields for exponential backoff so that initial retry is fairly fast.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, right, somehow I thought "Duration" was the max duration we're allowed to spend. I think
wait.Backoff{Steps: maxRetriesOnConflict, Factor: 1/maxRetriesOnConflict, Cap: maxDuration, Duration: maxDuration/maxRetriesOnConflict}
would make it "up to 1 second" and ensure it has several retries