From 4506816efc85582d4c4e6da36b6ec754df421570 Mon Sep 17 00:00:00 2001
From: Mike VanDenburgh
Date: Fri, 1 Aug 2025 11:47:06 -0400
Subject: [PATCH] Configure retries/backoff for gitlab-runner k8s API requests

---
 .../runners/protected/graviton/3/release.yaml   | 18 ++++++++++++++++++
 .../runners/protected/graviton/4/release.yaml   | 18 ++++++++++++++++++
 .../protected/x86_64/v2-win/release.yaml        | 18 ++++++++++++++++++
 .../runners/protected/x86_64/v2/release.yaml    | 18 ++++++++++++++++++
 .../runners/protected/x86_64/v3/release.yaml    | 18 ++++++++++++++++++
 .../runners/protected/x86_64/v4/release.yaml    | 18 ++++++++++++++++++
 .../runners/public/graviton/3/release.yaml      | 18 ++++++++++++++++++
 .../runners/public/graviton/4/release.yaml      | 18 ++++++++++++++++++
 .../runners/public/x86_64/v2-win/release.yaml   | 18 ++++++++++++++++++
 .../runners/public/x86_64/v2/release.yaml       | 18 ++++++++++++++++++
 .../runners/public/x86_64/v3/release.yaml       | 18 ++++++++++++++++++
 .../runners/public/x86_64/v4/release.yaml       | 18 ++++++++++++++++++
 k8s/production/runners/signing/release.yaml     | 18 ++++++++++++++++++
 13 files changed, 234 insertions(+)

diff --git a/k8s/production/runners/protected/graviton/3/release.yaml b/k8s/production/runners/protected/graviton/3/release.yaml
index 76380932b..d13d45651 100644
--- a/k8s/production/runners/protected/graviton/3/release.yaml
+++ b/k8s/production/runners/protected/graviton/3/release.yaml
@@ -96,6 +96,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/protected/graviton/4/release.yaml b/k8s/production/runners/protected/graviton/4/release.yaml
index 0eeb2dfc2..581f79aad 100644
--- a/k8s/production/runners/protected/graviton/4/release.yaml
+++ b/k8s/production/runners/protected/graviton/4/release.yaml
@@ -96,6 +96,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/protected/x86_64/v2-win/release.yaml b/k8s/production/runners/protected/x86_64/v2-win/release.yaml
index f01ee4cd8..2f6c4dc76 100644
--- a/k8s/production/runners/protected/x86_64/v2-win/release.yaml
+++ b/k8s/production/runners/protected/x86_64/v2-win/release.yaml
@@ -120,6 +120,24 @@ spec:
       ephemeral_storage_request = "500M"
       helper_ephemeral_storage_request = "500M"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       # Place pod close to other pipeline pods if possible ("pack" the pods tightly)
       # Docs: https://docs.gitlab.com/runner/executors/kubernetes.html#define-nodes-where-pods-are-scheduled
       [runners.kubernetes.affinity]
diff --git a/k8s/production/runners/protected/x86_64/v2/release.yaml b/k8s/production/runners/protected/x86_64/v2/release.yaml
index fa3db5614..f0a5c966b 100644
--- a/k8s/production/runners/protected/x86_64/v2/release.yaml
+++ b/k8s/production/runners/protected/x86_64/v2/release.yaml
@@ -95,6 +95,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/protected/x86_64/v3/release.yaml b/k8s/production/runners/protected/x86_64/v3/release.yaml
index 8b7a63a65..d9f7f58ce 100644
--- a/k8s/production/runners/protected/x86_64/v3/release.yaml
+++ b/k8s/production/runners/protected/x86_64/v3/release.yaml
@@ -95,6 +95,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/protected/x86_64/v4/release.yaml b/k8s/production/runners/protected/x86_64/v4/release.yaml
index 9494d8f1d..5be4fd6a6 100644
--- a/k8s/production/runners/protected/x86_64/v4/release.yaml
+++ b/k8s/production/runners/protected/x86_64/v4/release.yaml
@@ -95,6 +95,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/public/graviton/3/release.yaml b/k8s/production/runners/public/graviton/3/release.yaml
index 0e6582e3b..7f4ed481d 100644
--- a/k8s/production/runners/public/graviton/3/release.yaml
+++ b/k8s/production/runners/public/graviton/3/release.yaml
@@ -96,6 +96,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/public/graviton/4/release.yaml b/k8s/production/runners/public/graviton/4/release.yaml
index 2fad29a7f..ec54303dc 100644
--- a/k8s/production/runners/public/graviton/4/release.yaml
+++ b/k8s/production/runners/public/graviton/4/release.yaml
@@ -96,6 +96,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/public/x86_64/v2-win/release.yaml b/k8s/production/runners/public/x86_64/v2-win/release.yaml
index 6cbbb9756..649370bda 100644
--- a/k8s/production/runners/public/x86_64/v2-win/release.yaml
+++ b/k8s/production/runners/public/x86_64/v2-win/release.yaml
@@ -121,6 +121,24 @@ spec:
       ephemeral_storage_request = "500M"
       helper_ephemeral_storage_request = "500M"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       # Place pod close to other pipeline pods if possible ("pack" the pods tightly)
       # Docs: https://docs.gitlab.com/runner/executors/kubernetes.html#define-nodes-where-pods-are-scheduled
       [runners.kubernetes.affinity]
diff --git a/k8s/production/runners/public/x86_64/v2/release.yaml b/k8s/production/runners/public/x86_64/v2/release.yaml
index 62fb83ac8..4c9daa93e 100644
--- a/k8s/production/runners/public/x86_64/v2/release.yaml
+++ b/k8s/production/runners/public/x86_64/v2/release.yaml
@@ -97,6 +97,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/public/x86_64/v3/release.yaml b/k8s/production/runners/public/x86_64/v3/release.yaml
index d200dbb01..803c49a4d 100644
--- a/k8s/production/runners/public/x86_64/v3/release.yaml
+++ b/k8s/production/runners/public/x86_64/v3/release.yaml
@@ -95,6 +95,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/public/x86_64/v4/release.yaml b/k8s/production/runners/public/x86_64/v4/release.yaml
index c16103ebc..f3ab1e985 100644
--- a/k8s/production/runners/public/x86_64/v4/release.yaml
+++ b/k8s/production/runners/public/x86_64/v4/release.yaml
@@ -95,6 +95,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/signing/release.yaml b/k8s/production/runners/signing/release.yaml
index 05f35830f..3a3d299f3 100644
--- a/k8s/production/runners/signing/release.yaml
+++ b/k8s/production/runners/signing/release.yaml
@@ -91,6 +91,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "notary"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       # TODO Change to actual image before merge
       allowed_images = ["ghcr.io/spack/notary:*", "ghcr.io/spack/notary@*"]
       allowed_services = [""]