From 4506816efc85582d4c4e6da36b6ec754df421570 Mon Sep 17 00:00:00 2001
From: Mike VanDenburgh
Date: Fri, 1 Aug 2025 11:47:06 -0400
Subject: [PATCH] Configure retries/backoff for gitlab-runner k8s API requests

---
 .../runners/protected/graviton/3/release.yaml   | 18 ++++++++++++++++++
 .../runners/protected/graviton/4/release.yaml   | 18 ++++++++++++++++++
 .../protected/x86_64/v2-win/release.yaml        | 18 ++++++++++++++++++
 .../runners/protected/x86_64/v2/release.yaml    | 18 ++++++++++++++++++
 .../runners/protected/x86_64/v3/release.yaml    | 18 ++++++++++++++++++
 .../runners/protected/x86_64/v4/release.yaml    | 18 ++++++++++++++++++
 .../runners/public/graviton/3/release.yaml      | 18 ++++++++++++++++++
 .../runners/public/graviton/4/release.yaml      | 18 ++++++++++++++++++
 .../runners/public/x86_64/v2-win/release.yaml   | 18 ++++++++++++++++++
 .../runners/public/x86_64/v2/release.yaml       | 18 ++++++++++++++++++
 .../runners/public/x86_64/v3/release.yaml       | 18 ++++++++++++++++++
 .../runners/public/x86_64/v4/release.yaml       | 18 ++++++++++++++++++
 k8s/production/runners/signing/release.yaml     | 18 ++++++++++++++++++
 13 files changed, 234 insertions(+)

diff --git a/k8s/production/runners/protected/graviton/3/release.yaml b/k8s/production/runners/protected/graviton/3/release.yaml
index 76380932b..d13d45651 100644
--- a/k8s/production/runners/protected/graviton/3/release.yaml
+++ b/k8s/production/runners/protected/graviton/3/release.yaml
@@ -96,6 +96,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/protected/graviton/4/release.yaml b/k8s/production/runners/protected/graviton/4/release.yaml
index 0eeb2dfc2..581f79aad 100644
--- a/k8s/production/runners/protected/graviton/4/release.yaml
+++ b/k8s/production/runners/protected/graviton/4/release.yaml
@@ -96,6 +96,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/protected/x86_64/v2-win/release.yaml b/k8s/production/runners/protected/x86_64/v2-win/release.yaml
index f01ee4cd8..2f6c4dc76 100644
--- a/k8s/production/runners/protected/x86_64/v2-win/release.yaml
+++ b/k8s/production/runners/protected/x86_64/v2-win/release.yaml
@@ -120,6 +120,24 @@ spec:
       ephemeral_storage_request = "500M"
       helper_ephemeral_storage_request = "500M"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       # Place pod close to other pipeline pods if possible ("pack" the pods tightly)
       # Docs: https://docs.gitlab.com/runner/executors/kubernetes.html#define-nodes-where-pods-are-scheduled
       [runners.kubernetes.affinity]
diff --git a/k8s/production/runners/protected/x86_64/v2/release.yaml b/k8s/production/runners/protected/x86_64/v2/release.yaml
index fa3db5614..f0a5c966b 100644
--- a/k8s/production/runners/protected/x86_64/v2/release.yaml
+++ b/k8s/production/runners/protected/x86_64/v2/release.yaml
@@ -95,6 +95,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/protected/x86_64/v3/release.yaml b/k8s/production/runners/protected/x86_64/v3/release.yaml
index 8b7a63a65..d9f7f58ce 100644
--- a/k8s/production/runners/protected/x86_64/v3/release.yaml
+++ b/k8s/production/runners/protected/x86_64/v3/release.yaml
@@ -95,6 +95,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/protected/x86_64/v4/release.yaml b/k8s/production/runners/protected/x86_64/v4/release.yaml
index 9494d8f1d..5be4fd6a6 100644
--- a/k8s/production/runners/protected/x86_64/v4/release.yaml
+++ b/k8s/production/runners/protected/x86_64/v4/release.yaml
@@ -95,6 +95,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/public/graviton/3/release.yaml b/k8s/production/runners/public/graviton/3/release.yaml
index 0e6582e3b..7f4ed481d 100644
--- a/k8s/production/runners/public/graviton/3/release.yaml
+++ b/k8s/production/runners/public/graviton/3/release.yaml
@@ -96,6 +96,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/public/graviton/4/release.yaml b/k8s/production/runners/public/graviton/4/release.yaml
index 2fad29a7f..ec54303dc 100644
--- a/k8s/production/runners/public/graviton/4/release.yaml
+++ b/k8s/production/runners/public/graviton/4/release.yaml
@@ -96,6 +96,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/public/x86_64/v2-win/release.yaml b/k8s/production/runners/public/x86_64/v2-win/release.yaml
index 6cbbb9756..649370bda 100644
--- a/k8s/production/runners/public/x86_64/v2-win/release.yaml
+++ b/k8s/production/runners/public/x86_64/v2-win/release.yaml
@@ -121,6 +121,24 @@ spec:
       ephemeral_storage_request = "500M"
       helper_ephemeral_storage_request = "500M"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       # Place pod close to other pipeline pods if possible ("pack" the pods tightly)
       # Docs: https://docs.gitlab.com/runner/executors/kubernetes.html#define-nodes-where-pods-are-scheduled
       [runners.kubernetes.affinity]
diff --git a/k8s/production/runners/public/x86_64/v2/release.yaml b/k8s/production/runners/public/x86_64/v2/release.yaml
index 62fb83ac8..4c9daa93e 100644
--- a/k8s/production/runners/public/x86_64/v2/release.yaml
+++ b/k8s/production/runners/public/x86_64/v2/release.yaml
@@ -97,6 +97,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/public/x86_64/v3/release.yaml b/k8s/production/runners/public/x86_64/v3/release.yaml
index d200dbb01..803c49a4d 100644
--- a/k8s/production/runners/public/x86_64/v3/release.yaml
+++ b/k8s/production/runners/public/x86_64/v3/release.yaml
@@ -95,6 +95,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/public/x86_64/v4/release.yaml b/k8s/production/runners/public/x86_64/v4/release.yaml
index c16103ebc..f3ab1e985 100644
--- a/k8s/production/runners/public/x86_64/v4/release.yaml
+++ b/k8s/production/runners/public/x86_64/v4/release.yaml
@@ -95,6 +95,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "runner"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       [runners.kubernetes.affinity]
         [runners.kubernetes.affinity.node_affinity]
diff --git a/k8s/production/runners/signing/release.yaml b/k8s/production/runners/signing/release.yaml
index 05f35830f..3a3d299f3 100644
--- a/k8s/production/runners/signing/release.yaml
+++ b/k8s/production/runners/signing/release.yaml
@@ -91,6 +91,24 @@ spec:
       poll_timeout = 600 # ten minutes
       service_account = "notary"
 
+      # This is set to 2000 ms by default, i.e. if a pod can't reach the EKS API server within 2 seconds,
+      # it is killed and the job will fail with an `error dialing backend: remote error: tls: internal error`.
+      # We increase it to 30 seconds to give the pod more time to reach the server; it's more important
+      # that the job eventually starts than for it to fail quickly and get retried.
+      # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+      retry_backoff_max = 30000
+
+      # This is the default retry limit. We override this for specific classes of
+      # errors below.
+      retry_limit = 5
+
+      [runners.kubernetes.retry_limits]
+        # Retry this type of error 10 times instead of 5.
+        # This error usually occurs when the EKS API server times out or
+        # is unreachable. Presumably the server will eventually become
+        # available again, so we want to give the pod plenty of time to retry.
+        "tls: internal error" = 10
+
       # TODO Change to actual image before merge
       allowed_images = ["ghcr.io/spack/notary:*", "ghcr.io/spack/notary@*"]
       allowed_services = [""]