spack · mvandenburgh · Aug 1, 2025 · jjnesbitt · Aug 1, 2025
diff --git a/k8s/production/runners/protected/graviton/3/release.yaml b/k8s/production/runners/protected/graviton/3/release.yaml
@@ -96,6 +96,24 @@ spec:
             poll_timeout = 600  # ten minutes
             service_account = "runner"
 
+            # Caps the backoff interval (in ms) between retried Kubernetes API requests. The default cap is 2000 ms,
+            # so when the EKS API server is briefly unreachable the retries run out quickly and the job fails with an
+            # `error dialing backend: remote error: tls: internal error`. We raise the cap to 30 seconds to give the
+            # server more time to recover; the job eventually starting matters more than failing fast and being retried.
+            # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+            retry_backoff_max = 30000
+
+            # This is the default retry limit. We override this for specific classes of
+            # errors below.
+            retry_limit = 5
+
+            [runners.kubernetes.retry_limits]
+              # Retry this type of error 10 times instead of 5.
+              # This error usually occurs when the EKS API server times out or
+              # is unreachable. Presumably the server will eventually become
+              # available again, so we want to give the pod plenty of time to retry.
+              "tls: internal error" = 10
+
             [runners.kubernetes.affinity]
               [runners.kubernetes.affinity.node_affinity]
 

diff --git a/k8s/production/runners/protected/graviton/4/release.yaml b/k8s/production/runners/protected/graviton/4/release.yaml
@@ -96,6 +96,24 @@ spec:
             poll_timeout = 600  # ten minutes
             service_account = "runner"
 
+            # Caps the backoff interval (in ms) between retried Kubernetes API requests. The default cap is 2000 ms,
+            # so when the EKS API server is briefly unreachable the retries run out quickly and the job fails with an
+            # `error dialing backend: remote error: tls: internal error`. We raise the cap to 30 seconds to give the
+            # server more time to recover; the job eventually starting matters more than failing fast and being retried.
+            # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+            retry_backoff_max = 30000
+
+            # This is the default retry limit. We override this for specific classes of
+            # errors below.
+            retry_limit = 5
+
+            [runners.kubernetes.retry_limits]
+              # Retry this type of error 10 times instead of 5.
+              # This error usually occurs when the EKS API server times out or
+              # is unreachable. Presumably the server will eventually become
+              # available again, so we want to give the pod plenty of time to retry.
+              "tls: internal error" = 10
+
             [runners.kubernetes.affinity]
               [runners.kubernetes.affinity.node_affinity]
 

diff --git a/k8s/production/runners/protected/x86_64/v2-win/release.yaml b/k8s/production/runners/protected/x86_64/v2-win/release.yaml
@@ -120,6 +120,24 @@ spec:
             ephemeral_storage_request = "500M"
             helper_ephemeral_storage_request = "500M"
 
+            # Caps the backoff interval (in ms) between retried Kubernetes API requests. The default cap is 2000 ms,
+            # so when the EKS API server is briefly unreachable the retries run out quickly and the job fails with an
+            # `error dialing backend: remote error: tls: internal error`. We raise the cap to 30 seconds to give the
+            # server more time to recover; the job eventually starting matters more than failing fast and being retried.
+            # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+            retry_backoff_max = 30000
+
+            # This is the default retry limit. We override this for specific classes of
+            # errors below.
+            retry_limit = 5
+
+            [runners.kubernetes.retry_limits]
+              # Retry this type of error 10 times instead of 5.
+              # This error usually occurs when the EKS API server times out or
+              # is unreachable. Presumably the server will eventually become
+              # available again, so we want to give the pod plenty of time to retry.
+              "tls: internal error" = 10
+
             # Place pod close to other pipeline pods if possible ("pack" the pods tightly)
             # Docs: https://docs.gitlab.com/runner/executors/kubernetes.html#define-nodes-where-pods-are-scheduled
             [runners.kubernetes.affinity]

diff --git a/k8s/production/runners/protected/x86_64/v2/release.yaml b/k8s/production/runners/protected/x86_64/v2/release.yaml
@@ -95,6 +95,24 @@ spec:
             poll_timeout = 600  # ten minutes
             service_account = "runner"
 
+            # Caps the backoff interval (in ms) between retried Kubernetes API requests. The default cap is 2000 ms,
+            # so when the EKS API server is briefly unreachable the retries run out quickly and the job fails with an
+            # `error dialing backend: remote error: tls: internal error`. We raise the cap to 30 seconds to give the
+            # server more time to recover; the job eventually starting matters more than failing fast and being retried.
+            # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+            retry_backoff_max = 30000
+
+            # This is the default retry limit. We override this for specific classes of
+            # errors below.
+            retry_limit = 5
+
+            [runners.kubernetes.retry_limits]
+              # Retry this type of error 10 times instead of 5.
+              # This error usually occurs when the EKS API server times out or
+              # is unreachable. Presumably the server will eventually become
+              # available again, so we want to give the pod plenty of time to retry.
+              "tls: internal error" = 10
+
             [runners.kubernetes.affinity]
               [runners.kubernetes.affinity.node_affinity]
 

diff --git a/k8s/production/runners/protected/x86_64/v3/release.yaml b/k8s/production/runners/protected/x86_64/v3/release.yaml
@@ -95,6 +95,24 @@ spec:
             poll_timeout = 600  # ten minutes
             service_account = "runner"
 
+            # Caps the backoff interval (in ms) between retried Kubernetes API requests. The default cap is 2000 ms,
+            # so when the EKS API server is briefly unreachable the retries run out quickly and the job fails with an
+            # `error dialing backend: remote error: tls: internal error`. We raise the cap to 30 seconds to give the
+            # server more time to recover; the job eventually starting matters more than failing fast and being retried.
+            # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+            retry_backoff_max = 30000
+
+            # This is the default retry limit. We override this for specific classes of
+            # errors below.
+            retry_limit = 5
+
+            [runners.kubernetes.retry_limits]
+              # Retry this type of error 10 times instead of 5.
+              # This error usually occurs when the EKS API server times out or
+              # is unreachable. Presumably the server will eventually become
+              # available again, so we want to give the pod plenty of time to retry.
+              "tls: internal error" = 10
+
             [runners.kubernetes.affinity]
               [runners.kubernetes.affinity.node_affinity]
 

diff --git a/k8s/production/runners/protected/x86_64/v4/release.yaml b/k8s/production/runners/protected/x86_64/v4/release.yaml
@@ -95,6 +95,24 @@ spec:
             poll_timeout = 600  # ten minutes
             service_account = "runner"
 
+            # Caps the backoff interval (in ms) between retried Kubernetes API requests. The default cap is 2000 ms,
+            # so when the EKS API server is briefly unreachable the retries run out quickly and the job fails with an
+            # `error dialing backend: remote error: tls: internal error`. We raise the cap to 30 seconds to give the
+            # server more time to recover; the job eventually starting matters more than failing fast and being retried.
+            # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+            retry_backoff_max = 30000
+
+            # This is the default retry limit. We override this for specific classes of
+            # errors below.
+            retry_limit = 5
+
+            [runners.kubernetes.retry_limits]
+              # Retry this type of error 10 times instead of 5.
+              # This error usually occurs when the EKS API server times out or
+              # is unreachable. Presumably the server will eventually become
+              # available again, so we want to give the pod plenty of time to retry.
+              "tls: internal error" = 10
+
             [runners.kubernetes.affinity]
               [runners.kubernetes.affinity.node_affinity]
 

diff --git a/k8s/production/runners/public/graviton/3/release.yaml b/k8s/production/runners/public/graviton/3/release.yaml
@@ -96,6 +96,24 @@ spec:
             poll_timeout = 600  # ten minutes
             service_account = "runner"
 
+            # Caps the backoff interval (in ms) between retried Kubernetes API requests. The default cap is 2000 ms,
+            # so when the EKS API server is briefly unreachable the retries run out quickly and the job fails with an
+            # `error dialing backend: remote error: tls: internal error`. We raise the cap to 30 seconds to give the
+            # server more time to recover; the job eventually starting matters more than failing fast and being retried.
+            # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+            retry_backoff_max = 30000
+
+            # This is the default retry limit. We override this for specific classes of
+            # errors below.
+            retry_limit = 5
+
+            [runners.kubernetes.retry_limits]
+              # Retry this type of error 10 times instead of 5.
+              # This error usually occurs when the EKS API server times out or
+              # is unreachable. Presumably the server will eventually become
+              # available again, so we want to give the pod plenty of time to retry.
+              "tls: internal error" = 10
+
             [runners.kubernetes.affinity]
               [runners.kubernetes.affinity.node_affinity]
 

diff --git a/k8s/production/runners/public/graviton/4/release.yaml b/k8s/production/runners/public/graviton/4/release.yaml
@@ -96,6 +96,24 @@ spec:
             poll_timeout = 600  # ten minutes
             service_account = "runner"
 
+            # Caps the backoff interval (in ms) between retried Kubernetes API requests. The default cap is 2000 ms,
+            # so when the EKS API server is briefly unreachable the retries run out quickly and the job fails with an
+            # `error dialing backend: remote error: tls: internal error`. We raise the cap to 30 seconds to give the
+            # server more time to recover; the job eventually starting matters more than failing fast and being retried.
+            # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+            retry_backoff_max = 30000
+
+            # This is the default retry limit. We override this for specific classes of
+            # errors below.
+            retry_limit = 5
+
+            [runners.kubernetes.retry_limits]
+              # Retry this type of error 10 times instead of 5.
+              # This error usually occurs when the EKS API server times out or
+              # is unreachable. Presumably the server will eventually become
+              # available again, so we want to give the pod plenty of time to retry.
+              "tls: internal error" = 10
+
             [runners.kubernetes.affinity]
               [runners.kubernetes.affinity.node_affinity]
 

diff --git a/k8s/production/runners/public/x86_64/v2-win/release.yaml b/k8s/production/runners/public/x86_64/v2-win/release.yaml
@@ -121,6 +121,24 @@ spec:
             ephemeral_storage_request = "500M"
             helper_ephemeral_storage_request = "500M"
 
+            # Caps the backoff interval (in ms) between retried Kubernetes API requests. The default cap is 2000 ms,
+            # so when the EKS API server is briefly unreachable the retries run out quickly and the job fails with an
+            # `error dialing backend: remote error: tls: internal error`. We raise the cap to 30 seconds to give the
+            # server more time to recover; the job eventually starting matters more than failing fast and being retried.
+            # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+            retry_backoff_max = 30000
+
+            # This is the default retry limit. We override this for specific classes of
+            # errors below.
+            retry_limit = 5
+
+            [runners.kubernetes.retry_limits]
+              # Retry this type of error 10 times instead of 5.
+              # This error usually occurs when the EKS API server times out or
+              # is unreachable. Presumably the server will eventually become
+              # available again, so we want to give the pod plenty of time to retry.
+              "tls: internal error" = 10
+
             # Place pod close to other pipeline pods if possible ("pack" the pods tightly)
             # Docs: https://docs.gitlab.com/runner/executors/kubernetes.html#define-nodes-where-pods-are-scheduled
             [runners.kubernetes.affinity]

diff --git a/k8s/production/runners/public/x86_64/v2/release.yaml b/k8s/production/runners/public/x86_64/v2/release.yaml
@@ -97,6 +97,24 @@ spec:
             poll_timeout = 600  # ten minutes
             service_account = "runner"
 
+            # Caps the backoff interval (in ms) between retried Kubernetes API requests. The default cap is 2000 ms,
+            # so when the EKS API server is briefly unreachable the retries run out quickly and the job fails with an
+            # `error dialing backend: remote error: tls: internal error`. We raise the cap to 30 seconds to give the
+            # server more time to recover; the job eventually starting matters more than failing fast and being retried.
+            # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+            retry_backoff_max = 30000
+
+            # This is the default retry limit. We override this for specific classes of
+            # errors below.
+            retry_limit = 5
+
+            [runners.kubernetes.retry_limits]
+              # Retry this type of error 10 times instead of 5.
+              # This error usually occurs when the EKS API server times out or
+              # is unreachable. Presumably the server will eventually become
+              # available again, so we want to give the pod plenty of time to retry.
+              "tls: internal error" = 10
+
             [runners.kubernetes.affinity]
               [runners.kubernetes.affinity.node_affinity]
 

diff --git a/k8s/production/runners/public/x86_64/v3/release.yaml b/k8s/production/runners/public/x86_64/v3/release.yaml
@@ -95,6 +95,24 @@ spec:
             poll_timeout = 600  # ten minutes
             service_account = "runner"
 
+            # Caps the backoff interval (in ms) between retried Kubernetes API requests. The default cap is 2000 ms,
+            # so when the EKS API server is briefly unreachable the retries run out quickly and the job fails with an
+            # `error dialing backend: remote error: tls: internal error`. We raise the cap to 30 seconds to give the
+            # server more time to recover; the job eventually starting matters more than failing fast and being retried.
+            # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+            retry_backoff_max = 30000
+
+            # This is the default retry limit. We override this for specific classes of
+            # errors below.
+            retry_limit = 5
+
+            [runners.kubernetes.retry_limits]
+              # Retry this type of error 10 times instead of 5.
+              # This error usually occurs when the EKS API server times out or
+              # is unreachable. Presumably the server will eventually become
+              # available again, so we want to give the pod plenty of time to retry.
+              "tls: internal error" = 10
+
             [runners.kubernetes.affinity]
               [runners.kubernetes.affinity.node_affinity]
 

diff --git a/k8s/production/runners/public/x86_64/v4/release.yaml b/k8s/production/runners/public/x86_64/v4/release.yaml
@@ -95,6 +95,24 @@ spec:
             poll_timeout = 600  # ten minutes
             service_account = "runner"
 
+            # Caps the backoff interval (in ms) between retried Kubernetes API requests. The default cap is 2000 ms,
+            # so when the EKS API server is briefly unreachable the retries run out quickly and the job fails with an
+            # `error dialing backend: remote error: tls: internal error`. We raise the cap to 30 seconds to give the
+            # server more time to recover; the job eventually starting matters more than failing fast and being retried.
+            # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+            retry_backoff_max = 30000
+
+            # This is the default retry limit. We override this for specific classes of
+            # errors below.
+            retry_limit = 5
+
+            [runners.kubernetes.retry_limits]
+              # Retry this type of error 10 times instead of 5.
+              # This error usually occurs when the EKS API server times out or
+              # is unreachable. Presumably the server will eventually become
+              # available again, so we want to give the pod plenty of time to retry.
+              "tls: internal error" = 10
+
             [runners.kubernetes.affinity]
               [runners.kubernetes.affinity.node_affinity]
 

diff --git a/k8s/production/runners/signing/release.yaml b/k8s/production/runners/signing/release.yaml
@@ -91,6 +91,24 @@ spec:
             poll_timeout = 600  # ten minutes
             service_account = "notary"
 
+            # Caps the backoff interval (in ms) between retried Kubernetes API requests. The default cap is 2000 ms,
+            # so when the EKS API server is briefly unreachable the retries run out quickly and the job fails with an
+            # `error dialing backend: remote error: tls: internal error`. We raise the cap to 30 seconds to give the
+            # server more time to recover; the job eventually starting matters more than failing fast and being retried.
+            # https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
+            retry_backoff_max = 30000
+
+            # This is the default retry limit. We override this for specific classes of
+            # errors below.
+            retry_limit = 5
+
+            [runners.kubernetes.retry_limits]
+              # Retry this type of error 10 times instead of 5.
+              # This error usually occurs when the EKS API server times out or
+              # is unreachable. Presumably the server will eventually become
+              # available again, so we want to give the pod plenty of time to retry.
+              "tls: internal error" = 10
+
             # TODO Change to actual image before merge
             allowed_images = ["ghcr.io/spack/notary:*", "ghcr.io/spack/notary@*"]
             allowed_services = [""]