Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions k8s/production/runners/protected/graviton/3/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,24 @@ spec:
poll_timeout = 600 # ten minutes
service_account = "runner"

# Maximum backoff interval (in ms) between retried requests to the Kubernetes API; the default is 2000.
# The retry delay starts at 500 ms and grows up to this ceiling. We raise the ceiling to 30 seconds so
# that retries are spread over a longer window when the EKS API server is briefly unreachable (seen as
# `error dialing backend: remote error: tls: internal error`); it's more important that the job
# eventually starts than for it to fail quickly and get retried.
# https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
retry_backoff_max = 30000

# This is the default retry limit. We override this for specific classes of
# errors below.
retry_limit = 5

[runners.kubernetes.retry_limits]
# Retry this type of error 10 times instead of 5.
# This error usually occurs when the EKS API server times out or
# is unreachable. Presumably the server will eventually become
# available again, so we want to give the pod plenty of time to retry.
"tls: internal error" = 10
Comment on lines +104 to +115
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

retry_backoff_max seems to just control the maximum value the retry interval can reach. Do you know what value the retry interval starts at? And how the backoff is incremented? Is it doubled each time, etc.?


[runners.kubernetes.affinity]
[runners.kubernetes.affinity.node_affinity]

Expand Down
18 changes: 18 additions & 0 deletions k8s/production/runners/protected/graviton/4/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,24 @@ spec:
poll_timeout = 600 # ten minutes
service_account = "runner"

# Maximum backoff interval (in ms) between retried requests to the Kubernetes API; the default is 2000.
# The retry delay starts at 500 ms and grows up to this ceiling. We raise the ceiling to 30 seconds so
# that retries are spread over a longer window when the EKS API server is briefly unreachable (seen as
# `error dialing backend: remote error: tls: internal error`); it's more important that the job
# eventually starts than for it to fail quickly and get retried.
# https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
retry_backoff_max = 30000

# This is the default retry limit. We override this for specific classes of
# errors below.
retry_limit = 5

[runners.kubernetes.retry_limits]
# Retry this type of error 10 times instead of 5.
# This error usually occurs when the EKS API server times out or
# is unreachable. Presumably the server will eventually become
# available again, so we want to give the pod plenty of time to retry.
"tls: internal error" = 10

[runners.kubernetes.affinity]
[runners.kubernetes.affinity.node_affinity]

Expand Down
18 changes: 18 additions & 0 deletions k8s/production/runners/protected/x86_64/v2-win/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,24 @@ spec:
ephemeral_storage_request = "500M"
helper_ephemeral_storage_request = "500M"

# Maximum backoff interval (in ms) between retried requests to the Kubernetes API; the default is 2000.
# The retry delay starts at 500 ms and grows up to this ceiling. We raise the ceiling to 30 seconds so
# that retries are spread over a longer window when the EKS API server is briefly unreachable (seen as
# `error dialing backend: remote error: tls: internal error`); it's more important that the job
# eventually starts than for it to fail quickly and get retried.
# https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
retry_backoff_max = 30000

# This is the default retry limit. We override this for specific classes of
# errors below.
retry_limit = 5

[runners.kubernetes.retry_limits]
# Retry this type of error 10 times instead of 5.
# This error usually occurs when the EKS API server times out or
# is unreachable. Presumably the server will eventually become
# available again, so we want to give the pod plenty of time to retry.
"tls: internal error" = 10

# Place pod close to other pipeline pods if possible ("pack" the pods tightly)
# Docs: https://docs.gitlab.com/runner/executors/kubernetes.html#define-nodes-where-pods-are-scheduled
[runners.kubernetes.affinity]
Expand Down
18 changes: 18 additions & 0 deletions k8s/production/runners/protected/x86_64/v2/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,24 @@ spec:
poll_timeout = 600 # ten minutes
service_account = "runner"

# Maximum backoff interval (in ms) between retried requests to the Kubernetes API; the default is 2000.
# The retry delay starts at 500 ms and grows up to this ceiling. We raise the ceiling to 30 seconds so
# that retries are spread over a longer window when the EKS API server is briefly unreachable (seen as
# `error dialing backend: remote error: tls: internal error`); it's more important that the job
# eventually starts than for it to fail quickly and get retried.
# https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
retry_backoff_max = 30000

# This is the default retry limit. We override this for specific classes of
# errors below.
retry_limit = 5

[runners.kubernetes.retry_limits]
# Retry this type of error 10 times instead of 5.
# This error usually occurs when the EKS API server times out or
# is unreachable. Presumably the server will eventually become
# available again, so we want to give the pod plenty of time to retry.
"tls: internal error" = 10

[runners.kubernetes.affinity]
[runners.kubernetes.affinity.node_affinity]

Expand Down
18 changes: 18 additions & 0 deletions k8s/production/runners/protected/x86_64/v3/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,24 @@ spec:
poll_timeout = 600 # ten minutes
service_account = "runner"

# Maximum backoff interval (in ms) between retried requests to the Kubernetes API; the default is 2000.
# The retry delay starts at 500 ms and grows up to this ceiling. We raise the ceiling to 30 seconds so
# that retries are spread over a longer window when the EKS API server is briefly unreachable (seen as
# `error dialing backend: remote error: tls: internal error`); it's more important that the job
# eventually starts than for it to fail quickly and get retried.
# https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
retry_backoff_max = 30000

# This is the default retry limit. We override this for specific classes of
# errors below.
retry_limit = 5

[runners.kubernetes.retry_limits]
# Retry this type of error 10 times instead of 5.
# This error usually occurs when the EKS API server times out or
# is unreachable. Presumably the server will eventually become
# available again, so we want to give the pod plenty of time to retry.
"tls: internal error" = 10

[runners.kubernetes.affinity]
[runners.kubernetes.affinity.node_affinity]

Expand Down
18 changes: 18 additions & 0 deletions k8s/production/runners/protected/x86_64/v4/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,24 @@ spec:
poll_timeout = 600 # ten minutes
service_account = "runner"

# Maximum backoff interval (in ms) between retried requests to the Kubernetes API; the default is 2000.
# The retry delay starts at 500 ms and grows up to this ceiling. We raise the ceiling to 30 seconds so
# that retries are spread over a longer window when the EKS API server is briefly unreachable (seen as
# `error dialing backend: remote error: tls: internal error`); it's more important that the job
# eventually starts than for it to fail quickly and get retried.
# https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
retry_backoff_max = 30000

# This is the default retry limit. We override this for specific classes of
# errors below.
retry_limit = 5

[runners.kubernetes.retry_limits]
# Retry this type of error 10 times instead of 5.
# This error usually occurs when the EKS API server times out or
# is unreachable. Presumably the server will eventually become
# available again, so we want to give the pod plenty of time to retry.
"tls: internal error" = 10

[runners.kubernetes.affinity]
[runners.kubernetes.affinity.node_affinity]

Expand Down
18 changes: 18 additions & 0 deletions k8s/production/runners/public/graviton/3/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,24 @@ spec:
poll_timeout = 600 # ten minutes
service_account = "runner"

# Maximum backoff interval (in ms) between retried requests to the Kubernetes API; the default is 2000.
# The retry delay starts at 500 ms and grows up to this ceiling. We raise the ceiling to 30 seconds so
# that retries are spread over a longer window when the EKS API server is briefly unreachable (seen as
# `error dialing backend: remote error: tls: internal error`); it's more important that the job
# eventually starts than for it to fail quickly and get retried.
# https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
retry_backoff_max = 30000

# This is the default retry limit. We override this for specific classes of
# errors below.
retry_limit = 5

[runners.kubernetes.retry_limits]
# Retry this type of error 10 times instead of 5.
# This error usually occurs when the EKS API server times out or
# is unreachable. Presumably the server will eventually become
# available again, so we want to give the pod plenty of time to retry.
"tls: internal error" = 10

[runners.kubernetes.affinity]
[runners.kubernetes.affinity.node_affinity]

Expand Down
18 changes: 18 additions & 0 deletions k8s/production/runners/public/graviton/4/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,24 @@ spec:
poll_timeout = 600 # ten minutes
service_account = "runner"

# Maximum backoff interval (in ms) between retried requests to the Kubernetes API; the default is 2000.
# The retry delay starts at 500 ms and grows up to this ceiling. We raise the ceiling to 30 seconds so
# that retries are spread over a longer window when the EKS API server is briefly unreachable (seen as
# `error dialing backend: remote error: tls: internal error`); it's more important that the job
# eventually starts than for it to fail quickly and get retried.
# https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
retry_backoff_max = 30000

# This is the default retry limit. We override this for specific classes of
# errors below.
retry_limit = 5

[runners.kubernetes.retry_limits]
# Retry this type of error 10 times instead of 5.
# This error usually occurs when the EKS API server times out or
# is unreachable. Presumably the server will eventually become
# available again, so we want to give the pod plenty of time to retry.
"tls: internal error" = 10

[runners.kubernetes.affinity]
[runners.kubernetes.affinity.node_affinity]

Expand Down
18 changes: 18 additions & 0 deletions k8s/production/runners/public/x86_64/v2-win/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,24 @@ spec:
ephemeral_storage_request = "500M"
helper_ephemeral_storage_request = "500M"

# Maximum backoff interval (in ms) between retried requests to the Kubernetes API; the default is 2000.
# The retry delay starts at 500 ms and grows up to this ceiling. We raise the ceiling to 30 seconds so
# that retries are spread over a longer window when the EKS API server is briefly unreachable (seen as
# `error dialing backend: remote error: tls: internal error`); it's more important that the job
# eventually starts than for it to fail quickly and get retried.
# https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
retry_backoff_max = 30000

# This is the default retry limit. We override this for specific classes of
# errors below.
retry_limit = 5

[runners.kubernetes.retry_limits]
# Retry this type of error 10 times instead of 5.
# This error usually occurs when the EKS API server times out or
# is unreachable. Presumably the server will eventually become
# available again, so we want to give the pod plenty of time to retry.
"tls: internal error" = 10

# Place pod close to other pipeline pods if possible ("pack" the pods tightly)
# Docs: https://docs.gitlab.com/runner/executors/kubernetes.html#define-nodes-where-pods-are-scheduled
[runners.kubernetes.affinity]
Expand Down
18 changes: 18 additions & 0 deletions k8s/production/runners/public/x86_64/v2/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,24 @@ spec:
poll_timeout = 600 # ten minutes
service_account = "runner"

# Maximum backoff interval (in ms) between retried requests to the Kubernetes API; the default is 2000.
# The retry delay starts at 500 ms and grows up to this ceiling. We raise the ceiling to 30 seconds so
# that retries are spread over a longer window when the EKS API server is briefly unreachable (seen as
# `error dialing backend: remote error: tls: internal error`); it's more important that the job
# eventually starts than for it to fail quickly and get retried.
# https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
retry_backoff_max = 30000

# This is the default retry limit. We override this for specific classes of
# errors below.
retry_limit = 5

[runners.kubernetes.retry_limits]
# Retry this type of error 10 times instead of 5.
# This error usually occurs when the EKS API server times out or
# is unreachable. Presumably the server will eventually become
# available again, so we want to give the pod plenty of time to retry.
"tls: internal error" = 10

[runners.kubernetes.affinity]
[runners.kubernetes.affinity.node_affinity]

Expand Down
18 changes: 18 additions & 0 deletions k8s/production/runners/public/x86_64/v3/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,24 @@ spec:
poll_timeout = 600 # ten minutes
service_account = "runner"

# Maximum backoff interval (in ms) between retried requests to the Kubernetes API; the default is 2000.
# The retry delay starts at 500 ms and grows up to this ceiling. We raise the ceiling to 30 seconds so
# that retries are spread over a longer window when the EKS API server is briefly unreachable (seen as
# `error dialing backend: remote error: tls: internal error`); it's more important that the job
# eventually starts than for it to fail quickly and get retried.
# https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
retry_backoff_max = 30000

# This is the default retry limit. We override this for specific classes of
# errors below.
retry_limit = 5

[runners.kubernetes.retry_limits]
# Retry this type of error 10 times instead of 5.
# This error usually occurs when the EKS API server times out or
# is unreachable. Presumably the server will eventually become
# available again, so we want to give the pod plenty of time to retry.
"tls: internal error" = 10

[runners.kubernetes.affinity]
[runners.kubernetes.affinity.node_affinity]

Expand Down
18 changes: 18 additions & 0 deletions k8s/production/runners/public/x86_64/v4/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,24 @@ spec:
poll_timeout = 600 # ten minutes
service_account = "runner"

# Maximum backoff interval (in ms) between retried requests to the Kubernetes API; the default is 2000.
# The retry delay starts at 500 ms and grows up to this ceiling. We raise the ceiling to 30 seconds so
# that retries are spread over a longer window when the EKS API server is briefly unreachable (seen as
# `error dialing backend: remote error: tls: internal error`); it's more important that the job
# eventually starts than for it to fail quickly and get retried.
# https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
retry_backoff_max = 30000

# This is the default retry limit. We override this for specific classes of
# errors below.
retry_limit = 5

[runners.kubernetes.retry_limits]
# Retry this type of error 10 times instead of 5.
# This error usually occurs when the EKS API server times out or
# is unreachable. Presumably the server will eventually become
# available again, so we want to give the pod plenty of time to retry.
"tls: internal error" = 10

[runners.kubernetes.affinity]
[runners.kubernetes.affinity.node_affinity]

Expand Down
18 changes: 18 additions & 0 deletions k8s/production/runners/signing/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,24 @@ spec:
poll_timeout = 600 # ten minutes
service_account = "notary"

# Maximum backoff interval (in ms) between retried requests to the Kubernetes API; the default is 2000.
# The retry delay starts at 500 ms and grows up to this ceiling. We raise the ceiling to 30 seconds so
# that retries are spread over a longer window when the EKS API server is briefly unreachable (seen as
# `error dialing backend: remote error: tls: internal error`); it's more important that the job
# eventually starts than for it to fail quickly and get retried.
# https://docs.gitlab.com/runner/executors/kubernetes/#configure-the-number-of-request-attempts-to-the-kubernetes-api
retry_backoff_max = 30000

# This is the default retry limit. We override this for specific classes of
# errors below.
retry_limit = 5

# Per-error override of retry_limit: retry this type of error 10 times instead of 5.
# This error usually occurs when the EKS API server times out or
# is unreachable. Presumably the server will eventually become
# available again, so we want to give the pod plenty of time to retry.
#
# NOTE: this is written as an inline table rather than a
# [runners.kubernetes.retry_limits] header on purpose. A table header would
# capture every key that follows it (allowed_images, allowed_services, ...)
# into retry_limits instead of leaving them in the enclosing executor table.
retry_limits = { "tls: internal error" = 10 }

# TODO Change to actual image before merge
allowed_images = ["ghcr.io/spack/notary:*", "ghcr.io/spack/notary@*"]
allowed_services = [""]
Expand Down