Skip to content

Reduce cost of staging #2562

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 47 additions & 9 deletions config/staging.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,29 @@ binderhub:
hub_url: https://hub.gke2.staging.mybinder.org
badge_base_url: https://staging.mybinder.org
image_prefix: gcr.io/binderhub-288415/r2d-staging-g5b5b759-
# image_prefix: us-central1-docker.pkg.dev/binderhub-288415/staging/r2d-2023-04-
sticky_builds: true
build_memory_limit: "2G"
# DockerRegistry:
# token_url: "https://us-central1-docker.pkg.dev/v2/token"

# registry:
# url: "https://us-central1-docker.pkg.dev"
# username: "_json_key"

extraEnv:
EVENT_LOG_NAME: "binderhub-staging-events-text"

resources:
requests:
cpu: 0.25
cpu: 0.1
memory: 512Mi

hpa:
enabled: true
maxReplicas: 3
minReplicas: 1
targetCPU: 50 # 50% of cpu request, so 0.125 CPU
targetCPU: 90 # 90% of cpu request, so 90m CPU

dind:
resources:
Expand All @@ -30,7 +38,7 @@ binderhub:
memory: 1Gi
limits:
cpu: "1"
memory: 2.5Gi
memory: 1Gi

ingress:
hosts:
Expand All @@ -41,14 +49,14 @@ binderhub:
hub:
resources:
requests:
memory: 512Mi
cpu: null
memory: 100M
cpu: "10m"
singleuser:
memory:
guarantee: 256M
guarantee: 100M
limit: 256M
cpu:
guarantee: 0.1
guarantee: .01
limit: 0.5
ingress:
hosts:
Expand All @@ -61,7 +69,20 @@ binderhub:
- hub.gke2.staging.mybinder.org
scheduling:
userPlaceholder:
replicas: 2
replicas: 1

proxy:
chp:
resources:
requests:
memory: 100M
cpu: 10m

minesweeper:
resources:
requests:
cpu: 5m
memory: 100Mi

grafana:
ingress:
Expand Down Expand Up @@ -101,8 +122,16 @@ prometheus:

ingress-nginx:
controller:
replicaCount: 2
service:
loadBalancerIP: 35.222.35.25
resources:
requests:
cpu: 10m
memory: 100Mi
limits:
cpu: 500m
memory: 500Mi

static:
ingress:
Expand Down Expand Up @@ -137,7 +166,9 @@ analyticsPublisher:
events:
logName: binderhub-staging-events-text
sourceBucket: binder-staging-events-raw-export

resources:
requests:
cpu: 10m
# TODO: copy to values
cloudCosts:
sourceBucket: binder-billing-archive
Expand All @@ -153,6 +184,13 @@ gcsProxy:
federationRedirect:
host: staging.mybinder.org
enabled: true
resources:
requests:
cpu: 10m
memory: 100Mi
limits:
cpu: 0.2
memory: 200Mi
hosts:
gke:
url: https://gke2.staging.mybinder.org
Expand Down
8 changes: 1 addition & 7 deletions mybinder/templates/analytics-publisher/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,6 @@ spec:
- name: config
mountPath: /etc/analytics-publisher
readOnly: true
resources:
requests:
cpu: 0.2
memory: 200Mi
limits:
cpu: 0.2
memory: 300Mi
resources: {{ .Values.analyticsPublisher.resources | toJson }}

{{- end }}
9 changes: 3 additions & 6 deletions mybinder/templates/federation-redirect/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,10 @@ spec:
- name: config
mountPath: /etc/federation-redirect
readOnly: true
{{- with .Values.federationRedirect.resources }}
resources:
requests:
cpu: 0.2
memory: 200Mi
limits:
cpu: 0.2
memory: 300Mi
{{- . | toYaml | nindent 10 }}
{{- end }}
tolerations:
- key: "node.kubernetes.io/unschedulable"
operator: "Exists"
Expand Down
5 changes: 4 additions & 1 deletion mybinder/templates/matomo/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,10 @@ spec:
volumeMounts:
- name: matomo-config-rw
mountPath: /var/www/html/config/
resources: {{ toJson .Values.matomo.resources }}
{{- with .Values.matomo.resources }}
resources:
{{- . | toYaml | nindent 10 }}
{{- end }}
- name: nginx
image: nginx:1.15
ports:
Expand Down
3 changes: 2 additions & 1 deletion mybinder/templates/minesweeper/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ spec:
add:
- KILL
{{- with .Values.minesweeper.resources }}
resources: {{ toJson . }}
resources:
{{- . | toYaml | nindent 10 }}
{{- end }}
volumeMounts:
- name: config
Expand Down
16 changes: 15 additions & 1 deletion mybinder/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -542,14 +542,28 @@ analyticsPublisher:
fileName: cloud-costs.jsonl
kind: csv
nodeSelector: {}
resources:
requests:
cpu: 0.2
memory: 200Mi
limits:
cpu: 0.2
memory: 300Mi

# this is defined in secrets/ for the OVH cluster
eventsArchiver:
serviceAccountKey: ""

federationRedirect:
host: mybinder.org
enabled: false
host: mybinder.org
resources:
requests:
cpu: 0.2
memory: 200Mi
limits:
cpu: 0.2
memory: 300Mi
image:
name: jupyterhub/mybinder.org-federation-redirect
tag: "set-by-chartpress"
Expand Down
9 changes: 8 additions & 1 deletion terraform/gcp/modules/mybinder/resource.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ locals {
},
binderhub-builder = {
display_name = "Storage access for ${var.name} image builder",
role = "roles/storage.admin",
role = var.use_artifact_registry ? "roles/artifactregistry.createOnPushWriter" : "roles/storage.admin",
},
}
# add -staging to events prefix, but don't include 'prod' in prod events
Expand All @@ -25,6 +25,13 @@ locals {
events_log_prefix = var.name == "prod" ? "binderhub" : "binderhub-${var.name}"
}

resource "google_artifact_registry_repository" "repo" {
location = var.registry_location != null ? var.registry_location : data.google_client_config.provider.region
repository_id = var.name
description = "${var.name} container registry"
format = "DOCKER"
}

resource "google_container_cluster" "cluster" {
name = var.name
location = var.gke_location != null ? var.gke_location : data.google_client_config.provider.zone
Expand Down
14 changes: 13 additions & 1 deletion terraform/gcp/modules/mybinder/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,19 @@ variable "gke_master_version" {

variable "gke_location" {
type = string
description = "GKE location for cluster if different, e.g. us-central1 for regional cluster"
description = "GKE location for cluster if different from provider zone, e.g. us-central1 for regional cluster"
default = null
}

variable "use_artifact_registry" {
type = bool
description = "Use artifact registry instead of legacy container registry"
default = false
}

variable "registry_location" {
type = string
description = "Registry location for cluster if different from provider region, e.g. us for multi-region"
default = null
}

Expand Down
27 changes: 16 additions & 11 deletions terraform/gcp/staging/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,31 +12,36 @@ provider "google" {
}

locals {
gke_version = "1.19.14-gke.1900"
gke_version = "1.24.10-gke.2300"
}

module "mybinder" {
source = "../modules/mybinder"
name = "staging"
gke_master_version = local.gke_version
federation_members = []
source = "../modules/mybinder"
name = "staging"
gke_master_version = local.gke_version
use_artifact_registry = false
federation_members = []
}

# define node pools here, too hard to encode with variables
resource "google_container_node_pool" "pool" {
name = "pool-2020-09"
resource "google_container_node_pool" "pool-1" {
name = "pool-2023-04"
cluster = module.mybinder.cluster_name

autoscaling {
min_node_count = 1
max_node_count = 4
max_node_count = 3
}

version = local.gke_version

node_config {
machine_type = "n1-standard-4"
disk_size_gb = 500
# e2-medium is a shared-core type: 2 vCPU, 4GB
# only 1 CPU allocatable, though, and k8s itself needs most of that
# e2-standard-2 (2 vCPU, 8GB) is 2x as expensive
# but 2 e2-standard-2 is $100/month
machine_type = "e2-standard-2"
disk_size_gb = 100
disk_type = "pd-standard"
# https://www.terraform.io/docs/providers/google/r/container_cluster.html#oauth_scopes-1
oauth_scopes = [
Expand Down Expand Up @@ -65,7 +70,7 @@ resource "google_container_node_pool" "pool" {

output "private_keys" {
value = module.mybinder.private_keys
description = "GCP serice account keys"
description = "GCP service account keys"
sensitive = true
}

Expand Down