Skip to content

Commit dc22dde

Browse files
committed
Add production-ready vLLM Nebius MK8s terraform stack
Includes: - GPU autoscaling support - Secure ingress + TLS - Prometheus + Grafana monitoring - Built-in vLLM Grafana dashboards - Terraform + Helm integration Signed-off-by: Kosseila (CloudThrill) <[email protected]>
1 parent 4ca1966 commit dc22dde

19 files changed

+3727
-0
lines changed
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#################################################################
2+
# Default .gitignore content for all terraform-aws-modules below
3+
#################################################################
4+
5+
.DS_Store
6+
7+
# Local .terraform directories
8+
**/.terraform/*
9+
10+
# Terraform lockfile
11+
.terraform.lock.hcl
12+
13+
# .tfstate files
14+
*.tfstate
15+
*.tfstate.*
16+
*.tfplan
17+
18+
# Crash log files
19+
crash.log
20+
21+
# Exclude all .tfvars files, which are likely to contain sensitive data, such as
22+
# password, private keys, and other secrets. These should not be part of version
23+
# control as they are data points which are potentially sensitive and subject
24+
# to change depending on the environment.
25+
*.tfvars
26+
*env-vars
27+
# Ignore override files as they are usually used to override resources locally and so
28+
# are not checked in
29+
override.tf
30+
override.tf.json
31+
*_override.tf
32+
*_override.tf.json
33+
34+
# Ignore CLI configuration files
35+
.terraformrc
36+
terraform.rc

tutorials/terraform/nebius/README.md

Lines changed: 597 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
# cluster-tools.tf
2+
###################################################################
3+
# Cert manager for Kubernetes
4+
###################################################################
5+
6+
# cert-manager (Nebius catalog)
7+
# Installs cert-manager on the managed cluster through the Nebius application
# catalog, using the bitnami/cert-manager product.
resource "nebius_applications_v1alpha1_k8s_release" "cert_manager" {
  # Where the release lives: project and target cluster.
  parent_id  = var.neb_project_id
  cluster_id = nebius_mk8s_v1_cluster.k8s.id

  # Which catalog product to install, and under what name/namespace.
  product_slug     = "bitnami/cert-manager"
  application_name = "cert-manager"
  namespace        = "cert-manager"

  # Chart overrides: install the CRDs with the release and make
  # "letsencrypt-prod" (a ClusterIssuer) the ingress-shim default issuer.
  set = {
    "installCRDs"                   = "true"
    "ingressShim.defaultIssuerName" = "letsencrypt-prod"
    "ingressShim.defaultIssuerKind" = "ClusterIssuer"
  }
}
20+
21+
22+
# ClusterIssuer for Let's Encrypt
23+
# Applies the Let's Encrypt ClusterIssuer manifest, rendered from
# config/manifests/letsencrypt-issuer.yaml with the contact e-mail filled in.
# The manifest is only created when var.enable_cert_manager is true.
resource "kubectl_manifest" "letsencrypt_issuer" {
  count = var.enable_cert_manager ? 1 : 0

  yaml_body = templatefile("${path.module}/config/manifests/letsencrypt-issuer.yaml", {
    letsencrypt_email = var.letsencrypt_email
  })

  # Wait for the cert-manager catalog release before applying the issuer
  # (use helm_release.cert_manager here instead if the Helm variant is enabled).
  depends_on = [nebius_applications_v1alpha1_k8s_release.cert_manager]
}
38+
39+
##########################
40+
# Observability Stack
41+
##########################
42+
43+
# Deploys the kube-prometheus-stack Helm chart into its own namespace, with
# values rendered from config/helm/kube-prome-stack.yaml.
resource "helm_release" "kube_prometheus_stack" {
  # Release identity.
  name             = "kube-prometheus-stack"
  namespace        = "kube-prometheus-stack"
  create_namespace = true

  # Chart source, pinned to an exact version.
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "kube-prometheus-stack"
  version    = "75.15.0"

  # The values template receives the hex-encoded ingress IP (used to build
  # nip.io hostnames) and the Grafana admin password.
  values = [
    templatefile("${path.module}/config/helm/kube-prome-stack.yaml", {
      nginx_ip_hex           = local.nginx_ip_hex
      grafana_admin_password = var.grafana_admin_password
    }),
  ]

  # Cluster, node groups and ingress must exist first; cert-manager and the
  # ClusterIssuer must be in place before the chart is installed.
  depends_on = [
    nebius_mk8s_v1_cluster.k8s,
    nebius_mk8s_v1_node_group.cpu,
    nebius_mk8s_v1_node_group.gpu,
    data.kubernetes_service.nginx_ingress,
    nebius_applications_v1alpha1_k8s_release.cert_manager,
    kubectl_manifest.letsencrypt_issuer,
  ]
}
73+
74+
75+
#####################################################
76+
# Observability Stack - alternative using the Nebius application catalog (disabled)
77+
# #####################################################
78+
79+
# resource "nebius_applications_v1alpha1_k8s_release" "prometheus" {
80+
# cluster_id = nebius_mk8s_v1_cluster.k8s.id
81+
# parent_id = var.neb_project_id
82+
83+
# application_name = "grafana-and-prometheus"
84+
# namespace = "kube-prometheus-stack"
85+
# product_slug = "nebius/grafana-and-prometheus"
86+
87+
# set = {
88+
# "prometheus.alertmanager.enabled" : true, # Enable Alertmanager
89+
# "prometheus.prometheus-pushgateway.enabled" : false,
90+
# "prometheus.prometheus-node-exporter.enabled" : true,
91+
# "grafana.adminPassword" : var.grafana_admin_password,
92+
# "prometheus.server.scrape_interval" : var.prometheus_scrape_interval,
93+
# "prometheus.server.retention" : var.prometheus_retention,
94+
# "prometheus.server.persistentVolume.size" : var.prometheus_pv_size
95+
96+
# }
97+
98+
# depends_on = [
99+
# nebius_mk8s_v1_node_group.cpu ,
100+
# nebius_mk8s_v1_node_group.gpu ,
101+
# ]
102+
# }
103+
# ###################################################
104+
# Cert manager using the Jetstack Helm chart (disabled alternative to the Nebius catalog release)
105+
#####################################################
106+
# resource "helm_release" "cert_manager" {
107+
# name = "cert-manager"
108+
# repository = "https://charts.jetstack.io"
109+
# chart = "cert-manager"
110+
# namespace = "cert-manager"
111+
# version = "v1.15.5"
112+
113+
# create_namespace = true
114+
# set = [
115+
# { name = "installCRDs", value = "true" },
116+
# # Default issuer applied by ingress-shim (setting carried over from an Azure AKS HTTP Application Routing setup)
117+
# { name = "ingressShim.defaultIssuerName", value = "letsencrypt-prod" },
118+
# { name = "ingressShim.defaultIssuerKind", value = "ClusterIssuer" }
119+
# ]
120+
121+
# depends_on = [
122+
# nebius_mk8s_v1_cluster.k8s
123+
# ]
124+
# }
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# Node exporter overrides: hostRootfs is switched off and the exporter
# container runs privileged.
# NOTE(review): confirm the chart version in use reads these keys under
# `nodeExporter` rather than under the `prometheus-node-exporter` subchart key.
nodeExporter:
  containerSecurityContext:
    privileged: true
  hostRootfs: false
5+
6+
# Prometheus server: a 5Gi ReadWriteOnce volume claim on the
# compute-csi-default-sc storage class; no Ingress is created for it.
prometheus:
  ingress:
    enabled: false
    # To expose Prometheus, set enabled to true and uncomment:
    # ingressClassName: nginx
    # annotations:
    #   cert-manager.io/cluster-issuer: letsencrypt-prod
    # hosts:
    #   - prometheus.${nginx_ip_hex}.nip.io

  prometheusSpec:
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: compute-csi-default-sc
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 5Gi
23+
24+
# Grafana values for the kube-prometheus-stack chart.
# This file is a Terraform template: nginx_ip_hex and grafana_admin_password
# are substituted before Helm parses the result as YAML.
grafana:
  # Plugins to install. They are also listed in
  # grafana.ini -> plugins.allow_loading_unsigned_plugins below so that
  # Grafana loads them even though they are unsigned.
  plugins:
    - grafana-kubernetes-app
    - grafana-worldmap-panel
    - grafana-piechart-panel

  # NOTE(review): this repeats the `plugins` list above as an environment
  # variable; confirm both are needed for the chart version in use.
  env:
    GF_INSTALL_PLUGINS: "grafana-kubernetes-app,grafana-worldmap-panel,grafana-piechart-panel"

  # No custom datasource is declared here: the Prometheus datasource is left
  # to the kube-prometheus-stack chart.

  # File-based dashboard provider; dashboards land in the "Kubernetes" folder.
  dashboardProviders:
    dashboardproviders.yaml:
      apiVersion: 1
      providers:
        - name: 'default'
          orgId: 1
          folder: 'Kubernetes'
          type: file
          disableDeletion: false
          editable: true
          allowUiUpdates: true
          options:
            path: /var/lib/grafana/dashboards/default

  # Dashboards referenced by grafana.com ID (gnetId), kept to a short list
  # to avoid conflicts.
  dashboards:
    default:
      kubernetes-cluster-monitoring:
        gnetId: 7249
        revision: 1
        datasource: Prometheus
      kubernetes-pods:
        gnetId: 6336
        revision: 1
        datasource: Prometheus
      kubernetes-networking:
        gnetId: 12658
        revision: 1
        datasource: Prometheus

  # Sidecars pick up datasources/dashboards from labelled resources in every
  # namespace.
  sidecar:
    datasources:
      enabled: true
      label: grafana_datasource
      labelValue: "1"
      searchNamespace: ALL
    dashboards:
      enabled: true
      label: grafana_dashboard
      labelValue: "1"
      searchNamespace: ALL
      folder: /var/lib/grafana/dashboards
      folderAnnotation: grafana_folder
      provider:
        allowUiUpdates: true
        foldersFromFilesStructure: true

  # Persist Grafana state on a 5Gi volume.
  persistence:
    enabled: true
    storageClassName: compute-csi-default-sc
    size: 5Gi
    accessModes:
      - ReadWriteOnce

  # Resource limits to prevent OOM issues.
  resources:
    limits:
      cpu: 500m
      memory: 1Gi
    requests:
      cpu: 250m
      memory: 512Mi

  # Run as UID/GID 472 with a matching fsGroup.
  securityContext:
    runAsUser: 472
    runAsGroup: 472
    fsGroup: 472

  # Expose Grafana over HTTPS through the nginx ingress class; the certificate
  # is requested from the letsencrypt-prod ClusterIssuer. The hostname uses
  # nip.io for wildcard DNS resolution.
  ingress:
    enabled: true
    ingressClassName: nginx
    annotations:
      cert-manager.io/cluster-issuer: letsencrypt-prod
      nginx.ingress.kubernetes.io/ssl-redirect: "true"
    hosts:
      - grafana.${nginx_ip_hex}.nip.io
    tls:
      - secretName: grafana-tls
        hosts:
          - grafana.${nginx_ip_hex}.nip.io

  # Rendered through jsonencode so the password is always emitted as a quoted,
  # escaped string. An unquoted password that contains '#', ': ', a leading
  # YAML indicator, or that looks like a number or boolean would otherwise
  # break or retype the rendered YAML.
  adminPassword: ${jsonencode(grafana_admin_password)}

  # Additional Grafana configuration.
  grafana.ini:
    server:
      domain: grafana.${nginx_ip_hex}.nip.io
      root_url: https://grafana.${nginx_ip_hex}.nip.io
    security:
      allow_embedding: true
    plugins:
      allow_loading_unsigned_plugins: "grafana-kubernetes-app,grafana-worldmap-panel,grafana-piechart-panel"
    log:
      level: info
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
2+
# Kubeconfig template rendered by Terraform.
# Inputs: cluster_name, cluster_endpoint, cluster_ca and profile (may be "").
# A single cluster/context/user triple is produced, all named after the
# cluster. Templated scalars are quoted so a value that looks like a number
# or boolean is still read as a string.
apiVersion: v1
kind: Config
current-context: "${cluster_name}"

clusters:
  - name: "${cluster_name}"
    cluster:
      server: "${cluster_endpoint}"
      certificate-authority-data: "${cluster_ca}"

contexts:
  - name: "${cluster_name}"
    context:
      cluster: "${cluster_name}"
      user: "${cluster_name}"

users:
  - name: "${cluster_name}"
    user:
      # Exec credential plugin: fetch an access token with the nebius CLI and
      # wrap it in an ExecCredential document with jq. The optional --profile
      # flag is emitted by the template only when a profile is given, and is
      # shell-quoted. The script stops with a non-zero status when the token
      # command fails or returns nothing, instead of handing the client an
      # empty token.
      exec:
        apiVersion: client.authentication.k8s.io/v1
        command: bash
        interactiveMode: IfAvailable
        args:
          - "-c"
          - |
            set -e
            tok=$(nebius iam get-access-token --format json%{ if profile != "" } --profile '${profile}'%{ endif });
            [ -n "$tok" ] || { echo "nebius iam get-access-token returned an empty token" >&2; exit 1; }
            jq -n --arg token "$tok" '{apiVersion: "client.authentication.k8s.io/v1", kind: "ExecCredential", status: {token: $token}}'

0 commit comments

Comments
 (0)