Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions tutorials/terraform/nebius/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#################################################################
# Default .gitignore content for the Terraform configuration below
#################################################################

.DS_Store

# Local .terraform directories
**/.terraform/*

# Terraform lockfile
.terraform.lock.hcl

# .tfstate files
*.tfstate
*.tfstate.*
*.tfplan

# Crash log files
crash.log

# Exclude all .tfvars files, which are likely to contain sensitive data, such as
# passwords, private keys, and other secrets. These should not be part of version
# control as they are data points which are potentially sensitive and subject
# to change depending on the environment.
*.tfvars
*env-vars
# Ignore override files as they are usually used to override resources locally and so
# are not checked in
override.tf
override.tf.json
*_override.tf
*_override.tf.json

# Ignore CLI configuration files
.terraformrc
terraform.rc
597 changes: 597 additions & 0 deletions tutorials/terraform/nebius/README.md

Large diffs are not rendered by default.

124 changes: 124 additions & 0 deletions tutorials/terraform/nebius/cluster-tools.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# cluster-tools.tf
###################################################################
# Cert manager for Kubernetes
###################################################################

# Installs cert-manager on the managed cluster from the Nebius application
# catalog and points ingress-shim at the "letsencrypt-prod" ClusterIssuer.
resource "nebius_applications_v1alpha1_k8s_release" "cert_manager" {
  parent_id  = var.neb_project_id
  cluster_id = nebius_mk8s_v1_cluster.k8s.id

  product_slug     = "bitnami/cert-manager"
  application_name = "cert-manager"
  namespace        = "cert-manager"

  # Chart values, passed to the release as string key/value pairs.
  set = {
    "installCRDs"                   = "true"
    "ingressShim.defaultIssuerName" = "letsencrypt-prod"
    "ingressShim.defaultIssuerKind" = "ClusterIssuer"
  }
}


# Let's Encrypt ClusterIssuer, rendered from the manifest template under
# config/manifests. Created only when var.enable_cert_manager is true.
resource "kubectl_manifest" "letsencrypt_issuer" {
  count = var.enable_cert_manager ? 1 : 0

  yaml_body = templatefile("${path.module}/config/manifests/letsencrypt-issuer.yaml", {
    letsencrypt_email = var.letsencrypt_email
  })

  # Applied after the catalog-managed cert-manager release.
  depends_on = [nebius_applications_v1alpha1_k8s_release.cert_manager]
}

##########################
# Observability Stack
##########################

# kube-prometheus-stack installed from the prometheus-community Helm
# repository, with values rendered from config/helm/kube-prome-stack.yaml.
resource "helm_release" "kube_prometheus_stack" {
  name             = "kube-prometheus-stack"
  namespace        = "kube-prometheus-stack"
  create_namespace = true

  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "kube-prometheus-stack"
  version    = "75.15.0"

  # The values template receives the hex-encoded ingress IP (used to build the
  # nip.io hostnames) and the Grafana admin password.
  values = [
    templatefile("${path.module}/config/helm/kube-prome-stack.yaml", {
      grafana_admin_password = var.grafana_admin_password
      nginx_ip_hex           = local.nginx_ip_hex
    }),
  ]

  depends_on = [
    # Cluster and node groups.
    nebius_mk8s_v1_cluster.k8s,
    nebius_mk8s_v1_node_group.cpu,
    nebius_mk8s_v1_node_group.gpu,
    # Ingress controller service lookup.
    data.kubernetes_service.nginx_ingress,
    # cert-manager must be up first and the ClusterIssuer must exist.
    nebius_applications_v1alpha1_k8s_release.cert_manager,
    kubectl_manifest.letsencrypt_issuer,
  ]
}


#####################################################
# Observability Stack - using application Module (Nebius catalog)
#####################################################

# resource "nebius_applications_v1alpha1_k8s_release" "prometheus" {
# cluster_id = nebius_mk8s_v1_cluster.k8s.id
# parent_id = var.neb_project_id

# application_name = "grafana-and-prometheus"
# namespace = "kube-prometheus-stack"
# product_slug = "nebius/grafana-and-prometheus"

# set = {
# "prometheus.alertmanager.enabled" : true, # Enable Alertmanager
# "prometheus.prometheus-pushgateway.enabled" : false,
# "prometheus.prometheus-node-exporter.enabled" : true,
# "grafana.adminPassword" : var.grafana_admin_password,
# "prometheus.server.scrape_interval" : var.prometheus_scrape_interval,
# "prometheus.server.retention" : var.prometheus_retention,
# "prometheus.server.persistentVolume.size" : var.prometheus_pv_size

# }

# depends_on = [
# nebius_mk8s_v1_node_group.cpu ,
# nebius_mk8s_v1_node_group.gpu ,
# ]
# }
#####################################################
# Cert manager using Helm chart
#####################################################
# resource "helm_release" "cert_manager" {
# name = "cert-manager"
# repository = "https://charts.jetstack.io"
# chart = "cert-manager"
# namespace = "cert-manager"
# version = "v1.15.5"

# create_namespace = true
# set = [
# { name = "installCRDs", value = "true" },
# # For Azure AKS with HTTP Application Routing
# { name = "ingressShim.defaultIssuerName", value = "letsencrypt-prod" },
# { name = "ingressShim.defaultIssuerKind", value = "ClusterIssuer" }
# ]

# depends_on = [
# nebius_mk8s_v1_cluster.k8s
# ]
# }
133 changes: 133 additions & 0 deletions tutorials/terraform/nebius/config/helm/kube-prome-stack.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Node exporter settings: host root filesystem option off, container runs privileged.
# NOTE(review): confirm against the chart's values.yaml that `hostRootfs` and
# `containerSecurityContext` are actually read from this `nodeExporter` key;
# kube-prometheus-stack appears to configure the node exporter subchart under
# `prometheus-node-exporter` instead — TODO verify.
nodeExporter:
hostRootfs: false
containerSecurityContext:
privileged: true

# Prometheus settings: persistent storage claim and ingress.
prometheus:
prometheusSpec:
# Volume claim template for Prometheus data: 5Gi, ReadWriteOnce, on the
# compute-csi-default-sc storage class.
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: compute-csi-default-sc
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 5Gi
# Ingress for Prometheus is disabled; the commented lines below keep an
# nginx + cert-manager example for enabling it.
ingress:
enabled: false
# ingressClassName: nginx
# annotations:
# cert-manager.io/cluster-issuer: letsencrypt-prod
# hosts:
# - prometheus.${nginx_ip_hex}.nip.io

grafana:
  # Plugins to install. The same names are allowed to load unsigned under
  # grafana.ini -> plugins at the end of this section.
  plugins:
    - grafana-kubernetes-app
    - grafana-worldmap-panel
    - grafana-piechart-panel

  # Same plugin list, passed through the GF_INSTALL_PLUGINS environment variable.
  env:
    GF_INSTALL_PLUGINS: "grafana-kubernetes-app,grafana-worldmap-panel,grafana-piechart-panel"

  # No custom datasource is configured here: the kube-prometheus-stack chart
  # creates the Prometheus datasource automatically.

  # File-based dashboard provider that serves the "default" dashboards below.
  dashboardProviders:
    dashboardproviders.yaml:
      apiVersion: 1
      providers:
        - name: 'default'
          orgId: 1
          folder: 'Kubernetes'
          type: file
          disableDeletion: false
          editable: true
          allowUiUpdates: true
          options:
            path: /var/lib/grafana/dashboards/default

  # Dashboards referenced by gnetId/revision; kept short to avoid conflicts.
  dashboards:
    default:
      kubernetes-cluster-monitoring:
        gnetId: 7249
        revision: 1
        datasource: Prometheus
      kubernetes-pods:
        gnetId: 6336
        revision: 1
        datasource: Prometheus
      kubernetes-networking:
        gnetId: 12658
        revision: 1
        datasource: Prometheus

  # Sidecars look in all namespaces for resources carrying these labels.
  sidecar:
    datasources:
      enabled: true
      label: grafana_datasource
      labelValue: "1"
      searchNamespace: ALL
    dashboards:
      enabled: true
      label: grafana_dashboard
      labelValue: "1"
      searchNamespace: ALL
      folder: /var/lib/grafana/dashboards
      folderAnnotation: grafana_folder
      provider:
        allowUiUpdates: true
        foldersFromFilesStructure: true

  persistence:
    enabled: true
    storageClassName: compute-csi-default-sc
    size: 5Gi
    accessModes:
      - ReadWriteOnce

  # Resource requests/limits to prevent OOM issues.
  resources:
    limits:
      cpu: 500m
      memory: 1Gi
    requests:
      cpu: 250m
      memory: 512Mi

  # Run as UID/GID 472 and apply the same group to mounted volumes.
  securityContext:
    runAsUser: 472
    runAsGroup: 472
    fsGroup: 472

  # Grafana is exposed through the nginx ingress class with a certificate from
  # the letsencrypt-prod ClusterIssuer; the hostname uses nip.io wildcard DNS.
  ingress:
    enabled: true
    ingressClassName: nginx
    annotations:
      cert-manager.io/cluster-issuer: letsencrypt-prod
      nginx.ingress.kubernetes.io/ssl-redirect: "true"
    hosts:
      - grafana.${nginx_ip_hex}.nip.io
    tls:
      - secretName: grafana-tls
        hosts:
          - grafana.${nginx_ip_hex}.nip.io

  # jsonencode renders the password as a quoted, escaped string, so characters
  # such as ':' or '#', or an all-digit password, cannot change how this YAML
  # value is parsed or typed.
  adminPassword: ${jsonencode(grafana_admin_password)}

  # Additional Grafana configuration
  grafana.ini:
    server:
      domain: grafana.${nginx_ip_hex}.nip.io
      root_url: https://grafana.${nginx_ip_hex}.nip.io
    security:
      allow_embedding: true
    plugins:
      allow_loading_unsigned_plugins: "grafana-kubernetes-app,grafana-worldmap-panel,grafana-piechart-panel"
    log:
      level: info
26 changes: 26 additions & 0 deletions tutorials/terraform/nebius/config/kubeconfig.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

# kubeconfig template. Template variables: cluster_ca, cluster_endpoint,
# cluster_name and profile (when profile is empty the --profile flag is omitted).
# The user entry uses an exec credential plugin: bash runs
# `nebius iam get-access-token` and jq wraps its output in an ExecCredential
# object, so bash, the nebius CLI and jq must be available to whatever
# consumes this kubeconfig.
# NOTE(review): the token is requested with `--format json` and passed verbatim
# to jq as a string — confirm the CLI output is the bare token.
apiVersion: v1
clusters:
- cluster:
certificate-authority-data: ${cluster_ca}
server: ${cluster_endpoint}
name: ${cluster_name}
contexts:
- context:
cluster: ${cluster_name}
user: ${cluster_name}
name: ${cluster_name}
current-context: ${cluster_name}
kind: Config
users:
- name: ${cluster_name}
user:
exec:
apiVersion: client.authentication.k8s.io/v1
command: bash
interactiveMode: IfAvailable
args:
- -c
- |
tok=$(nebius iam get-access-token --format json%{ if profile != "" } --profile ${profile}%{ endif });
jq -n --arg token "$tok" '{apiVersion: "client.authentication.k8s.io/v1", kind: "ExecCredential", status: {token: $token}}'
Loading