Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions infra/fridge/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,25 @@ def patch_namespace(name: str, pss: PodSecurityStandard) -> NamespacePatch:
),
)

# Set up monitoring
# The monitoring component is declared to depend on each of the components
# listed here, so Pulumi orders its deployment after them.
monitoring_prerequisites = [
    argo_workflows,
    ingress_nginx,
    cert_manager,
    cert_manager_issuers,
    minio,
    storage_classes,
]
monitoring_args = components.MonitoringArgs(
    k8s_environment=k8s_environment,
    argo_server_ns=argo_workflows.argo_server_ns,
)
monitoring_system = components.Monitoring(
    name=f"{stack_name}-monitoring-system",
    args=monitoring_args,
    opts=ResourceOptions(depends_on=monitoring_prerequisites),
)

# Harbor
harbor = components.ContainerRegistry(
"harbor",
Expand Down
1 change: 1 addition & 0 deletions infra/fridge/components/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .container_registry import ContainerRegistry, ContainerRegistryArgs
from .ingress import Ingress, IngressArgs
from .minio_config import MinioConfigJob, MinioConfigArgs
from .monitoring import Monitoring, MonitoringArgs
from .network_policies import NetworkPolicies
from .object_storage import ObjectStorage, ObjectStorageArgs
from .storage_classes import StorageClasses, StorageClassesArgs
Expand Down
204 changes: 204 additions & 0 deletions infra/fridge/components/monitoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
from pulumi import ComponentResource, ResourceOptions
from pulumi_kubernetes.core.v1 import (
Namespace,
Service,
ServicePortArgs,
ServiceSpecArgs,
)
from pulumi_kubernetes.helm.v3 import Release, ReleaseArgs
from pulumi_kubernetes.meta.v1 import ObjectMetaArgs
from pulumi_kubernetes.yaml import ConfigFile

from enums import K8sEnvironment


class MonitoringArgs:
    """Inputs consumed by the ``Monitoring`` component."""

    def __init__(
        self,
        k8s_environment: K8sEnvironment,
        argo_server_ns: str,
    ) -> None:
        """Store the arguments unchanged as attributes.

        Args:
            k8s_environment: Target Kubernetes environment; ``Monitoring``
                branches on this value.
            argo_server_ns: Namespace in which the Argo Workflows metrics
                Service is created.
        """
        self.argo_server_ns = argo_server_ns
        self.k8s_environment = k8s_environment


class Monitoring(ComponentResource):
def __init__(
self, name: str, args: MonitoringArgs, opts: ResourceOptions | None = None
) -> None:
super().__init__("fridge:k8s:Monitoring", name, {}, opts)
child_opts = ResourceOptions.merge(opts, ResourceOptions(parent=self))

match args.k8s_environment:
case K8sEnvironment.AKS:

monitoring_ns = Namespace(
"monitoring-system",
metadata=ObjectMetaArgs(
name="monitoring-system",
labels={"name": "monitoring-system"},
),
opts=child_opts,
)
# Start by deploying the monitoring stack for AKS
# 1. Prometheus Operator
# 2. Grafana
prometheus_operator = Release(
"monitoring-operator",
ReleaseArgs(
chart="kube-prometheus-stack",
version="75.15.1",
repository_opts={
"repo": "https://prometheus-community.github.io/helm-charts"
},
namespace=monitoring_ns.metadata.name, # Compatibility with Dawn
create_namespace=False,
),
opts=child_opts,
)
case K8sEnvironment.DAWN:
# The namespace is already created on Dawn
monitoring_ns = Namespace.get("monitoring-ns", "monitoring-system")
prometheus_operator = Release.get(
"monitoring-operator", "monitoring-system/kube-prometheus-stack"
)
grafana_loki = Release.get(
"grafana-loki", "monitoring-system/loki-stack"
)

# Add service for metrics endpoint for Argo Workflows
argo_workflows_metrics_svc = Service(
"argo-workflows-metrics-svc",
metadata=ObjectMetaArgs(
name="workflow-controller-metrics",
namespace=args.argo_server_ns,
labels={"app": "workflow-controller"},
),
spec=ServiceSpecArgs(
cluster_ip=None,
ports=[
ServicePortArgs(
name="metrics",
port=9090,
protocol="TCP",
target_port=9090,
),
],
selector={"app": "workflow-controller"},
),
opts=child_opts,
)

case K8sEnvironment.K3S:
monitoring_ns = Namespace(
"monitoring-system",
metadata=ObjectMetaArgs(
name="monitoring-system",
labels={"name": "monitoring-system"},
),
opts=child_opts,
)

# Start by deploying the monitoring stack
# 1. Prometheus Operator
# 2. Grafana Loki
prometheus_operator = Release(
"monitoring-operator",
ReleaseArgs(
chart="kube-prometheus-stack",
version="75.15.1",
repository_opts={
"repo": "https://prometheus-community.github.io/helm-charts"
},
namespace=monitoring_ns.metadata.name, # Compatibility with Dawn
create_namespace=False,
values={
"alertmanager": {
"alertmanagerSpec": {
"retention": "168h",
"storage": {
"volumeClaimTemplate": {
"spec": {
"accessMode": ["ReadWriteOnce"],
"resources": {
"requests": {"storage": "3Gi"}
},
}
}
},
}
},
"prometheus": {
"prometheusSpec": {
"retention": "4d",
"retentionSize": "2GiB",
"storageSpec": {
"volumeClaimTemplate": {
"spec": {
"accessModes": ["ReadWriteOnce"],
"resources": {
"requests": {"storage": "3Gi"}
},
}
}
},
}
},
},
),
opts=child_opts,
)

grafana_loki = Release(
"grafana-loki",
ReleaseArgs(
chart="loki-stack",
version="2.10.2",
repository_opts={
"repo": "https://grafana.github.io/helm-charts"
},
namespace=monitoring_ns.metadata.name,
create_namespace=False,
),
opts=child_opts,
)

# Add service for metrics endpoint for Argo Workflows
argo_workflows_metrics_svc = Service(
"argo-workflows-metrics-svc",
metadata=ObjectMetaArgs(
name="workflow-controller-metrics",
namespace=args.argo_server_ns,
labels={"app": "workflow-controller"},
),
spec=ServiceSpecArgs(
cluster_ip=None,
ports=[
ServicePortArgs(
name="metrics",
port=9090,
protocol="TCP",
target_port=9090,
),
],
selector={"app": "workflow-controller"},
),
opts=child_opts,
)

# Add service monitor to allow Prometheus to scrape the metrics
# Note: Pulumi has no native support for ServiceMonitor,
# so using ConfigFile to deploy
argo_workflows_svc_monitor = ConfigFile(
"argo-workflows-service-monitor",
file="./k8s/argo_workflows/prometheus.yaml",
opts=ResourceOptions(
depends_on=[argo_workflows_metrics_svc],
),
)

self.register_outputs(
{
"argo_workflows_metrics_svc": argo_workflows_metrics_svc,
"argo_workflows_svc_monitor": argo_workflows_svc_monitor,
"namespace": monitoring_ns.metadata.name,
"grafana_loki": grafana_loki,
"prometheus_operator": prometheus_operator,
}
)
7 changes: 7 additions & 0 deletions infra/fridge/components/network_policies.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ def __init__(
file="./k8s/cilium/k3s.yaml",
opts=child_opts,
)
# Add network policy to allow Prometheus monitoring for resources already deployed on Dawn
# On Dawn, Prometheus is also already deployed
ConfigFile(
"network_policy_prometheus",
file="./k8s/cilium/prometheus.yaml",
opts=child_opts,
)

ConfigFile(
"network_policy_argo_workflows",
Expand Down
11 changes: 11 additions & 0 deletions infra/fridge/k8s/argo_workflows/prometheus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# ServiceMonitor for the Argo Workflows controller metrics endpoint.
# It selects Services labelled app=workflow-controller and scrapes their
# port named "metrics".
# NOTE(review): depending on how the Prometheus instance selects
# ServiceMonitors, extra labels (e.g. a Helm release label) may be required
# on this object - confirm against the deployed Prometheus configuration.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: argo-workflows
  # NOTE(review): the namespace is hard-coded here; verify it matches the
  # namespace in which the metrics Service is actually created.
  namespace: argo-server
spec:
  endpoints:
    # Refers to the Service port by name.
    - port: metrics
  selector:
    matchLabels:
      app: workflow-controller
2 changes: 2 additions & 0 deletions infra/fridge/k8s/argo_workflows/values.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
singleNamespace: false
controller:
metricsConfig:
enabled: true
securityContext:
allowPrivilegeEscalation: false
capabilities:
Expand Down
Loading