diff --git a/README.md b/README.md
index e6fc35f..4decc83 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@
+
Installation
.
How KRR works
@@ -202,7 +202,7 @@ Apart from running KRR as a CLI tool you can also run KRR inside your cluster. W
-You can also run KRR in-cluster as a Kubernetes Job, if you don't want to view results easily in a UI.
+You can also run KRR in-cluster as a Kubernetes Job if you don't need to view the results in a UI.
```
kubectl apply -f https://raw.githubusercontent.com/robusta-dev/krr/refs/heads/main/docs/krr-in-cluster/krr-in-cluster-job.yaml
@@ -400,6 +400,22 @@ Refer to `krr simple --help`, and look at the flags `--prometheus-url`, `--prome
If you need help, contact us on Slack, email, or by opening a GitHub issue.
+
+VCluster
+
+KRR supports VCluster when Prometheus runs outside of the VCluster (on the physical host cluster or a centralized instance). Because VCluster renames pods on the host cluster, you need to provide:
+
+- `--vcluster-namespace`: The namespace on the physical (host) cluster where the VCluster is deployed
+- `--vcluster-name`: The name of your VCluster (set during VCluster deployment)
+
+Other parameters, such as the namespace and pod selectors, work as expected.
+
+```sh
+krr simple --vcluster-name my-vcluster-name --vcluster-namespace my-vcluster-namespace
+```
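+
+For reference (illustrative names): with `--vcluster-name my-vcluster-name`, a pod named `my-app-abc12` in VCluster namespace `team-a` is queried in Prometheus under its host-cluster name `my-app-abc12-x-team-a-x-my-vcluster-name`, inside the namespace given by `--vcluster-namespace`. Names longer than 63 characters are truncated and suffixed with a short hash.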
+
+Debug mode
If you want to see additional debug logs:
diff --git a/requirements.txt b/requirements.txt
index 246f68b..b181094 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -53,4 +53,4 @@ tzlocal==5.2 ; python_version >= "3.9" and python_full_version < "3.13"
urllib3==1.26.19 ; python_version >= "3.9" and python_full_version < "3.13"
websocket-client==1.7.0 ; python_version >= "3.9" and python_full_version < "3.13"
zipp==3.19.2 ; python_version >= "3.9" and python_version < "3.13"
-tenacity==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
\ No newline at end of file
+tenacity==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
diff --git a/robusta_krr/core/integrations/prometheus/metrics/base.py b/robusta_krr/core/integrations/prometheus/metrics/base.py
index 347e6b9..1698b09 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/base.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/base.py
@@ -4,6 +4,7 @@
import asyncio
import datetime
import enum
+import hashlib
from concurrent.futures import ThreadPoolExecutor
from functools import reduce
from typing import Any, Optional, TypedDict
@@ -259,3 +260,43 @@ def combine_batches(self, results: list[PodsTimeData]) -> PodsTimeData:
"""
return reduce(lambda x, y: x | y, results, {})
+
+    # VCluster helpers
+ def get_vcluster_pod_real_name(self, pod_name: str, pod_namespace: str) -> str:
+ """
+        Returns the pod name on the host (physical) cluster, which differs from the pod name inside the VCluster.
+        When not running against a VCluster, returns the pod name unchanged.
+
+        Args:
+            pod_name (str): The pod name in the cluster KRR is connected to
+            pod_namespace (str): The pod namespace in the cluster KRR is connected to
+
+        Returns:
+            str: The pod name in the host cluster.
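+
+        Example (illustrative values, assuming settings.vcluster_name == "my-vcluster"):
+            get_vcluster_pod_real_name("web-abc12", "team-a")
+            # -> "web-abc12-x-team-a-x-my-vcluster"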
+ """
+
+ if settings.vcluster_name is None:
+ return pod_name
+ else:
+ host_pod_name = f"{pod_name}-x-{pod_namespace}-x-{settings.vcluster_name}"
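+            # Names longer than 63 characters (the DNS label limit) are shortened to their first
+            # 52 characters plus a 10-character SHA-256 suffix, presumably mirroring VCluster's
+            # own name translation.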
+ if len(host_pod_name) > 63:
+ host_pod_name_sha256 = hashlib.sha256(host_pod_name.encode()).hexdigest()
+ host_pod_name = f"{host_pod_name[:52]}-{host_pod_name_sha256[:10]}"
+ return host_pod_name
+
+ def get_pod_namespace(self, pod_namespace: str) -> str:
+ """
+        Returns the pod namespace on the host (physical) cluster, which differs from the namespace inside the VCluster.
+        When running against a VCluster, all pods are synced into the single host namespace given by
+        `vcluster_namespace`; when not in a VCluster, returns the pod namespace unchanged.
+
+        Args:
+            pod_namespace (str): The pod namespace in the cluster KRR is connected to
+
+        Returns:
+            str: The pod namespace in the host cluster.
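+
+        Example (illustrative values, assuming settings.vcluster_namespace == "vcluster-my-vcluster"):
+            get_pod_namespace("team-a")  # -> "vcluster-my-vcluster"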
+ """
+
+ if settings.vcluster_namespace is None:
+ return pod_namespace
+ else:
+ return settings.vcluster_namespace
diff --git a/robusta_krr/core/integrations/prometheus/metrics/cpu.py b/robusta_krr/core/integrations/prometheus/metrics/cpu.py
index c7a2c73..98954d0 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/cpu.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/cpu.py
@@ -1,8 +1,10 @@
from robusta_krr.core.models.objects import K8sObjectData
from .base import PrometheusMetric, QueryType
+import logging
-
+logger = logging.getLogger("krr")
+
class CPULoader(PrometheusMetric):
"""
A metric loader for loading CPU usage metrics.
@@ -11,20 +13,24 @@ class CPULoader(PrometheusMetric):
query_type: QueryType = QueryType.QueryRange
def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
- pods_selector = "|".join(pod.name for pod in object.pods)
+ pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+ pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
- return f"""
+ prom_query = f"""
max(
- rate(
- container_cpu_usage_seconds_total{{
- namespace="{object.namespace}",
- pod=~"{pods_selector}",
- container="{object.container}"
- {cluster_label}
- }}[{step}]
- )
- ) by (container, pod, job)
- """
+ rate(
+ container_cpu_usage_seconds_total{{
+ namespace="{pods_namespace}",
+ pod=~"{pods_selector}",
+ container="{object.container}"
+ {cluster_label}
+ }}[{step}]
+ )
+ ) by (container, pod, job)
+ """
+        logger.debug(prom_query)
+
+        return prom_query
def PercentileCPULoader(percentile: float) -> type[PrometheusMetric]:
@@ -37,15 +43,16 @@ def PercentileCPULoader(percentile: float) -> type[PrometheusMetric]:
class PercentileCPULoader(PrometheusMetric):
def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
- pods_selector = "|".join(pod.name for pod in object.pods)
+ pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+ pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
- return f"""
+ prom_query = f"""
quantile_over_time(
{round(percentile / 100, 2)},
max(
rate(
container_cpu_usage_seconds_total{{
- namespace="{object.namespace}",
+ namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -55,6 +62,8 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""
+            logger.debug(prom_query)
+            return prom_query
return PercentileCPULoader
@@ -65,13 +74,14 @@ class CPUAmountLoader(PrometheusMetric):
"""
def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
- pods_selector = "|".join(pod.name for pod in object.pods)
+ pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+ pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
- return f"""
+ prom_query = f"""
count_over_time(
max(
container_cpu_usage_seconds_total{{
- namespace="{object.namespace}",
+ namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -80,3 +90,5 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""
+        logger.debug(prom_query)
+        return prom_query
diff --git a/robusta_krr/core/integrations/prometheus/metrics/memory.py b/robusta_krr/core/integrations/prometheus/metrics/memory.py
index 85dfba6..faa78ac 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/memory.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/memory.py
@@ -1,7 +1,9 @@
from robusta_krr.core.models.objects import K8sObjectData
from .base import PrometheusMetric, QueryType
+import logging
+logger = logging.getLogger("krr")
class MemoryLoader(PrometheusMetric):
"""
@@ -11,18 +13,21 @@ class MemoryLoader(PrometheusMetric):
query_type: QueryType = QueryType.QueryRange
def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
- pods_selector = "|".join(pod.name for pod in object.pods)
+ pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+ pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
- return f"""
+ prom_query = f"""
max(
container_memory_working_set_bytes{{
- namespace="{object.namespace}",
+ namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
}}
) by (container, pod, job)
"""
+        logger.debug(prom_query)
+        return prom_query
class MaxMemoryLoader(PrometheusMetric):
@@ -31,13 +36,14 @@ class MaxMemoryLoader(PrometheusMetric):
"""
def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
- pods_selector = "|".join(pod.name for pod in object.pods)
+ pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+ pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
- return f"""
+ prom_query = f"""
max_over_time(
max(
container_memory_working_set_bytes{{
- namespace="{object.namespace}",
+ namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -46,7 +52,8 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""
-
+        logger.debug(prom_query)
+        return prom_query
class MemoryAmountLoader(PrometheusMetric):
"""
@@ -54,13 +61,14 @@ class MemoryAmountLoader(PrometheusMetric):
"""
def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
- pods_selector = "|".join(pod.name for pod in object.pods)
+ pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+ pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
- return f"""
+ prom_query = f"""
count_over_time(
max(
container_memory_working_set_bytes{{
- namespace="{object.namespace}",
+ namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -69,7 +77,9 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""
-
+        logger.debug(prom_query)
+        return prom_query
+
# TODO: Need to battle test if this one is correct.
class MaxOOMKilledMemoryLoader(PrometheusMetric):
"""
@@ -79,15 +89,16 @@ class MaxOOMKilledMemoryLoader(PrometheusMetric):
warning_on_no_data = False
def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
- pods_selector = "|".join(pod.name for pod in object.pods)
+ pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+ pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
- return f"""
+ prom_query = f"""
max_over_time(
max(
max(
kube_pod_container_resource_limits{{
resource="memory",
- namespace="{object.namespace}",
+ namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -97,7 +108,7 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
max(
kube_pod_container_status_last_terminated_reason{{
reason="OOMKilled",
- namespace="{object.namespace}",
+ namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -107,3 +118,5 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""
+        logger.debug(prom_query)
+        return prom_query
\ No newline at end of file
diff --git a/robusta_krr/core/models/config.py b/robusta_krr/core/models/config.py
index 32241ed..ca2cc5e 100644
--- a/robusta_krr/core/models/config.py
+++ b/robusta_krr/core/models/config.py
@@ -71,6 +71,10 @@ class Config(pd.BaseSettings):
inside_cluster: bool = False
_logging_console: Optional[Console] = pd.PrivateAttr(None)
+ # vcluster settings
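+    # Both default to None and stay None unless --vcluster-name / --vcluster-namespace are passed.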
+ vcluster_name: Optional[str] = pd.Field(None)
+ vcluster_namespace: Optional[str] = pd.Field(None)
+
def __init__(self, **kwargs: Any) -> None:
super().__init__(**kwargs)
diff --git a/robusta_krr/main.py b/robusta_krr/main.py
index 7159bdd..8c3542d 100644
--- a/robusta_krr/main.py
+++ b/robusta_krr/main.py
@@ -266,6 +266,18 @@ def run_strategy(
help="Send to output to a slack channel, must have SLACK_BOT_TOKEN",
rich_help_panel="Output Settings",
),
+    vcluster_namespace: Optional[str] = typer.Option(
+        None,
+        "--vcluster-namespace",
+        help="The namespace on the physical (host) cluster where the VCluster is deployed",
+        rich_help_panel="VCluster Settings",
+    ),
+    vcluster_name: Optional[str] = typer.Option(
+        None,
+        "--vcluster-name",
+        help="The name of the VCluster (as set during VCluster deployment)",
+        rich_help_panel="VCluster Settings",
+    ),
**strategy_args,
) -> None:
f"""Run KRR using the `{_strategy_name}` strategy"""
@@ -310,6 +322,8 @@ def run_strategy(
show_severity=show_severity,
strategy=_strategy_name,
other_args=strategy_args,
+ vcluster_namespace=vcluster_namespace,
+ vcluster_name=vcluster_name,
)
Config.set_config(config)
except ValidationError: