Feature/vcluster #421

Open · wants to merge 2 commits into base: main
20 changes: 18 additions & 2 deletions README.md
@@ -14,7 +14,7 @@
<div align="center">
<h1 align="center">Kubernetes Resource Recommendations Based on Historical Data</h1>
<h2 align="center">Get recommendations based on your existing data in Prometheus/Coralogix/Thanos/Mimir and more!</h2>
<p align="center">
<a href="#installation"><strong>Installation</strong></a>
.
<a href="#how-krr-works"><strong>How KRR works</strong></a>
@@ -202,7 +202,7 @@ Apart from running KRR as a CLI tool you can also run KRR inside your cluster. W

<img src="./images/ui_recommendation.png">

You can also run KRR in-cluster as a Kubernetes Job, if you don't want to view results easily in a <a href="https://platform.robusta.dev/signup/?benefits=krr&utm_source=github&utm_medium=krr-readme&utm_content=in-cluster-ui">UI</a>.

```
kubectl apply -f https://raw.githubusercontent.com/robusta-dev/krr/refs/heads/main/docs/krr-in-cluster/krr-in-cluster-job.yaml
@@ -400,6 +400,22 @@ Refer to `krr simple --help`, and look at the flags `--prometheus-url`, `--prome
If you need help, contact us on Slack, email, or by opening a GitHub issue.
</details>

<details>
<summary>VCluster</summary>

KRR supports VCluster when Prometheus runs outside of the VCluster (on the physical host cluster or in a centralized setup). Because VCluster renames pods on the host cluster (for example, a pod `my-pod` in namespace `default` inside a VCluster named `my-vcluster` appears on the host as `my-pod-x-default-x-my-vcluster`), you need to provide:

- `--vcluster-namespace`: the namespace on the physical cluster where the VCluster is deployed
- `--vcluster-name`: the name of your VCluster (set during VCluster deployment)

Other parameters, such as the namespace and pod selectors, work as expected.

```sh
krr simple --vcluster-name my-vcluster-name --vcluster-namespace my-vcluster-namespace
```

</details>

<details>
<summary>Debug mode</summary>
If you want to see additional debug logs:
2 changes: 1 addition & 1 deletion requirements.txt
@@ -53,4 +53,4 @@ tzlocal==5.2 ; python_version >= "3.9" and python_full_version < "3.13"
urllib3==1.26.19 ; python_version >= "3.9" and python_full_version < "3.13"
websocket-client==1.7.0 ; python_version >= "3.9" and python_full_version < "3.13"
zipp==3.19.2 ; python_version >= "3.9" and python_version < "3.13"
tenacity==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
41 changes: 41 additions & 0 deletions robusta_krr/core/integrations/prometheus/metrics/base.py
@@ -4,6 +4,7 @@
import asyncio
import datetime
import enum
import hashlib
from concurrent.futures import ThreadPoolExecutor
from functools import reduce
from typing import Any, Optional, TypedDict
@@ -259,3 +260,43 @@ def combine_batches(self, results: list[PodsTimeData]) -> PodsTimeData:
"""

return reduce(lambda x, y: x | y, results, {})

## Vcluster
def get_vcluster_pod_real_name(self, pod_name: str, pod_namespace: str) -> str:
"""
Returns the pod name on the (host) cluster, which is different from the pod name in the VCluster.
When not in a VCluster, just returns the pod name as is.

Args:
pod_name (str): The pod name in the cluster KRR is connected to.
pod_namespace (str): The pod namespace in the cluster KRR is connected to.

Returns:
str: The pod name in the host cluster.
"""

if settings.vcluster_name is None:
return pod_name
else:
host_pod_name = f"{pod_name}-x-{pod_namespace}-x-{settings.vcluster_name}"
if len(host_pod_name) > 63:
host_pod_name_sha256 = hashlib.sha256(host_pod_name.encode()).hexdigest()
host_pod_name = f"{host_pod_name[:52]}-{host_pod_name_sha256[:10]}"
return host_pod_name

def get_pod_namespace(self, pod_namespace: str) -> str:
"""
Returns the pod namespace on the (host) cluster, which is different from the pod namespace in the VCluster.
When not in a VCluster, just returns the pod namespace as is.

Args:
pod_namespace (str): The pod namespace in the cluster KRR is connected to.

Returns:
str: The pod namespace in the host cluster.
"""

if settings.vcluster_namespace is None:
return pod_namespace
else:
return settings.vcluster_namespace
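
For reference, here is a standalone sketch of the renaming scheme the new helper implements (the function name and example values below are illustrative, not part of the PR):

```python
import hashlib


def vcluster_host_pod_name(pod_name: str, pod_namespace: str, vcluster_name: str) -> str:
    # A virtual pod is exposed on the host cluster as
    # "<pod>-x-<namespace>-x-<vcluster>", truncated to 63 characters
    # with a sha256-derived suffix when the combined name is too long.
    host_pod_name = f"{pod_name}-x-{pod_namespace}-x-{vcluster_name}"
    if len(host_pod_name) > 63:
        digest = hashlib.sha256(host_pod_name.encode()).hexdigest()
        host_pod_name = f"{host_pod_name[:52]}-{digest[:10]}"
    return host_pod_name


# Short names pass through untruncated:
print(vcluster_host_pod_name("nginx-7d5c", "default", "my-vcluster"))
# -> nginx-7d5c-x-default-x-my-vcluster
```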
50 changes: 31 additions & 19 deletions robusta_krr/core/integrations/prometheus/metrics/cpu.py
@@ -1,8 +1,10 @@
from robusta_krr.core.models.objects import K8sObjectData

from .base import PrometheusMetric, QueryType
import logging


logger = logging.getLogger("krr")

class CPULoader(PrometheusMetric):
"""
A metric loader for loading CPU usage metrics.
@@ -11,20 +13,24 @@ class CPULoader(PrometheusMetric):
query_type: QueryType = QueryType.QueryRange

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
max(
rate(
container_cpu_usage_seconds_total{{
namespace="{object.namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
}}[{step}]
)
) by (container, pod, job)
"""
rate(
container_cpu_usage_seconds_total{{
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
}}[{step}]
)
) by (container, pod, job)
"""
logger.debug(f"{prom_query}")

return prom_query


def PercentileCPULoader(percentile: float) -> type[PrometheusMetric]:
@@ -37,15 +43,16 @@ def PercentileCPULoader(percentile: float) -> type[PrometheusMetric]:

class PercentileCPULoader(PrometheusMetric):
def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
quantile_over_time(
{round(percentile / 100, 2)},
max(
rate(
container_cpu_usage_seconds_total{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -55,6 +62,8 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""
logger.debug(f"{prom_query}")
return prom_query

return PercentileCPULoader

@@ -65,13 +74,14 @@ class CPUAmountLoader(PrometheusMetric):
"""

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
count_over_time(
max(
container_cpu_usage_seconds_total{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -80,3 +90,5 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""
logger.debug(f"{prom_query}")
return prom_query
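
To illustrate what the translated CPU query ends up looking like, here is a hedged sketch using hypothetical pod, container, and VCluster names (the 5m step is also made up):

```python
# Hypothetical values, for illustration only.
vcluster_name = "my-vcluster"
vcluster_namespace = "vcluster-ns"   # namespace on the host cluster where the VCluster runs
virtual_namespace = "default"        # namespace as seen inside the VCluster
virtual_pods = ["web-6f9c", "web-7b2d"]

# These names are short, so the sha256 truncation from base.py never triggers here.
pods_selector = "|".join(f"{p}-x-{virtual_namespace}-x-{vcluster_name}" for p in virtual_pods)

query = f"""
max(
    rate(
        container_cpu_usage_seconds_total{{
            namespace="{vcluster_namespace}",
            pod=~"{pods_selector}",
            container="web"
        }}[5m]
    )
) by (container, pod, job)
"""
print(query)
```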
43 changes: 28 additions & 15 deletions robusta_krr/core/integrations/prometheus/metrics/memory.py
@@ -1,7 +1,9 @@
from robusta_krr.core.models.objects import K8sObjectData

from .base import PrometheusMetric, QueryType
import logging

logger = logging.getLogger("krr")

class MemoryLoader(PrometheusMetric):
"""
@@ -11,18 +13,21 @@ class MemoryLoader(PrometheusMetric):
query_type: QueryType = QueryType.QueryRange

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
max(
container_memory_working_set_bytes{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
}}
) by (container, pod, job)
"""
logger.debug(f"{prom_query}")
return prom_query


class MaxMemoryLoader(PrometheusMetric):
@@ -31,13 +36,14 @@ class MaxMemoryLoader(PrometheusMetric):
"""

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
max_over_time(
max(
container_memory_working_set_bytes{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -46,21 +52,23 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""

logger.debug(f"{prom_query}")
return prom_query

class MemoryAmountLoader(PrometheusMetric):
"""
A metric loader for loading memory points count.
"""

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
count_over_time(
max(
container_memory_working_set_bytes{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -69,7 +77,9 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""

logger.debug(f"{prom_query}")
return prom_query

# TODO: Need to battle test if this one is correct.
class MaxOOMKilledMemoryLoader(PrometheusMetric):
"""
@@ -79,15 +89,16 @@ class MaxOOMKilledMemoryLoader(PrometheusMetric):
warning_on_no_data = False

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
max_over_time(
max(
max(
kube_pod_container_resource_limits{{
resource="memory",
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -97,7 +108,7 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
max(
kube_pod_container_status_last_terminated_reason{{
reason="OOMKilled",
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -107,3 +118,5 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""
logger.debug(f"{prom_query}")
return prom_query
4 changes: 4 additions & 0 deletions robusta_krr/core/models/config.py
@@ -71,6 +71,10 @@ class Config(pd.BaseSettings):
inside_cluster: bool = False
_logging_console: Optional[Console] = pd.PrivateAttr(None)

# vcluster settings
vcluster_name: Optional[str] = pd.Field(None)
vcluster_namespace: Optional[str] = pd.Field(None)

def __init__(self, **kwargs: Any) -> None:
super().__init__(**kwargs)

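
As a minimal sketch of the new settings in isolation, assuming pydantic v1 (which provides `BaseSettings`, matching the `Config(pd.BaseSettings)` base class above): both fields default to `None`, so runs without the new flags behave exactly as before.

```python
from typing import Optional

import pydantic as pd  # assumes pydantic v1, which provides BaseSettings


class VClusterSettings(pd.BaseSettings):
    # Mirrors the two fields added to Config; both default to None,
    # so existing (non-VCluster) invocations are unaffected.
    vcluster_name: Optional[str] = pd.Field(None)
    vcluster_namespace: Optional[str] = pd.Field(None)


print(VClusterSettings())                             # both fields are None
print(VClusterSettings(vcluster_name="my-vcluster"))  # namespace still defaults to None
```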
14 changes: 14 additions & 0 deletions robusta_krr/main.py
@@ -266,6 +266,18 @@ def run_strategy(
help="Send to output to a slack channel, must have SLACK_BOT_TOKEN",
rich_help_panel="Output Settings",
),
vcluster_namespace: str = typer.Option(
None,
"--vcluster-namespace",
help="The vcluster namespace on physical cluster",
rich_help_panel="VCluster Settings",
),
vcluster_name: str = typer.Option(
None,
"--vcluster-name",
help="The vcluster name on physical cluster",
rich_help_panel="VCluster Settings",
),
**strategy_args,
) -> None:
f"""Run KRR using the `{_strategy_name}` strategy"""
@@ -310,6 +322,8 @@ def run_strategy(
show_severity=show_severity,
strategy=_strategy_name,
other_args=strategy_args,
vcluster_namespace=vcluster_namespace,
vcluster_name=vcluster_name,
)
Config.set_config(config)
except ValidationError:
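
For completeness, a stripped-down, hypothetical CLI sketch of the same typer pattern the PR uses to declare and forward the two new flags (the command name and help text here are illustrative):

```python
from typing import Optional

import typer

app = typer.Typer()


@app.command()
def simple(
    vcluster_namespace: Optional[str] = typer.Option(
        None,
        "--vcluster-namespace",
        help="Namespace on the physical (host) cluster where the VCluster runs",
    ),
    vcluster_name: Optional[str] = typer.Option(
        None,
        "--vcluster-name",
        help="Name of the VCluster, as set at deployment time",
    ),
) -> None:
    # In KRR these values are passed into Config(...) and later read back via
    # the global settings object when Prometheus queries are built.
    typer.echo(f"vcluster_name={vcluster_name} vcluster_namespace={vcluster_namespace}")


if __name__ == "__main__":
    app()
```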