diff --git a/README.md b/README.md
index e6fc35f..4decc83 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@

Kubernetes Resource Recommendations Based on Historical Data

Get recommendations based on your existing data in Prometheus/Coralogix/Thanos/Mimir and more!

-

+

 Installation . How KRR works
@@ -202,7 +202,7 @@ Apart from running KRR as a CLI tool you can also run KRR inside your cluster. W
 
-You can also run KRR in-cluster as a Kubernetes Job, if you don't want to view results easily in a UI.
+You can also run KRR in-cluster as a Kubernetes Job, if you don't want to view results easily in a UI.
 
 ```
 kubectl apply -f https://raw.githubusercontent.com/robusta-dev/krr/refs/heads/main/docs/krr-in-cluster/krr-in-cluster-job.yaml
 ```
@@ -400,6 +400,22 @@ Refer to `krr simple --help`, and look at the flags `--prometheus-url`, `--prome
 
 If you need help, contact us on Slack, email, or by opening a GitHub issue.
 
+
+VCluster
+
+KRR supports VCluster when Prometheus runs outside of the VCluster (on the physical cluster, or centralized). Because VCluster renames the pods it syncs to the physical cluster, you need to provide:
+
+- `--vcluster-namespace`: the namespace on the physical cluster in which the VCluster is deployed
+- `--vcluster-name`: the name of your VCluster (set when the VCluster was deployed)
+
+Other parameters, such as the namespace and pod selectors, work as expected.
+
+```sh
+krr simple --vcluster-name my-vcluster-name --vcluster-namespace my-vcluster-namespace
+```
+
+
+
 Debug mode
 
 If you want to see additional debug logs:
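For context, the reason the two flags are needed: VCluster renames pods when it syncs them to the physical cluster, so the names KRR reads from the VCluster's API do not match the labels stored by a Prometheus that scrapes the physical cluster. A minimal sketch of the naming scheme this change assumes (all names here are hypothetical; the actual translation lives in `base.py` further down):

```python
# Pod as seen inside the VCluster (hypothetical names)
pod_name = "my-app-7d9f6c5b8-abcde"
pod_namespace = "default"
vcluster_name = "my-vcluster-name"

# On the physical cluster, the synced pod is named <pod>-x-<namespace>-x-<vcluster name>
# and lives in the namespace the VCluster itself is deployed in (e.g. "my-vcluster-namespace").
host_pod_name = f"{pod_name}-x-{pod_namespace}-x-{vcluster_name}"
print(host_pod_name)  # my-app-7d9f6c5b8-abcde-x-default-x-my-vcluster-name
```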
diff --git a/requirements.txt b/requirements.txt
index 246f68b..b181094 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -53,4 +53,4 @@ tzlocal==5.2 ; python_version >= "3.9" and python_full_version < "3.13"
 urllib3==1.26.19 ; python_version >= "3.9" and python_full_version < "3.13"
 websocket-client==1.7.0 ; python_version >= "3.9" and python_full_version < "3.13"
 zipp==3.19.2 ; python_version >= "3.9" and python_version < "3.13"
-tenacity==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
\ No newline at end of file
+tenacity==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
diff --git a/robusta_krr/core/integrations/prometheus/metrics/base.py b/robusta_krr/core/integrations/prometheus/metrics/base.py
index 347e6b9..1698b09 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/base.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/base.py
@@ -4,6 +4,7 @@
 import asyncio
 import datetime
 import enum
+import hashlib
 from concurrent.futures import ThreadPoolExecutor
 from functools import reduce
 from typing import Any, Optional, TypedDict
@@ -259,3 +260,43 @@ def combine_batches(self, results: list[PodsTimeData]) -> PodsTimeData:
         """
 
         return reduce(lambda x, y: x | y, results, {})
+
+    ## VCluster
+    def get_vcluster_pod_real_name(self, pod_name: str, pod_namespace: str) -> str:
+        """
+        Returns the pod name on the physical (host) cluster, which differs from the pod name inside the VCluster.
+        When not running against a VCluster, returns the pod name as is.
+
+        Args:
+            pod_name (string): The pod name in the cluster KRR is connected to
+            pod_namespace (string): The pod namespace in the cluster KRR is connected to
+
+        Returns:
+            string: the pod name on the host cluster.
+        """
+
+        if settings.vcluster_name is None:
+            return pod_name
+        else:
+            host_pod_name = f"{pod_name}-x-{pod_namespace}-x-{settings.vcluster_name}"
+            if len(host_pod_name) > 63:
+                host_pod_name_sha256 = hashlib.sha256(host_pod_name.encode()).hexdigest()
+                host_pod_name = f"{host_pod_name[:52]}-{host_pod_name_sha256[:10]}"
+            return host_pod_name
+
+    def get_pod_namespace(self, pod_namespace: str) -> str:
+        """
+        Returns the pod namespace on the physical (host) cluster, which differs from the pod namespace inside the VCluster.
+        When not running against a VCluster, returns the pod namespace as is.
+
+        Args:
+            pod_namespace (string): The pod namespace in the cluster KRR is connected to
+
+        Returns:
+            string: the pod namespace on the host cluster.
+        """
+
+        if settings.vcluster_namespace is None:
+            return pod_namespace
+        else:
+            return settings.vcluster_namespace
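One non-obvious detail in `get_vcluster_pod_real_name` is the length cap: synced names longer than 63 characters (the DNS label limit that pod names generally have to respect) are truncated and suffixed with the first ten hex characters of a sha256 digest. A standalone sketch of that branch, with made-up names:

```python
import hashlib

# Hypothetical names that overflow 63 characters once combined
pod_name = "billing-report-generator-cronjob-29012345-abcde"
host_pod_name = f"{pod_name}-x-team-payments-x-production-vcluster"

if len(host_pod_name) > 63:
    digest = hashlib.sha256(host_pod_name.encode()).hexdigest()
    host_pod_name = f"{host_pod_name[:52]}-{digest[:10]}"

# 52 characters of the combined name + "-" + 10 hex characters of the digest
assert len(host_pod_name) == 63
print(host_pod_name)
```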
diff --git a/robusta_krr/core/integrations/prometheus/metrics/cpu.py b/robusta_krr/core/integrations/prometheus/metrics/cpu.py
index c7a2c73..98954d0 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/cpu.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/cpu.py
@@ -1,8 +1,10 @@
 from robusta_krr.core.models.objects import K8sObjectData
 
 from .base import PrometheusMetric, QueryType
+import logging
 
-
+logger = logging.getLogger("krr")
+
 class CPULoader(PrometheusMetric):
     """
     A metric loader for loading CPU usage metrics.
@@ -11,20 +13,24 @@ class CPULoader(PrometheusMetric):
     query_type: QueryType = QueryType.QueryRange
 
     def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
-        pods_selector = "|".join(pod.name for pod in object.pods)
+        pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+        pods_namespace = self.get_pod_namespace(object.namespace)
         cluster_label = self.get_prometheus_cluster_label()
-        return f"""
+        prom_query = f"""
             max(
-                rate(
-                    container_cpu_usage_seconds_total{{
-                        namespace="{object.namespace}",
-                        pod=~"{pods_selector}",
-                        container="{object.container}"
-                        {cluster_label}
-                    }}[{step}]
-                )
-            ) by (container, pod, job)
-        """
+                rate(
+                    container_cpu_usage_seconds_total{{
+                        namespace="{pods_namespace}",
+                        pod=~"{pods_selector}",
+                        container="{object.container}"
+                        {cluster_label}
+                    }}[{step}]
+                )
+            ) by (container, pod, job)
+        """
+        logger.debug(prom_query)
+
+        return prom_query
 
 
 def PercentileCPULoader(percentile: float) -> type[PrometheusMetric]:
@@ -37,15 +43,16 @@
     class PercentileCPULoader(PrometheusMetric):
 
         def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
-            pods_selector = "|".join(pod.name for pod in object.pods)
+            pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+            pods_namespace = self.get_pod_namespace(object.namespace)
             cluster_label = self.get_prometheus_cluster_label()
-            return f"""
+            prom_query = f"""
                 quantile_over_time(
                     {round(percentile / 100, 2)},
                     max(
                         rate(
                             container_cpu_usage_seconds_total{{
-                                namespace="{object.namespace}",
+                                namespace="{pods_namespace}",
                                 pod=~"{pods_selector}",
                                 container="{object.container}"
                                 {cluster_label}
@@ -55,6 +62,8 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
                     [{duration}:{step}]
                 )
             """
+            logger.debug(prom_query)
+            return prom_query
 
     return PercentileCPULoader
@@ -65,13 +74,14 @@ class CPUAmountLoader(PrometheusMetric):
     """
 
     def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
-        pods_selector = "|".join(pod.name for pod in object.pods)
+        pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+        pods_namespace = self.get_pod_namespace(object.namespace)
         cluster_label = self.get_prometheus_cluster_label()
-        return f"""
+        prom_query = f"""
             count_over_time(
                 max(
                     container_cpu_usage_seconds_total{{
-                        namespace="{object.namespace}",
+                        namespace="{pods_namespace}",
                         pod=~"{pods_selector}",
                         container="{object.container}"
                         {cluster_label}
@@ -80,3 +90,5 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
                 [{duration}:{step}]
             )
         """
+        logger.debug(prom_query)
+        return prom_query
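To make the net effect concrete, this is roughly the selector the CPU loaders now emit for a VCluster workload (hypothetical names and a simplified query; the real string is assembled by `CPULoader.get_query` above):

```python
# Two pods of "my-app" from the VCluster's "default" namespace, synced by a VCluster
# named "my-vcluster-name" that is deployed in the host namespace "my-vcluster-namespace".
pods = ["my-app-7d9f6c5b8-abcde", "my-app-7d9f6c5b8-fghij"]
pods_namespace = "my-vcluster-namespace"
pods_selector = "|".join(f"{p}-x-default-x-my-vcluster-name" for p in pods)

prom_query = (
    f'container_cpu_usage_seconds_total{{'
    f'namespace="{pods_namespace}", pod=~"{pods_selector}", container="my-app"}}'
)
print(prom_query)
```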
diff --git a/robusta_krr/core/integrations/prometheus/metrics/memory.py b/robusta_krr/core/integrations/prometheus/metrics/memory.py
index 85dfba6..faa78ac 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/memory.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/memory.py
@@ -1,7 +1,9 @@
 from robusta_krr.core.models.objects import K8sObjectData
 
 from .base import PrometheusMetric, QueryType
+import logging
 
+logger = logging.getLogger("krr")
 
 class MemoryLoader(PrometheusMetric):
     """
@@ -11,18 +13,21 @@ class MemoryLoader(PrometheusMetric):
     query_type: QueryType = QueryType.QueryRange
 
     def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
-        pods_selector = "|".join(pod.name for pod in object.pods)
+        pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+        pods_namespace = self.get_pod_namespace(object.namespace)
         cluster_label = self.get_prometheus_cluster_label()
-        return f"""
+        prom_query = f"""
             max(
                 container_memory_working_set_bytes{{
-                    namespace="{object.namespace}",
+                    namespace="{pods_namespace}",
                     pod=~"{pods_selector}",
                     container="{object.container}"
                     {cluster_label}
                 }}
             ) by (container, pod, job)
         """
+        logger.debug(prom_query)
+        return prom_query
 
 
 class MaxMemoryLoader(PrometheusMetric):
@@ -31,13 +36,14 @@
     """
 
     def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
-        pods_selector = "|".join(pod.name for pod in object.pods)
+        pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+        pods_namespace = self.get_pod_namespace(object.namespace)
         cluster_label = self.get_prometheus_cluster_label()
-        return f"""
+        prom_query = f"""
             max_over_time(
                 max(
                     container_memory_working_set_bytes{{
-                        namespace="{object.namespace}",
+                        namespace="{pods_namespace}",
                         pod=~"{pods_selector}",
                         container="{object.container}"
                         {cluster_label}
@@ -46,7 +52,8 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
                 [{duration}:{step}]
             )
         """
-
+        logger.debug(prom_query)
+        return prom_query
 
 class MemoryAmountLoader(PrometheusMetric):
     """
@@ -54,13 +61,14 @@
     """
 
     def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
-        pods_selector = "|".join(pod.name for pod in object.pods)
+        pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+        pods_namespace = self.get_pod_namespace(object.namespace)
         cluster_label = self.get_prometheus_cluster_label()
-        return f"""
+        prom_query = f"""
             count_over_time(
                 max(
                     container_memory_working_set_bytes{{
-                        namespace="{object.namespace}",
+                        namespace="{pods_namespace}",
                         pod=~"{pods_selector}",
                         container="{object.container}"
                         {cluster_label}
@@ -69,7 +77,9 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
                 [{duration}:{step}]
            )
         """
-
+        logger.debug(prom_query)
+        return prom_query
+
 # TODO: Need to battle test if this one is correct.
 class MaxOOMKilledMemoryLoader(PrometheusMetric):
     """
@@ -79,15 +89,16 @@
     warning_on_no_data = False
 
     def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
-        pods_selector = "|".join(pod.name for pod in object.pods)
+        pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+        pods_namespace = self.get_pod_namespace(object.namespace)
         cluster_label = self.get_prometheus_cluster_label()
-        return f"""
+        prom_query = f"""
             max_over_time(
                 max(
                     max(
                         kube_pod_container_resource_limits{{
                             resource="memory",
-                            namespace="{object.namespace}",
+                            namespace="{pods_namespace}",
                             pod=~"{pods_selector}",
                             container="{object.container}"
                             {cluster_label}
@@ -97,7 +108,7 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
                     max(
                         kube_pod_container_status_last_terminated_reason{{
                             reason="OOMKilled",
-                            namespace="{object.namespace}",
+                            namespace="{pods_namespace}",
                             pod=~"{pods_selector}",
                             container="{object.container}"
                             {cluster_label}
@@ -107,3 +118,5 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
                 [{duration}:{step}]
             )
         """
+        logger.debug(prom_query)
+        return prom_query
\ No newline at end of file
diff --git a/robusta_krr/core/models/config.py b/robusta_krr/core/models/config.py
index 32241ed..ca2cc5e 100644
--- a/robusta_krr/core/models/config.py
+++ b/robusta_krr/core/models/config.py
@@ -71,6 +71,10 @@ class Config(pd.BaseSettings):
     inside_cluster: bool = False
     _logging_console: Optional[Console] = pd.PrivateAttr(None)
 
+    # vcluster settings
+    vcluster_name: Optional[str] = pd.Field(None)
+    vcluster_namespace: Optional[str] = pd.Field(None)
+
     def __init__(self, **kwargs: Any) -> None:
         super().__init__(**kwargs)
diff --git a/robusta_krr/main.py b/robusta_krr/main.py
index 7159bdd..8c3542d 100644
--- a/robusta_krr/main.py
+++ b/robusta_krr/main.py
@@ -266,6 +266,18 @@ def run_strategy(
         help="Send to output to a slack channel, must have SLACK_BOT_TOKEN",
         rich_help_panel="Output Settings",
     ),
+    vcluster_namespace: str = typer.Option(
+        None,
+        "--vcluster-namespace",
+        help="The namespace on the physical cluster where the vcluster is deployed",
+        rich_help_panel="VCluster Settings",
+    ),
+    vcluster_name: str = typer.Option(
+        None,
+        "--vcluster-name",
+        help="The name of the vcluster (as set when it was deployed)",
+        rich_help_panel="VCluster Settings",
+    ),
     **strategy_args,
 ) -> None:
     f"""Run KRR using the `{_strategy_name}` strategy"""
@@ -310,6 +322,8 @@ def run_strategy(
         show_severity=show_severity,
         strategy=_strategy_name,
         other_args=strategy_args,
+        vcluster_namespace=vcluster_namespace,
+        vcluster_name=vcluster_name,
     )
         Config.set_config(config)
     except ValidationError:
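Finally, a minimal sketch of the defaulting behaviour of the two new `Config` fields, using a trimmed-down stand-in for the real class (this assumes the pydantic v1 style `BaseSettings`/`Field` API that `config.py` above already uses); when the flags are omitted both fields stay `None`, so non-VCluster runs are unaffected:

```python
from typing import Optional

import pydantic as pd  # assumes pydantic v1, which provides BaseSettings


class Config(pd.BaseSettings):
    # Stand-in for robusta_krr.core.models.config.Config, reduced to the new fields.
    vcluster_name: Optional[str] = pd.Field(None)
    vcluster_namespace: Optional[str] = pd.Field(None)


# Flags provided: the Prometheus metric loaders translate pod names and namespaces.
config = Config(vcluster_name="my-vcluster-name", vcluster_namespace="my-vcluster-namespace")
assert config.vcluster_name == "my-vcluster-name"

# Flags omitted: the loaders keep the original pod names and namespaces unchanged.
default = Config()
assert default.vcluster_name is None and default.vcluster_namespace is None
```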