Feature/vcluster #421

Open · wants to merge 2 commits into base: main
20 changes: 18 additions & 2 deletions README.md
@@ -14,7 +14,7 @@
<div align="center">
<h1 align="center">Kubernetes Resource Recommendations Based on Historical Data</h1>
<h2 align="center">Get recommendations based on your existing data in Prometheus/Coralogix/Thanos/Mimir and more!</h2>
<p align="center">
<a href="#installation"><strong>Installation</strong></a>
.
<a href="#how-krr-works"><strong>How KRR works</strong></a>
@@ -202,7 +202,7 @@ Apart from running KRR as a CLI tool you can also run KRR inside your cluster. W

<img src="./images/ui_recommendation.png">

You can also run KRR in-cluster as a Kubernetes Job, if you don't want to view results easily in a <a href="https://platform.robusta.dev/signup/?benefits=krr&utm_source=github&utm_medium=krr-readme&utm_content=in-cluster-ui">UI</a>.

```
kubectl apply -f https://raw.githubusercontent.com/robusta-dev/krr/refs/heads/main/docs/krr-in-cluster/krr-in-cluster-job.yaml
@@ -400,6 +400,22 @@ Refer to `krr simple --help`, and look at the flags `--prometheus-url`, `--prome
If you need help, contact us on Slack, email, or by opening a GitHub issue.
</details>

<details>
<summary>VCluster</summary>

KRR supports VCluster when Prometheus runs outside of the VCluster (on the physical host cluster or in a centralized setup). Because VCluster renames pods on the host cluster (for example, a pod `my-pod` in namespace `default` inside a VCluster named `my-vcluster` appears on the host as `my-pod-x-default-x-my-vcluster`), you need to provide:

- `--vcluster-namespace`: the namespace on the physical cluster where the VCluster is deployed
- `--vcluster-name`: the name of your VCluster (set during VCluster deployment)

Other parameters, such as the namespace and pod selectors, work as expected.

```sh
krr simple --vcluster-name my-vcluster-name --vcluster-namespace my-vcluster-namespace
```

</details>

<details>
<summary>Debug mode</summary>
If you want to see additional debug logs:
2 changes: 1 addition & 1 deletion requirements.txt
@@ -53,4 +53,4 @@ tzlocal==5.2 ; python_version >= "3.9" and python_full_version < "3.13"
urllib3==1.26.19 ; python_version >= "3.9" and python_full_version < "3.13"
websocket-client==1.7.0 ; python_version >= "3.9" and python_full_version < "3.13"
zipp==3.19.2 ; python_version >= "3.9" and python_version < "3.13"
tenacity==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
41 changes: 41 additions & 0 deletions robusta_krr/core/integrations/prometheus/metrics/base.py
@@ -4,6 +4,7 @@
import asyncio
import datetime
import enum
import hashlib
from concurrent.futures import ThreadPoolExecutor
from functools import reduce
from typing import Any, Optional, TypedDict
@@ -259,3 +260,43 @@ def combine_batches(self, results: list[PodsTimeData]) -> PodsTimeData:
"""

return reduce(lambda x, y: x | y, results, {})

## Vcluster
def get_vcluster_pod_real_name(self, pod_name: str, pod_namespace: str) -> str:
"""
Returns the pod name on the (host) cluster, which is different from the pod name in the VCluster.
When not in a VCluster, just returns the pod name as is.

Args:
pod_name (str): The pod name in the cluster KRR is connected to.
pod_namespace (str): The pod namespace in the cluster KRR is connected to.

Returns:
str: The pod name in the host cluster.
"""

if settings.vcluster_name is None:
return pod_name
else:
host_pod_name = f"{pod_name}-x-{pod_namespace}-x-{settings.vcluster_name}"
if len(host_pod_name) > 63:
host_pod_name_sha256 = hashlib.sha256(host_pod_name.encode()).hexdigest()
host_pod_name = f"{host_pod_name[:52]}-{host_pod_name_sha256[:10]}"
return host_pod_name

def get_pod_namespace(self, pod_namespace: str) -> str:
"""
Returns the pod namespace on the (host) cluster, which is different from the pod namespace in the VCluster.
When not in a VCluster, just returns the pod namespace as is.

Args:
pod_namespace (str): The pod namespace in the cluster KRR is connected to.

Returns:
str: The pod namespace in the host cluster.
"""

if settings.vcluster_namespace is None:
return pod_namespace
else:
return settings.vcluster_namespace
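
For reference, here is a standalone sketch of the renaming scheme the new helper implements (the function name and example values below are illustrative, not part of the PR):

```python
import hashlib


def vcluster_host_pod_name(pod_name: str, pod_namespace: str, vcluster_name: str) -> str:
    # A virtual pod is exposed on the host cluster as
    # "<pod>-x-<namespace>-x-<vcluster>", truncated to 63 characters
    # with a sha256-derived suffix when the combined name is too long.
    host_pod_name = f"{pod_name}-x-{pod_namespace}-x-{vcluster_name}"
    if len(host_pod_name) > 63:
        digest = hashlib.sha256(host_pod_name.encode()).hexdigest()
        host_pod_name = f"{host_pod_name[:52]}-{digest[:10]}"
    return host_pod_name


# Short names pass through untruncated:
print(vcluster_host_pod_name("nginx-7d5c", "default", "my-vcluster"))
# -> nginx-7d5c-x-default-x-my-vcluster
```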
50 changes: 31 additions & 19 deletions robusta_krr/core/integrations/prometheus/metrics/cpu.py
@@ -1,8 +1,10 @@
from robusta_krr.core.models.objects import K8sObjectData

from .base import PrometheusMetric, QueryType
import logging


logger = logging.getLogger("krr")

class CPULoader(PrometheusMetric):
"""
A metric loader for loading CPU usage metrics.
@@ -11,20 +13,24 @@ class CPULoader(PrometheusMetric):
query_type: QueryType = QueryType.QueryRange

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
max(
rate(
container_cpu_usage_seconds_total{{
namespace="{object.namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
}}[{step}]
)
) by (container, pod, job)
"""
rate(
container_cpu_usage_seconds_total{{
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
}}[{step}]
)
) by (container, pod, job)
"""
logger.debug(f"{prom_query}")

return prom_query


def PercentileCPULoader(percentile: float) -> type[PrometheusMetric]:
@@ -37,15 +43,16 @@ def PercentileCPULoader(percentile: float) -> type[PrometheusMetric]:

class PercentileCPULoader(PrometheusMetric):
def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
quantile_over_time(
{round(percentile / 100, 2)},
max(
rate(
container_cpu_usage_seconds_total{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -55,6 +62,8 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""
logger.debug(f"{prom_query}")
return prom_query

return PercentileCPULoader

@@ -65,13 +74,14 @@ class CPUAmountLoader(PrometheusMetric):
"""

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
count_over_time(
max(
container_cpu_usage_seconds_total{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -80,3 +90,5 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""
logger.debug(f"{prom_query}")
return prom_query
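
To illustrate what the translated CPU query ends up looking like, here is a hedged sketch using hypothetical pod, container, and VCluster names (the 5m step is also made up):

```python
# Hypothetical values, for illustration only.
vcluster_name = "my-vcluster"
vcluster_namespace = "vcluster-ns"   # namespace on the host cluster where the VCluster runs
virtual_namespace = "default"        # namespace as seen inside the VCluster
virtual_pods = ["web-6f9c", "web-7b2d"]

# These names are short, so the sha256 truncation from base.py never triggers here.
pods_selector = "|".join(f"{p}-x-{virtual_namespace}-x-{vcluster_name}" for p in virtual_pods)

query = f"""
max(
    rate(
        container_cpu_usage_seconds_total{{
            namespace="{vcluster_namespace}",
            pod=~"{pods_selector}",
            container="web"
        }}[5m]
    )
) by (container, pod, job)
"""
print(query)
```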
43 changes: 28 additions & 15 deletions robusta_krr/core/integrations/prometheus/metrics/memory.py
@@ -1,7 +1,9 @@
from robusta_krr.core.models.objects import K8sObjectData

from .base import PrometheusMetric, QueryType
import logging

logger = logging.getLogger("krr")

class MemoryLoader(PrometheusMetric):
"""
@@ -11,18 +13,21 @@ class MemoryLoader(PrometheusMetric):
query_type: QueryType = QueryType.QueryRange

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
max(
container_memory_working_set_bytes{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
}}
) by (container, pod, job)
"""
logger.debug(f"{prom_query}")
return prom_query


class MaxMemoryLoader(PrometheusMetric):
@@ -31,13 +36,14 @@ class MaxMemoryLoader(PrometheusMetric):
"""

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
max_over_time(
max(
container_memory_working_set_bytes{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -46,21 +52,23 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""

logger.debug(f"{prom_query}")
return prom_query

class MemoryAmountLoader(PrometheusMetric):
"""
A metric loader for loading memory points count.
"""

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
count_over_time(
max(
container_memory_working_set_bytes{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -69,7 +77,9 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""

logger.debug(f"{prom_query}")
return prom_query

# TODO: Need to battle test if this one is correct.
class MaxOOMKilledMemoryLoader(PrometheusMetric):
"""
@@ -79,15 +89,16 @@ class MaxOOMKilledMemoryLoader(PrometheusMetric):
warning_on_no_data = False

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
max_over_time(
max(
max(
kube_pod_container_resource_limits{{
resource="memory",
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -97,7 +108,7 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
max(
kube_pod_container_status_last_terminated_reason{{
reason="OOMKilled",
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
@@ -107,3 +118,5 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""
logger.debug(f"{prom_query}")
return prom_query
4 changes: 4 additions & 0 deletions robusta_krr/core/models/config.py
@@ -71,6 +71,10 @@ class Config(pd.BaseSettings):
inside_cluster: bool = False
_logging_console: Optional[Console] = pd.PrivateAttr(None)

# vcluster settings
vcluster_name: Optional[str] = pd.Field(None)
vcluster_namespace: Optional[str] = pd.Field(None)

def __init__(self, **kwargs: Any) -> None:
super().__init__(**kwargs)

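
As a minimal sketch of the new settings in isolation, assuming pydantic v1 (which provides `BaseSettings`, matching the `Config(pd.BaseSettings)` base class above): both fields default to `None`, so runs without the new flags behave exactly as before.

```python
from typing import Optional

import pydantic as pd  # assumes pydantic v1, which provides BaseSettings


class VClusterSettings(pd.BaseSettings):
    # Mirrors the two fields added to Config; both default to None,
    # so existing (non-VCluster) invocations are unaffected.
    vcluster_name: Optional[str] = pd.Field(None)
    vcluster_namespace: Optional[str] = pd.Field(None)


print(VClusterSettings())                             # both fields are None
print(VClusterSettings(vcluster_name="my-vcluster"))  # namespace still defaults to None
```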
14 changes: 14 additions & 0 deletions robusta_krr/main.py
@@ -266,6 +266,18 @@ def run_strategy(
help="Send to output to a slack channel, must have SLACK_BOT_TOKEN",
rich_help_panel="Output Settings",
),
vcluster_namespace: str = typer.Option(
None,
"--vcluster-namespace",
help="The vcluster namespace on physical cluster",
rich_help_panel="VCluster Settings",
),
vcluster_name: str = typer.Option(
None,
"--vcluster-name",
help="The vcluster name on physical cluster",
rich_help_panel="VCluster Settings",
),
**strategy_args,
) -> None:
f"""Run KRR using the `{_strategy_name}` strategy"""
@@ -310,6 +322,8 @@ def run_strategy(
show_severity=show_severity,
strategy=_strategy_name,
other_args=strategy_args,
vcluster_namespace=vcluster_namespace,
vcluster_name=vcluster_name,
)
Config.set_config(config)
except ValidationError:
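
For completeness, a stripped-down, hypothetical CLI sketch of the same typer pattern the PR uses to declare and forward the two new flags (the command name and help text here are illustrative):

```python
from typing import Optional

import typer

app = typer.Typer()


@app.command()
def simple(
    vcluster_namespace: Optional[str] = typer.Option(
        None,
        "--vcluster-namespace",
        help="Namespace on the physical (host) cluster where the VCluster runs",
    ),
    vcluster_name: Optional[str] = typer.Option(
        None,
        "--vcluster-name",
        help="Name of the VCluster, as set at deployment time",
    ),
) -> None:
    # In KRR these values are passed into Config(...) and later read back via
    # the global settings object when Prometheus queries are built.
    typer.echo(f"vcluster_name={vcluster_name} vcluster_namespace={vcluster_namespace}")


if __name__ == "__main__":
    app()
```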