Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 24 additions & 5 deletions src/xpk/core/kueue.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
run_command_with_updates,
run_command_with_updates_retry,
)
from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue
from .pathways import add_pw_resource_flavors
from .resources import AutoprovisioningConfig
from .scheduling import (
create_accelerator_label,
Expand Down Expand Up @@ -104,7 +104,6 @@
namespaceSelector: {{}} # match all.
resourceGroups:
{covered_resources_config}
{pw_resources_kueue}
{admission_checks}
---
apiVersion: kueue.x-k8s.io/v1beta1
Expand Down Expand Up @@ -432,6 +431,7 @@ def install_kueue_crs(
cluster_hardware_name=cluster_hardware_name,
resource_type=resource_type,
total_chips=total_chips,
enable_pathways=args.enable_pathways,
)
topology_label = ''
if system.device_type in [
Expand All @@ -456,7 +456,6 @@ def install_kueue_crs(
covered_resources_config=covered_resources_config,
resource_type=res_type,
pw_resource_flavors=add_pw_resource_flavors(args),
pw_resources_kueue=add_pw_resources_to_kueue(args),
admission_checks=admission_checks,
managed_resource=res_type,
cluster_queue_name=CLUSTER_QUEUE_NAME,
Expand All @@ -480,30 +479,50 @@ def install_kueue_crs(


def get_kueue_covered_resources_config(
cluster_hardware_name, resource_type, total_chips
cluster_hardware_name, resource_type, total_chips, enable_pathways
Copy link
Collaborator

@RoshaniN RoshaniN Aug 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would retain Pathways-specific helpers / logic in Pathways-specific files such as src/xpk/core/pathways.py, if that's possible, and avoid changing function headers.

LGTM to the change though.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did consider that originally, but I personally feel it's cleaner to keep all Kueue config inside kueue.py. That way, when I want to change the Kueue config, I just have to look at kueue.py.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The way the code is now, I can easily understand what the covered resource config is going to look like without having to go between different functions and files.

So could we keep the PR as-is please?

) -> str:
"""Gets Kueue covered resources configuration.

Args:
cluster_hardware_name: cluster hardware name.
resource_type: resource type of tpu or gpu.
total_chips: total number of chips for the specific resource type.
enable_pathways: if pathways is enabled.

Returns:
A string of Kueue covered resources configuration.
"""
pathways_resources = ''
if enable_pathways:
pathways_resources = """
- name: cpu-user
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you can have two resource flavors with the same covered resources, IIUC. Please test before submitting this change.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tested, this works fine.

resources:
- name: "cpu"
nominalQuota: 480
- name: "memory"
nominalQuota: 2000G
- name: "{resource_type}"
nominalQuota: 0
""".format(resource_type=resource_type)

config_format = """
- coveredResources: ["{resource_type}"]
- coveredResources: ["cpu", "memory", "{resource_type}"]
flavors:
{pathways_resources}
- name: {cluster_hardware_name}
resources:
- name: "cpu"
nominalQuota: "9999999999"
- name: "memory"
nominalQuota: "99999999999Gi"
- name: "{resource_type}"
nominalQuota: {total_chips}
"""
config_string = config_format.format(
cluster_hardware_name=cluster_hardware_name,
resource_type=resource_type,
total_chips=total_chips,
pathways_resources=pathways_resources,
)
return config_string

Expand Down
13 changes: 0 additions & 13 deletions src/xpk/core/pathways.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,6 @@ def add_pw_resource_flavors(args):
return ''


def add_pw_resources_to_kueue(args):
"""Add resource flavors required for Pathways, to the cluster queue."""
resources_yaml = """- coveredResources: ["cpu", "memory"]
flavors:
- name: cpu-user
resources:
- name: "cpu"
nominalQuota: 480
- name: "memory"
nominalQuota: 2000G"""
if args.enable_pathways:
return resources_yaml
return ''


def ensure_pathways_workload_prerequisites(args, system) -> bool:
Expand Down