Add queue and kv-cache scorers (#762)

liu-cong · web-flow · commit a04d395e538f · 2025-04-30T13:07:55.000-07:00
* Add queue and kv-cache scorers

* Remove helper function
diff --git a/pkg/epp/scheduling/plugins/picker/max_score_picker.go b/pkg/epp/scheduling/plugins/picker/max_score_picker.go
@@ -1,3 +1,19 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
 package picker
 
 import (
diff --git a/pkg/epp/scheduling/plugins/scorer/kvcache.go b/pkg/epp/scheduling/plugins/scorer/kvcache.go
@@ -0,0 +1,35 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scorer
+
+import (
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+// KVCacheScorer scores pods by their KV cache usage.
+type KVCacheScorer struct{}
+
+// Name returns the identifier of this scorer, "kv-cache".
+func (s *KVCacheScorer) Name() string {
+	return "kv-cache"
+}
+
+// Score returns, for every pod in pods, 1 minus the KVCacheUsagePercent
+// reported by the pod's metrics, so lower cache usage yields a higher score.
+// The ctx argument is not consulted.
+func (s *KVCacheScorer) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 {
+	result := make(map[types.Pod]float64, len(pods))
+	for i := range pods {
+		usage := pods[i].GetMetrics().KVCacheUsagePercent
+		result[pods[i]] = 1 - usage
+	}
+	return result
+}
diff --git a/pkg/epp/scheduling/plugins/scorer/kvcache_test.go b/pkg/epp/scheduling/plugins/scorer/kvcache_test.go
@@ -0,0 +1,95 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scorer
+
+import (
+	"context"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+// TestKvCacheScorer checks that KVCacheScorer gives each pod a score of
+// 1 minus its KV cache usage, within a small floating-point tolerance.
+func TestKvCacheScorer(t *testing.T) {
+	// podWithUsage builds a pod whose only populated metric is its KV cache usage.
+	podWithUsage := func(usage float64) types.Pod {
+		return &types.PodMetrics{Pod: &backend.Pod{}, Metrics: &backendmetrics.Metrics{KVCacheUsagePercent: usage}}
+	}
+
+	cases := []struct {
+		name string
+		pods []types.Pod
+		want []float64 // want[i] is the expected score of pods[i]
+	}{
+		{
+			name: "Different KV cache utilization",
+			pods: []types.Pod{podWithUsage(0.8), podWithUsage(0.5), podWithUsage(0.0)},
+			// Higher usage gets a lower score: 1-0.8=0.2, 1-0.5=0.5, 1-0=1.0.
+			want: []float64{0.2, 0.5, 1.0},
+		},
+		{
+			name: "Same KV cache utilization",
+			pods: []types.Pod{podWithUsage(0.6), podWithUsage(0.6)},
+			// Both pods get the same score (1-0.6=0.4).
+			want: []float64{0.4, 0.4},
+		},
+		{
+			name: "Zero KV cache utilization",
+			pods: []types.Pod{podWithUsage(0.0), podWithUsage(0.0)},
+			// No KV cache usage gets the highest score.
+			want: []float64{1.0, 1.0},
+		},
+		{
+			name: "Full KV cache utilization",
+			pods: []types.Pod{podWithUsage(1.0), podWithUsage(0.5)},
+			// A full cache scores 1-1=0; a half-full cache scores 1-0.5=0.5.
+			want: []float64{0.0, 0.5},
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			schedCtx := types.NewSchedulingContext(context.Background(), &types.LLMRequest{}, tc.pods)
+			kvScorer := &KVCacheScorer{}
+			got := kvScorer.Score(schedCtx, tc.pods)
+
+			for idx, pod := range tc.pods {
+				want := tc.want[idx]
+				assert.InDelta(t, want, got[pod], 0.0001, "Pod %d should have score %f", idx, want)
+			}
+		})
+	}
+}
diff --git a/pkg/epp/scheduling/plugins/scorer/queue.go b/pkg/epp/scheduling/plugins/scorer/queue.go
@@ -0,0 +1,61 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scorer
+
+import (
+	"math"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+// QueueScorer scores pods by the size of their waiting queue relative to the
+// other candidate pods.
+type QueueScorer struct{}
+
+// Name returns the identifier of this scorer, "queue".
+func (q *QueueScorer) Name() string {
+	return "queue"
+}
+
+// Score assigns every pod in pods a score based on its WaitingQueueSize,
+// normalized against the smallest and largest queue sizes found in pods:
+// the pod with the longest queue gets 0, the pod with the shortest queue
+// gets 1, and the rest fall linearly in between. When all pods report the
+// same queue size, every pod gets 1.0. The ctx argument is not consulted.
+func (q *QueueScorer) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 {
+	// Seed the bounds with the extreme int values so the first pod replaces both.
+	shortest, longest := math.MaxInt, math.MinInt
+	for _, p := range pods {
+		waiting := p.GetMetrics().WaitingQueueSize
+		// Two independent checks: a single pod must update both bounds.
+		if waiting < shortest {
+			shortest = waiting
+		}
+		if waiting > longest {
+			longest = waiting
+		}
+	}
+
+	result := make(map[types.Pod]float64, len(pods))
+	for _, p := range pods {
+		if longest == shortest {
+			// No pod is preferable to another (and the span would be zero),
+			// so every pod receives the same score.
+			result[p] = 1.0
+			continue
+		}
+		// A longer queue leaves a smaller numerator and therefore a lower score.
+		result[p] = float64(longest-p.GetMetrics().WaitingQueueSize) / float64(longest-shortest)
+	}
+	return result
+}
diff --git a/pkg/epp/scheduling/plugins/scorer/queue_test.go b/pkg/epp/scheduling/plugins/scorer/queue_test.go
@@ -0,0 +1,85 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scorer
+
+import (
+	"context"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+// TestQueueScorer checks that QueueScorer normalizes scores across the pods'
+// waiting queue sizes and gives every pod 1.0 when all queue sizes are equal.
+func TestQueueScorer(t *testing.T) {
+	// podWithQueue builds a pod whose only populated metric is its waiting queue size.
+	podWithQueue := func(size int) types.Pod {
+		return &types.PodMetrics{Pod: &backend.Pod{}, Metrics: &backendmetrics.Metrics{WaitingQueueSize: size}}
+	}
+
+	cases := []struct {
+		name string
+		pods []types.Pod
+		want []float64 // want[i] is the expected score of pods[i]
+	}{
+		{
+			name: "Different queue sizes",
+			pods: []types.Pod{podWithQueue(10), podWithQueue(5), podWithQueue(0)},
+			// Longest queue (10) scores lowest, medium (5) in between, shortest (0) highest.
+			want: []float64{0.0, 0.5, 1.0},
+		},
+		{
+			name: "Same queue sizes",
+			pods: []types.Pod{podWithQueue(5), podWithQueue(5)},
+			// When all pods have the same queue size, they all get the same score.
+			want: []float64{1.0, 1.0},
+		},
+		{
+			name: "Zero queue sizes",
+			pods: []types.Pod{podWithQueue(0), podWithQueue(0)},
+			want: []float64{1.0, 1.0},
+		},
+	}
+
+	queueScorer := &QueueScorer{}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			schedCtx := types.NewSchedulingContext(context.Background(), &types.LLMRequest{}, tc.pods)
+			got := queueScorer.Score(schedCtx, tc.pods)
+
+			for idx, pod := range tc.pods {
+				want := tc.want[idx]
+				assert.InDelta(t, want, got[pod], 0.0001, "Pod %d should have score %f", idx, want)
+			}
+		})
+	}
+}