2 changes: 0 additions & 2 deletions cmd/epp/runner/runner.go
@@ -47,7 +47,6 @@ import (
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/filter"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile"
@@ -292,7 +291,6 @@ func (r *Runner) initializeScheduler() (*scheduling.Scheduler, error) {
kvCacheScorerWeight := envutil.GetEnvInt("KV_CACHE_SCORE_WEIGHT", scorer.DefaultKVCacheScorerWeight, setupLog)

schedulerProfile := framework.NewSchedulerProfile().
WithFilters(filter.NewSubsetFilter()).
WithScorers(framework.NewWeightedScorer(scorer.NewQueueScorer(), queueScorerWeight),
framework.NewWeightedScorer(scorer.NewKVCacheScorer(), kvCacheScorerWeight)).
WithPicker(picker.NewMaxScorePicker())
2 changes: 1 addition & 1 deletion pkg/bbr/handlers/server.go
@@ -118,7 +118,7 @@ type streamedBody struct {
func (s *Server) processRequestBody(ctx context.Context, body *extProcPb.HttpBody, streamedBody *streamedBody, logger logr.Logger) ([]*extProcPb.ProcessingResponse, error) {
loggerVerbose := logger.V(logutil.VERBOSE)

var requestBody map[string]interface{}
var requestBody map[string]any
if s.streaming {
streamedBody.body = append(streamedBody.body, body.Body...)
// In the stream case, we can receive multiple request bodies.
4 changes: 2 additions & 2 deletions pkg/epp/backend/metrics/metrics_state.go
@@ -21,8 +21,8 @@ import (
"time"
)

// newMetricsState initializes a new MetricsState and returns its pointer.
func newMetricsState() *MetricsState {
// NewMetricsState initializes a new MetricsState and returns its pointer.
func NewMetricsState() *MetricsState {
return &MetricsState{
ActiveModels: make(map[string]int),
WaitingModels: make(map[string]int),
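If the intent of exporting the constructor is to let code outside the backend/metrics package build a fresh state (for example, in test fixtures), here is a minimal usage sketch under that assumption; the model name is hypothetical:

```go
package main

import (
	"fmt"

	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
)

func main() {
	// NewMetricsState is now callable from outside the package; the previously
	// unexported newMetricsState was not.
	state := backendmetrics.NewMetricsState()
	state.ActiveModels["example-model"] = 1 // hypothetical model name, for illustration only
	fmt.Println("active models:", len(state.ActiveModels))
}
```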
2 changes: 1 addition & 1 deletion pkg/epp/backend/metrics/types.go
@@ -51,7 +51,7 @@ func (f *PodMetricsFactory) NewPodMetrics(parentCtx context.Context, in *corev1.
logger: log.FromContext(parentCtx).WithValues("pod", pod.NamespacedName),
}
pm.pod.Store(pod)
pm.metrics.Store(newMetricsState())
pm.metrics.Store(NewMetricsState())

pm.startRefreshLoop(parentCtx)
return pm
19 changes: 4 additions & 15 deletions pkg/epp/handlers/response.go
@@ -34,19 +34,15 @@ const (
)

// HandleResponseBody always returns the requestContext even in the error case, as the request context is used in error handling.
func (s *StreamingServer) HandleResponseBody(
ctx context.Context,
reqCtx *RequestContext,
response map[string]interface{},
) (*RequestContext, error) {
func (s *StreamingServer) HandleResponseBody(ctx context.Context, reqCtx *RequestContext, response map[string]any) (*RequestContext, error) {
logger := log.FromContext(ctx)
responseBytes, err := json.Marshal(response)
if err != nil {
logger.V(logutil.DEFAULT).Error(err, "error marshalling responseBody")
return reqCtx, err
}
if response["usage"] != nil {
usg := response["usage"].(map[string]interface{})
usg := response["usage"].(map[string]any)
usage := Usage{
PromptTokens: int(usg["prompt_tokens"].(float64)),
CompletionTokens: int(usg["completion_tokens"].(float64)),
@@ -68,11 +64,7 @@ }
}

// HandleResponseBodyModelStreaming handles the streaming response when the model server streams the response body.
func (s *StreamingServer) HandleResponseBodyModelStreaming(
ctx context.Context,
reqCtx *RequestContext,
responseText string,
) {
func (s *StreamingServer) HandleResponseBodyModelStreaming(ctx context.Context, reqCtx *RequestContext, responseText string) {
if strings.Contains(responseText, streamingEndMsg) {
resp := parseRespForUsage(ctx, responseText)
reqCtx.Usage = resp.Usage
@@ -160,10 +152,7 @@ func (s *StreamingServer) generateResponseHeaders(reqCtx *RequestContext) []*con
//
// If include_usage is not included in the request, `data: [DONE]` is returned separately, which
// indicates end of streaming.
func parseRespForUsage(
ctx context.Context,
responseText string,
) ResponseBody {
func parseRespForUsage(ctx context.Context, responseText string) ResponseBody {
response := ResponseBody{}
logger := log.FromContext(ctx)

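For reference, a sketch of the stream tail that HandleResponseBodyModelStreaming and parseRespForUsage scan for, assuming an OpenAI-compatible model server with include_usage set (the exact chunk fields depend on the model server; this payload is illustrative only):

```go
// Illustrative SSE tail of a streamed completion. The chunk carrying the
// "usage" object is what parseRespForUsage extracts token counts from, and
// "data: [DONE]" marks the end of the stream.
const exampleStreamTail = `data: {"id":"cmpl-123","object":"text_completion","choices":[],"usage":{"prompt_tokens":7,"completion_tokens":10,"total_tokens":17}}

data: [DONE]
`
```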
2 changes: 1 addition & 1 deletion pkg/epp/handlers/response_test.go
@@ -86,7 +86,7 @@ func TestHandleResponseBody(t *testing.T) {
if reqCtx == nil {
reqCtx = &RequestContext{}
}
var responseMap map[string]interface{}
var responseMap map[string]any
marshalErr := json.Unmarshal(test.body, &responseMap)
if marshalErr != nil {
t.Error(marshalErr, "Error unmarshaling request body")
21 changes: 16 additions & 5 deletions pkg/epp/handlers/server.go
@@ -112,7 +112,7 @@ type RequestContext struct {

type Request struct {
Headers map[string]string
Body map[string]interface{}
Body map[string]any
Metadata map[string]any
}
type Response struct {
@@ -143,7 +143,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
RequestState: RequestReceived,
Request: &Request{
Headers: make(map[string]string),
Body: make(map[string]interface{}),
Body: make(map[string]any),
Metadata: make(map[string]any),
},
Response: &Response{
@@ -152,7 +152,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
}

var body []byte
var responseBody map[string]interface{}
var responseBody map[string]any

// Create error handling var as each request should only report once for
// error metrics. This doesn't cover the error "Cannot receive stream request" because
@@ -308,7 +308,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
// Handle the err and fire an immediate response.
if err != nil {
logger.V(logutil.DEFAULT).Error(err, "Failed to process request", "request", req)
resp, err := BuildErrResponse(err)
resp, err := buildErrResponse(err)
if err != nil {
return err
}
@@ -389,7 +389,7 @@ func (r *RequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProces
return nil
}

func BuildErrResponse(err error) (*extProcPb.ProcessingResponse, error) {
func buildErrResponse(err error) (*extProcPb.ProcessingResponse, error) {
var resp *extProcPb.ProcessingResponse

switch errutil.CanonicalCode(err) {
@@ -416,6 +416,17 @@ func BuildErrResponse(err error) (*extProcPb.ProcessingResponse, error) {
},
},
}
// This code can be returned by the director when there are no candidate pods for scheduling the request.
case errutil.ServiceUnavailable:
resp = &extProcPb.ProcessingResponse{
Response: &extProcPb.ProcessingResponse_ImmediateResponse{
ImmediateResponse: &extProcPb.ImmediateResponse{
Status: &envoyTypePb.HttpStatus{
Code: envoyTypePb.StatusCode_ServiceUnavailable,
},
},
},
}
// This code can be returned when users provide invalid json request.
case errutil.BadRequest:
resp = &extProcPb.ProcessingResponse{
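A hypothetical in-package test sketch (not part of this PR) of the new case, exercising the director-side error introduced in director.go below; the errutil and Envoy type import paths are assumed from the rest of the repository:

```go
package handlers

import (
	"testing"

	envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3"

	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
)

func TestBuildErrResponseServiceUnavailable(t *testing.T) {
	// The director returns this error when subset filtering (or an empty
	// datastore) leaves no candidate pods for the request.
	err := errutil.Error{Code: errutil.ServiceUnavailable, Msg: "failed to find candidate pods for serving the request"}

	resp, buildErr := buildErrResponse(err)
	if buildErr != nil {
		t.Fatalf("buildErrResponse returned an unexpected error: %v", buildErr)
	}
	if got := resp.GetImmediateResponse().GetStatus().GetCode(); got != envoyTypePb.StatusCode_ServiceUnavailable {
		t.Fatalf("expected StatusCode_ServiceUnavailable (503), got %v", got)
	}
}
```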
75 changes: 64 additions & 11 deletions pkg/epp/requestcontrol/director.go
@@ -24,12 +24,14 @@ import (
"math/rand"
"net"
"strconv"
"strings"
"time"

"github.com/go-logr/logr"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
@@ -39,6 +41,11 @@ import (
requtil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/request"
)

const (
subsetHintNamespace = "envoy.lb.subset_hint"
subsetHintKey = "x-gateway-destination-endpoint-subset"
)

// Scheduler defines the interface required by the Director for scheduling.
type Scheduler interface {
Schedule(ctx context.Context, request *schedulingtypes.LLMRequest, candidatePods []schedulingtypes.Pod) (result *schedulingtypes.SchedulingResult, err error)
@@ -118,12 +125,12 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
}

// Prepare LLMRequest (needed for both saturation detection and Scheduler)
reqCtx.SchedulingRequest = schedulingtypes.NewLLMRequest(
reqCtx.Request.Headers[requtil.RequestIdHeaderKey],
reqCtx.ResolvedTargetModel,
prompt,
reqCtx.Request.Headers,
reqCtx.Request.Metadata)
reqCtx.SchedulingRequest = &schedulingtypes.LLMRequest{
RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey],
TargetModel: reqCtx.ResolvedTargetModel,
Prompt: prompt,
Headers: reqCtx.Request.Headers,
}

logger = logger.WithValues("model", reqCtx.Model, "resolvedTargetModel", reqCtx.ResolvedTargetModel, "criticality", requestCriticality)

@@ -135,11 +142,11 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
return reqCtx, err
}

// --- 3. Call Scheduler ---
// Snapshot pod metrics from the datastore to:
// 1. Reduce concurrent access to the datastore.
// 2. Ensure consistent data during the scheduling operation of a request between all scheduling cycles.
Comment on lines -139 to -141 (Contributor Author): this comment was moved to the helper function godoc.

candidatePods := schedulingtypes.ToSchedulerPodMetrics(d.datastore.PodGetAll())
// --- 3. Call Scheduler (with the relevant candidate pods) ---
candidatePods := d.getCandidatePodsForScheduling(ctx, reqCtx.Request.Metadata)
if len(candidatePods) == 0 {
return reqCtx, errutil.Error{Code: errutil.ServiceUnavailable, Msg: "failed to find candidate pods for serving the request"}
}
results, err := d.scheduler.Schedule(ctx, reqCtx.SchedulingRequest, candidatePods)
if err != nil {
return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()}
Expand Down Expand Up @@ -177,6 +184,52 @@ func (d *Director) admitRequest(ctx context.Context, requestCriticality v1alpha2
return nil
}

// getCandidatePodsForScheduling gets the list of relevant endpoints for the scheduling cycle from the datastore.
// According to the EPP protocol, if "x-gateway-destination-endpoint-subset" is set in the request metadata and specifies
// a subset of endpoints, only those endpoints are considered as candidates by the scheduler.
// Snapshot pod metrics from the datastore to:
// 1. Reduce concurrent access to the datastore.
// 2. Ensure consistent data during the scheduling operation of a request between all scheduling cycles.
func (d *Director) getCandidatePodsForScheduling(ctx context.Context, requestMetadata map[string]any) []schedulingtypes.Pod {
loggerTrace := log.FromContext(ctx).V(logutil.TRACE)

subsetMap, found := requestMetadata[subsetHintNamespace].(map[string]any)
if !found {
return schedulingtypes.ToSchedulerPodMetrics(d.datastore.PodGetAll())
}

// Check if endpoint key is present in the subset map and ensure there is at least one value
endpointSubsetList, found := subsetMap[subsetHintKey].([]any)
if !found {
return schedulingtypes.ToSchedulerPodMetrics(d.datastore.PodGetAll())
} else if len(endpointSubsetList) == 0 {
loggerTrace.Info("found empty subset filter in request metadata, filtering all pods")
return []schedulingtypes.Pod{}
}

// Create a map of endpoint addresses for easy lookup
endpoints := make(map[string]bool)
for _, endpoint := range endpointSubsetList {
// Extract address from endpoint
// The endpoint is formatted as "<address>:<port>" (e.g., "10.0.1.0:8080")
epStr := strings.Split(endpoint.(string), ":")[0]
endpoints[epStr] = true
}

podTotalCount := 0
podFilteredList := d.datastore.PodList(func(pm backendmetrics.PodMetrics) bool {
podTotalCount++
if _, found := endpoints[pm.GetPod().Address]; found {
return true
}
return false
})
Contributor: Can we please add a trace log line indicating whether the subset key is set, and the number of endpoints it included vs. what the datastore has?

Contributor Author: done


loggerTrace.Info("filtered candidate pods by subset filtering", "podTotalCount", podTotalCount, "filteredCount", len(podFitleredList))

return schedulingtypes.ToSchedulerPodMetrics(podFilteredList)
}

// prepareRequest populates the RequestContext and calls the registered PreRequest plugins
// for allowing plugging customized logic based on the scheduling results.
func (d *Director) prepareRequest(ctx context.Context, reqCtx *handlers.RequestContext, result *schedulingtypes.SchedulingResult) (*handlers.RequestContext, error) {
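To make the subset-hint contract concrete, a hypothetical helper (illustration only, not part of this PR) that builds the metadata shape getCandidatePodsForScheduling inspects; the namespace and key are the constants added above, and the addresses are example values in the documented "<address>:<port>" form:

```go
package requestcontrol

// buildSubsetHintMetadata (hypothetical) assembles request metadata that targets
// a subset of endpoints: a map under the "envoy.lb.subset_hint" namespace whose
// "x-gateway-destination-endpoint-subset" key holds "<address>:<port>" strings.
func buildSubsetHintMetadata(endpoints ...string) map[string]any {
	subset := make([]any, 0, len(endpoints))
	for _, ep := range endpoints {
		subset = append(subset, ep)
	}
	return map[string]any{
		subsetHintNamespace: map[string]any{
			subsetHintKey: subset,
		},
	}
}
```

For example, buildSubsetHintMetadata("10.0.1.0:8080", "10.0.2.0:8080") restricts scheduling to pods whose address is 10.0.1.0 or 10.0.2.0; omitting the key falls back to every pod in the datastore, and an empty list yields no candidates, which the director now surfaces as a ServiceUnavailable error.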