Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cmd/epp/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ var (
logVerbosity = flag.Int("v", logging.DEFAULT, "number for the log level verbosity")
secureServing = flag.Bool(
"secureServing", runserver.DefaultSecureServing, "Enables secure serving. Defaults to true.")
certPath = flag.String(
healthChecking = flag.Bool("healthChecking", runserver.DefaultHealthChecking, "Enables health checking")
certPath = flag.String(
"certPath", "", "The path to the certificate for secure serving. The certificate and private key files "+
"are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, "+
"then a self-signed certificate is used.")
Expand Down Expand Up @@ -228,6 +229,7 @@ func (r *Runner) Run() error {
PoolNamespacedName: poolNamespacedName,
Datastore: datastore,
SecureServing: *secureServing,
HealthChecking: *healthChecking,
CertPath: *certPath,
RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval,
Director: director,
Expand Down
16 changes: 16 additions & 0 deletions pkg/epp/server/runserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,15 @@ import (
"context"
"crypto/tls"
"fmt"

"time"

extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
"github.com/go-logr/logr"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/health"
healthgrpc "google.golang.org/grpc/health/grpc_health_v1"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/manager"
Expand All @@ -46,6 +49,7 @@ type ExtProcServerRunner struct {
PoolNamespacedName types.NamespacedName
Datastore datastore.Datastore
SecureServing bool
HealthChecking bool
CertPath string
RefreshPrometheusMetricsInterval time.Duration
Director *requestcontrol.Director
Expand All @@ -66,6 +70,7 @@ const (
DefaultRefreshMetricsInterval = 50 * time.Millisecond // default for --refreshMetricsInterval
DefaultRefreshPrometheusMetricsInterval = 5 * time.Second // default for --refreshPrometheusMetricsInterval
DefaultSecureServing = true // default for --secureServing
DefaultHealthChecking = false // default for --healthChecking
)

// NewDefaultExtProcServerRunner creates a runner with default values.
Expand All @@ -77,6 +82,7 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner {
DestinationEndpointHintMetadataNamespace: DefaultDestinationEndpointHintMetadataNamespace,
PoolNamespacedName: types.NamespacedName{Name: DefaultPoolName, Namespace: DefaultPoolNamespace},
SecureServing: DefaultSecureServing,
HealthChecking: DefaultHealthChecking,
RefreshPrometheusMetricsInterval: DefaultRefreshPrometheusMetricsInterval,
// Dependencies can be assigned later.
}
Expand Down Expand Up @@ -152,6 +158,16 @@ func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable {
extProcServer,
)

if r.HealthChecking {
healthcheck := health.NewServer()
healthgrpc.RegisterHealthServer(srv,
healthcheck,
)
svcName := extProcPb.ExternalProcessor_ServiceDesc.ServiceName
logger.Info("Setting ExternalProcessor service status to SERVING", "serviceName", svcName)
healthcheck.SetServingStatus(svcName, healthgrpc.HealthCheckResponse_SERVING)
}

// Forward to the gRPC runnable.
return runnable.GRPCServer("ext-proc", srv, r.GrpcPort).Start(ctx)
}))
Expand Down
11 changes: 10 additions & 1 deletion test/testdata/envoy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,16 @@ data:
max_pending_requests: 40000
max_requests: 40000
max_retries: 1024
health_checks:
- timeout: 2s
interval: 10s
unhealthy_threshold: 3
healthy_threshold: 2
reuse_connection: true
grpc_health_check:
service_name: "envoy.service.ext_proc.v3.ExternalProcessor"
tls_options:
alpn_protocols: ["h2"]
Comment on lines +181 to +182
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After adding the TLS options, it works now. 😅

[2025-06-15 11:08:22.735][1][debug][hc] [source/extensions/health_checkers/grpc/health_checker_impl.cc:390] [Tags: "ConnectionId":"0"] hc grpc_status=0 service_status=serving health_flags=healthy

# This ensures that envoy accepts untrusted certificates. We tried to explicitly
# set TrustChainVerification to ACCEPT_UNTRUSTED, but that didn't work; what
# did work was setting the common_tls_context to empty.
Expand Down Expand Up @@ -197,7 +207,6 @@ data:
socket_address:
address: vllm-llama3-8b-instruct-epp.$E2E_NS
port_value: 9002
health_status: HEALTHY
load_balancing_weight: 1
---
apiVersion: apps/v1
Expand Down
2 changes: 1 addition & 1 deletion test/testdata/inferencepool-e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ rules:
- subjectaccessreviews
verbs:
- create
---
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
Expand Down