Skip to content

Commit bb8696c

Browse files
committed
Support scraping metrics from target running with TLS
The vLLM server can run with TLS, but metrics scraping currently does not work in that case. Signed-off-by: Pierangelo Di Pilato <pierdipi@redhat.com>
1 parent 60247b1 commit bb8696c

File tree

3 files changed

+52
-17
lines changed

3 files changed

+52
-17
lines changed

cmd/epp/runner/runner.go

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@ package runner
1818

1919
import (
2020
"context"
21+
"crypto/tls"
2122
"flag"
2223
"fmt"
24+
"net/http"
2325
"net/http/pprof"
2426
"os"
2527

@@ -136,7 +138,9 @@ var (
136138

137139
modelServerMetricsPort = flag.Int("model-server-metrics-port", 0, "Port to scrape metrics from pods. "+
138140
"Default value will be set to InferencePool.Spec.TargetPortNumber if not set.")
139-
modelServerMetricsPath = flag.String("model-server-metrics-path", "/metrics", "Path to scrape metrics from pods")
141+
modelServerMetricsPath = flag.String("model-server-metrics-path", "/metrics", "Path to scrape metrics from pods")
142+
modelServerMetricsScheme = flag.String("model-server-metrics-scheme", "http", "Scheme to scrape metrics from pods")
143+
modelServerMetricsHttpsInsecureSkipVerify = flag.Bool("model-server-metrics-https-insecure-skip-verify", false, "When using 'https' scheme for 'model-server-metrics-scheme', configure 'InsecureSkipVerify' (default to false)")
140144

141145
setupLog = ctrl.Log.WithName("setup")
142146
)
@@ -167,13 +171,15 @@ func (r *Runner) WithSchedulerConfig(schedulerConfig *scheduling.SchedulerConfig
167171
func bindEnvToFlags() {
168172
// map[ENV_VAR]flagName – add more as needed
169173
for env, flg := range map[string]string{
170-
"GRPC_PORT": "grpc-port",
171-
"GRPC_HEALTH_PORT": "grpc-health-port",
172-
"MODEL_SERVER_METRICS_PORT": "model-server-metrics-port",
173-
"MODEL_SERVER_METRICS_PATH": "model-server-metrics-path",
174-
"DESTINATION_ENDPOINT_HINT_KEY": "destination-endpoint-hint-key",
175-
"POOL_NAME": "pool-name",
176-
"POOL_NAMESPACE": "pool-namespace",
174+
"GRPC_PORT": "grpc-port",
175+
"GRPC_HEALTH_PORT": "grpc-health-port",
176+
"MODEL_SERVER_METRICS_PORT": "model-server-metrics-port",
177+
"MODEL_SERVER_METRICS_PATH": "model-server-metrics-path",
178+
"MODEL_SERVER_METRICS_SCHEME": "model-server-metrics-scheme",
179+
"MODEL_SERVER_METRICS_HTTPS_INSECURE_SKIP_VERIFY": "model-server-metrics-https-insecure-skip-verify",
180+
"DESTINATION_ENDPOINT_HINT_KEY": "destination-endpoint-hint-key",
181+
"POOL_NAME": "pool-name",
182+
"POOL_NAMESPACE": "pool-namespace",
177183
// durations & bools work too; flag.Set expects the *string* form
178184
"REFRESH_METRICS_INTERVAL": "refresh-metrics-interval",
179185
"SECURE_SERVING": "secure-serving",
@@ -231,10 +237,26 @@ func (r *Runner) Run(ctx context.Context) error {
231237
return err
232238
}
233239
verifyMetricMapping(*mapping, setupLog)
240+
241+
var metricsHttpClient *http.Client
242+
if *modelServerMetricsScheme == "https" {
243+
metricsHttpClient = &http.Client{
244+
Transport: &http.Transport{
245+
TLSClientConfig: &tls.Config{
246+
InsecureSkipVerify: *modelServerMetricsHttpsInsecureSkipVerify,
247+
},
248+
},
249+
}
250+
} else {
251+
metricsHttpClient = http.DefaultClient
252+
}
253+
234254
pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{
235-
MetricMapping: mapping,
236-
ModelServerMetricsPort: int32(*modelServerMetricsPort),
237-
ModelServerMetricsPath: *modelServerMetricsPath,
255+
MetricMapping: mapping,
256+
ModelServerMetricsPort: int32(*modelServerMetricsPort),
257+
ModelServerMetricsPath: *modelServerMetricsPath,
258+
ModelServerMetricsScheme: *modelServerMetricsScheme,
259+
Client: metricsHttpClient,
238260
}, *refreshMetricsInterval)
239261

240262
datastore := datastore.NewDatastore(ctx, pmf)
@@ -412,6 +434,9 @@ func validateFlags() error {
412434
if *configText != "" && *configFile != "" {
413435
return fmt.Errorf("both the %q and %q flags can not be set at the same time", "configText", "configFile")
414436
}
437+
if *modelServerMetricsScheme != "http" && *modelServerMetricsScheme != "https" {
438+
return fmt.Errorf("unexpected %q value for %q flag, it can only be set to 'http' or 'https'", *modelServerMetricsScheme, "model-server-metrics-scheme")
439+
}
415440

416441
return nil
417442
}

pkg/epp/backend/metrics/metrics.go

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,12 @@ const (
3737
)
3838

3939
type PodMetricsClientImpl struct {
40-
MetricMapping *MetricMapping
41-
ModelServerMetricsPort int32
42-
ModelServerMetricsPath string
40+
MetricMapping *MetricMapping
41+
ModelServerMetricsPort int32
42+
ModelServerMetricsPath string
43+
ModelServerMetricsScheme string
44+
45+
Client *http.Client
4346
}
4447

4548
// FetchMetrics fetches metrics from a given pod, clones the existing metrics object and returns an updated one.
@@ -49,7 +52,7 @@ func (p *PodMetricsClientImpl) FetchMetrics(ctx context.Context, pod *backend.Po
4952
if err != nil {
5053
return nil, fmt.Errorf("failed to create request: %v", err)
5154
}
52-
resp, err := http.DefaultClient.Do(req)
55+
resp, err := p.Client.Do(req)
5356
if err != nil {
5457
return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err)
5558
}
@@ -73,7 +76,7 @@ func (p *PodMetricsClientImpl) getMetricEndpoint(pod *backend.Pod, targetPortNum
7376
if p.ModelServerMetricsPort == 0 {
7477
p.ModelServerMetricsPort = targetPortNumber
7578
}
76-
return fmt.Sprintf("http://%s:%d%s", pod.Address, p.ModelServerMetricsPort, p.ModelServerMetricsPath)
79+
return fmt.Sprintf("%s://%s:%d%s", p.ModelServerMetricsScheme, pod.Address, p.ModelServerMetricsPort, p.ModelServerMetricsPath)
7780
}
7881

7982
// promToPodMetrics updates internal pod metrics with scraped Prometheus metrics.

pkg/epp/backend/metrics/metrics_test.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package metrics
1919
import (
2020
"context"
2121
"errors"
22+
"net/http"
2223
"reflect"
2324
"strconv"
2425
"strings"
@@ -495,7 +496,13 @@ func TestFetchMetrics(t *testing.T) {
495496
},
496497
}
497498
existing := &MetricsState{}
498-
p := &PodMetricsClientImpl{ModelServerMetricsPort: 9999, ModelServerMetricsPath: "/metrics"} // No MetricMapping needed for this basic test
499+
// No MetricMapping needed for this basic test
500+
p := &PodMetricsClientImpl{
501+
ModelServerMetricsScheme: "http",
502+
ModelServerMetricsPort: 9999,
503+
ModelServerMetricsPath: "/metrics",
504+
Client: http.DefaultClient,
505+
}
499506

500507
_, err := p.FetchMetrics(ctx, pod, existing, 9999) // Use a port that's unlikely to be in use
501508
if err == nil {

0 commit comments

Comments
 (0)