Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions Dockerfile.dev
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Build Stage: using Go 1.25 image
FROM quay.io/projectquay/golang:1.25 AS builder
ARG TARGETOS
ARG TARGETARCH

# Fail a RUN when any command in a pipeline fails (hadolint DL4006).
# Without pipefail, a failed `curl` below would be silently masked by the
# `tar` it pipes into.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Install build tools
# The builder is based on UBI8, so we need epel-release-8.
RUN dnf install -y 'https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm' && \
    dnf install -y gcc-c++ libstdc++ libstdc++-devel clang zeromq-devel pkgconfig python3.12-devel python3.12-pip git && \
    dnf clean all
# python3.12-devel needed for CGO compilation (Python headers and python3.12-config for linker flags)

WORKDIR /workspace

# Copy the Go Modules manifests first so dependency layers cache independently
# of source changes
COPY llm-d-inference-scheduler/go.mod go.mod
COPY llm-d-inference-scheduler/go.sum go.sum

# Copy the go source
COPY llm-d-inference-scheduler/cmd/ cmd/
COPY llm-d-inference-scheduler/pkg/ pkg/
COPY llm-d-inference-scheduler/test/ test/

# Copy local dependencies from sibling directories
COPY llm-d-kv-cache-manager/ /workspace/llm-d-kv-cache-manager/
COPY gateway-api-inference-extension/ /workspace/gateway-api-inference-extension/

# Set up replace directives to use local checkouts.
# One logical step (rewire module graph), so one layer.
RUN go mod edit -replace github.com/llm-d/llm-d-kv-cache-manager=/workspace/llm-d-kv-cache-manager && \
    go mod edit -replace sigs.k8s.io/gateway-api-inference-extension=/workspace/gateway-api-inference-extension && \
    go mod tidy

# HuggingFace tokenizer bindings
# Ensure that the RELEASE_VERSION matches the one used in the imported llm-d-kv-cache-manager version
ARG RELEASE_VERSION=v1.22.1
# curl -f: abort on HTTP errors (e.g. a 404 release page) instead of feeding
# an error body to tar; combined with pipefail above, a bad download now
# fails the build at this step rather than later at ranlib/link time.
RUN mkdir -p lib && \
    curl -fL https://github.com/daulet/tokenizers/releases/download/${RELEASE_VERSION}/libtokenizers.${TARGETOS}-${TARGETARCH}.tar.gz | tar -xz -C lib && \
    ranlib lib/*.a

# Build
# the GOARCH has not a default value to allow the binary be built according to the host where the command
# was called. For example, if we call make image-build in a local env which has the Apple Silicon M1 SO
# the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore,
# by leaving it empty we can ensure that the container and binary shipped on it will have the same platform.
ENV CGO_ENABLED=1
ENV GOOS=${TARGETOS:-linux}
ENV GOARCH=${TARGETARCH}
ENV PYTHON=python3.12
ENV PYTHONPATH=/usr/lib64/python3.12/site-packages:/usr/lib/python3.12/site-packages

# Build metadata: declared late so changing them only invalidates the build layer.
ARG COMMIT_SHA=unknown
ARG BUILD_REF
RUN export CGO_CFLAGS="$(python3.12-config --cflags) -I/workspace/lib" && \
    export CGO_LDFLAGS="$(python3.12-config --ldflags --embed) -L/workspace/lib -ltokenizers -ldl -lm" && \
    go build -a -o bin/epp -ldflags="-extldflags '-L$(pwd)/lib' -X sigs.k8s.io/gateway-api-inference-extension/version.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/version.BuildRef=${BUILD_REF}" cmd/epp/main.go

# Use ubi9 as a minimal base image to package the manager binary
# Refer to https://catalog.redhat.com/software/containers/ubi9/ubi-minimal/615bd9b4075b022acc111bf5 for more details
# NOTE(review): `:latest` is not reproducible (hadolint DL3007) — consider
# pinning a specific ubi-minimal tag (or digest) once one is agreed on.
FROM registry.access.redhat.com/ubi9/ubi-minimal:latest
WORKDIR /
COPY --from=builder /workspace/bin/epp /app/epp

# Install zeromq runtime library and Python runtime needed by the manager.
# The final image is UBI9, so we need epel-release-9.
# microdnf cannot install an RPM from a URL, so full dnf is bootstrapped first.
USER root
RUN microdnf install -y dnf && \
    dnf install -y 'https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm' && \
    dnf install -y zeromq python3.12 python3.12-libs python3.12-pip && \
    dnf clean all && \
    rm -rf /var/cache/dnf /var/lib/dnf && \
    ln -sf /usr/bin/python3.12 /usr/bin/python3 && \
    ln -sf /usr/bin/python3.12 /usr/bin/python
# Note: python3.12 package does not automatically create python3/python symlinks - they must be created manually

# Install wrapper as a module in site-packages (from local checkout)
RUN mkdir -p /usr/local/lib/python3.12/site-packages/
COPY llm-d-kv-cache-manager/pkg/preprocessing/chat_completions/render_jinja_template_wrapper.py /usr/local/lib/python3.12/site-packages/

# Python deps (no cache, single target) – filter out torch
ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
COPY llm-d-kv-cache-manager/pkg/preprocessing/chat_completions/requirements.txt /tmp/requirements.txt
RUN sed '/^torch\b/d' /tmp/requirements.txt > /tmp/requirements.notorch.txt && \
    python3.12 -m pip install --no-cache-dir --upgrade pip setuptools wheel && \
    python3.12 -m pip install --no-cache-dir --target /usr/local/lib/python3.12/site-packages -r /tmp/requirements.notorch.txt && \
    python3.12 -m pip install --no-cache-dir --target /usr/local/lib/python3.12/site-packages PyYAML && \
    rm /tmp/requirements.txt /tmp/requirements.notorch.txt && \
    rm -rf /root/.cache/pip

# Python env
ENV PYTHONPATH="/usr/local/lib/python3.12/site-packages:/usr/lib/python3.12/site-packages"
ENV PYTHON=python3.12
ENV PATH=/usr/bin:/usr/local/bin:$PATH
ENV HF_HOME="/tmp/.cache"

# Drop root for runtime; all exposed ports are unprivileged (>1024).
USER 65532:65532

# expose gRPC, health and metrics ports
EXPOSE 9002
EXPOSE 9003
EXPOSE 9090

# expose port for KV-Events ZMQ SUB socket
EXPOSE 5557

ENTRYPOINT ["/app/epp"]
1 change: 1 addition & 0 deletions Dockerfile.sidecar
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ RUN go mod download
COPY cmd/pd-sidecar/main.go cmd/cmd.go
COPY pkg/sidecar pkg/sidecar
COPY pkg/common pkg/common
COPY pkg/telemetry pkg/telemetry

# Build
# the GOARCH has not a default value to allow the binary be built according to the host where the command
Expand Down
36 changes: 35 additions & 1 deletion cmd/epp/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,54 @@
import (
"os"

"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner"

"github.com/llm-d/llm-d-inference-scheduler/pkg/metrics"
"github.com/llm-d/llm-d-inference-scheduler/pkg/plugins"
"github.com/llm-d/llm-d-inference-scheduler/pkg/telemetry"
)

func main() {
ctx := ctrl.SetupSignalHandler()

// Initialize tracing before creating any spans
shutdownTracing, err := telemetry.InitTracing(ctx)
if err != nil {
// Log error but don't fail - tracing is optional
ctrl.Log.Error(err, "Failed to initialize tracing")
}
if shutdownTracing != nil {
defer func() {
if err := shutdownTracing(ctx); err != nil {
ctrl.Log.Error(err, "Failed to shutdown tracing")
}
}()
}

// Add startup span to verify tracing is working
tracer := telemetry.Tracer()
ctx, span := tracer.Start(ctx, "llm_d.epp.startup")
span.SetAttributes(
attribute.String("component", "llm-d-inference-scheduler"),
attribute.String("operation", "startup"),
)
defer span.End()

// Register llm-d-inference-scheduler plugins
plugins.RegisterAllPlugins()

// Note: GIE built-in plugins are automatically registered by the runner
// when it processes configuration in runner.parsePluginsConfiguration()

if err := runner.NewRunner().
WithCustomCollectors(metrics.GetCollectors()...).
Run(ctrl.SetupSignalHandler()); err != nil {
Run(ctx); err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, "startup failed")
os.Exit(1)

Check failure on line 77 in cmd/epp/main.go

View workflow job for this annotation

GitHub Actions / lint-and-test

exitAfterDefer: os.Exit will exit, and `defer span.End()` will not run (gocritic)
}
span.SetStatus(codes.Ok, "")
}
34 changes: 34 additions & 0 deletions cmd/pd-sidecar/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ import (

"github.com/llm-d/llm-d-inference-scheduler/pkg/sidecar/proxy"
"github.com/llm-d/llm-d-inference-scheduler/pkg/sidecar/version"
"github.com/llm-d/llm-d-inference-scheduler/pkg/telemetry"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
)

var (
Expand Down Expand Up @@ -70,6 +73,29 @@ func main() {
ctx := ctrl.SetupSignalHandler()
log.IntoContext(ctx, logger)

// Initialize tracing before creating any spans
shutdownTracing, err := telemetry.InitTracing(ctx)
if err != nil {
// Log error but don't fail - tracing is optional
logger.Error(err, "Failed to initialize tracing")
}
if shutdownTracing != nil {
defer func() {
if err := shutdownTracing(ctx); err != nil {
logger.Error(err, "Failed to shutdown tracing")
}
}()
}

// Add startup span to verify tracing is working
tracer := telemetry.Tracer()
ctx, span := tracer.Start(ctx, "llm_d.pd_proxy.startup")
span.SetAttributes(
attribute.String("component", "llm-d-pd-proxy"),
attribute.String("operation", "startup"),
)
defer span.End()

logger.Info("Proxy starting", "Built on", version.BuildRef, "From Git SHA", version.CommitSHA)

// Validate connector
Expand Down Expand Up @@ -108,6 +134,8 @@ func main() {
targetURL, err := url.Parse(scheme + "://localhost:" + *vLLMPort)
if err != nil {
logger.Error(err, "failed to create targetURL")
span.RecordError(err)
span.SetStatus(codes.Error, "failed to create targetURL")
return
}

Expand All @@ -121,6 +149,8 @@ func main() {
}
if err != nil {
logger.Error(err, "failed to create TLS certificate")
span.RecordError(err)
span.SetStatus(codes.Error, "failed to create TLS certificate")
return
}
cert = &tempCert
Expand All @@ -139,11 +169,15 @@ func main() {
validator, err := proxy.NewAllowlistValidator(*enableSSRFProtection, *poolGroup, *inferencePoolNamespace, *inferencePoolName)
if err != nil {
logger.Error(err, "failed to create SSRF protection validator")
span.RecordError(err)
span.SetStatus(codes.Error, "failed to create SSRF protection validator")
return
}

proxyServer := proxy.NewProxy(*port, targetURL, config)

span.SetStatus(codes.Ok, "")

if err := proxyServer.Start(ctx, cert, validator); err != nil {
logger.Error(err, "failed to start proxy server")
}
Expand Down
8 changes: 4 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ require (
github.com/openai/openai-go v1.12.0
github.com/prometheus/client_golang v1.23.2
github.com/stretchr/testify v1.11.1
go.opentelemetry.io/otel v1.38.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0
go.opentelemetry.io/otel/sdk v1.38.0
go.opentelemetry.io/otel/trace v1.38.0
golang.org/x/sync v0.18.0
google.golang.org/grpc v1.77.0
k8s.io/api v0.34.2
Expand Down Expand Up @@ -95,13 +99,9 @@ require (
github.com/x448/float16 v0.8.4 // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect
go.opentelemetry.io/otel v1.38.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 // indirect
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.38.0 // indirect
go.opentelemetry.io/otel/metric v1.38.0 // indirect
go.opentelemetry.io/otel/sdk v1.38.0 // indirect
go.opentelemetry.io/otel/trace v1.38.0 // indirect
go.opentelemetry.io/proto/otlp v1.7.1 // indirect
go.uber.org/atomic v1.11.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
Expand Down
37 changes: 36 additions & 1 deletion pkg/plugins/pre-request/pd_prerequest.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,15 @@ import (
"fmt"
"net"

"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"

"github.com/llm-d/llm-d-inference-scheduler/pkg/common"
"github.com/llm-d/llm-d-inference-scheduler/pkg/telemetry"
)

const (
Expand Down Expand Up @@ -67,17 +71,48 @@ func (p *PrefillHeaderHandler) WithName(name string) *PrefillHeaderHandler {
}

// PreRequest wires prefill SchedulerProfile result into a header to indicate prefill worker
func (p *PrefillHeaderHandler) PreRequest(_ context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult) {
func (p *PrefillHeaderHandler) PreRequest(ctx context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult) {
tracer := telemetry.Tracer()
_, span := tracer.Start(ctx, "llm_d.epp.prerequest.pd_disaggregation",
trace.WithSpanKind(trace.SpanKindInternal),
)
defer span.End()

// Add component and request attributes
span.SetAttributes(
attribute.String("component", "llm-d-inference-scheduler"),
attribute.String("operation", "prefill_disaggregation"),
)

if request != nil && request.TargetModel != "" {
span.SetAttributes(attribute.String("gen_ai.request.model", request.TargetModel))
}
if request != nil && request.RequestId != "" {
span.SetAttributes(attribute.String("gen_ai.request.id", request.RequestId))
}

if _, found := request.Headers[common.PrefillPodHeader]; found {
request.Headers[common.PrefillPodHeader] = "" // clear header, if already set
}

prefillProfileRunResult, exists := schedulingResult.ProfileResults[p.prefillProfile]
if !exists {
span.SetAttributes(
attribute.Bool("llm_d.epp.pd.disaggregation_enabled", false),
attribute.String("llm_d.epp.pd.reason", "no_prefill_profile_result"),
)
span.SetStatus(codes.Ok, "")
return // prefill profile failed to run or we chose not to run it, no-op in this case
}

targetPod := prefillProfileRunResult.TargetPods[0].GetPod()
prefillHostPort := net.JoinHostPort(targetPod.Address, targetPod.Port)
request.Headers[common.PrefillPodHeader] = prefillHostPort // in the form of <ip:port>

span.SetAttributes(
attribute.Bool("llm_d.epp.pd.disaggregation_enabled", true),
attribute.String("llm_d.epp.pd.prefill_pod_address", targetPod.Address),
attribute.String("llm_d.epp.pd.prefill_pod_port", targetPod.Port),
)
span.SetStatus(codes.Ok, "")
}
Loading
Loading