Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmd/epp/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,8 @@ func (r *Runner) parsePluginsConfiguration(ctx context.Context) error {
return fmt.Errorf("failed to load the configuration - %w", err)
}

setupLog.Info("Configuration file loaded", "config", config)

r.schedulerConfig, err = loader.LoadSchedulerConfig(config.SchedulingProfiles, handle)
if err != nil {
return fmt.Errorf("failed to create Scheduler configuration - %w", err)
Expand Down
85 changes: 85 additions & 0 deletions config/charts/inferencepool/templates/epp-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# ConfigMap holding the endpoint picker (EPP) plugin configuration files.
# Each key under "data" is one EndpointPickerConfig document:
#   - default-plugins.yaml: a decision-tree filter (low-latency-filter) followed by random-picker.
#   - plugins-v2.yaml: queue, kv-cache and prefix-cache scorers (weight 1 each) followed by max-score-picker.
# Extra files are appended from .Values.inferenceExtension.pluginsCustomConfig when that key is set.
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "gateway-api-inference-extension.name" . }}
namespace: {{ .Release.Namespace }}
data:
default-plugins.yaml: |
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
- type: low-queue-filter
parameters:
threshold: 128
- type: lora-affinity-filter
parameters:
threshold: 0.999
- type: least-queue-filter
- type: least-kv-cache-filter
- type: decision-tree-filter
name: low-latency-filter
parameters:
current:
pluginRef: low-queue-filter
nextOnSuccess:
decisionTree:
current:
pluginRef: lora-affinity-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: least-queue-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: least-kv-cache-filter
nextOnFailure:
decisionTree:
current:
pluginRef: least-queue-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: lora-affinity-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: least-kv-cache-filter
- type: random-picker
parameters:
maxNumOfEndpoints: 1
- type: single-profile-handler
schedulingProfiles:
- name: default
plugins:
- pluginRef: low-latency-filter
- pluginRef: random-picker
plugins-v2.yaml: |
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
- type: queue-scorer
- type: kv-cache-scorer
- type: prefix-cache-scorer
parameters:
hashBlockSize: 64
maxPrefixBlocksToMatch: 256
lruCapacityPerServer: 31250
- type: max-score-picker
parameters:
maxNumOfEndpoints: 1
- type: single-profile-handler
schedulingProfiles:
- name: default
plugins:
- pluginRef: queue-scorer
weight: 1
- pluginRef: kv-cache-scorer
weight: 1
- pluginRef: prefix-cache-scorer
weight: 1
- pluginRef: max-score-picker
{{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }}
{{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }}
{{- end }}

9 changes: 9 additions & 0 deletions config/charts/inferencepool/templates/epp-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ spec:
- "9003"
- -metricsPort
- "9090"
# Absolute path: the plugins ConfigMap volume is mounted at /config, so this
# resolves regardless of the container's working directory.
- -configFile
- "/config/{{ .Values.inferenceExtension.pluginsConfigFile }}"
# Boolean flags must use the -flag=value form; the space-separated "-flag value" form only works for non-boolean flags.
# See https://pkg.go.dev/flag#hdr-Command_line_flag_syntax
- "-enablePprof={{ .Values.inferenceExtension.enablePprof }}"
{{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
Expand Down Expand Up @@ -69,3 +71,10 @@ spec:
- name: {{ $key }}
value: {{ $value | quote }}
{{- end }}
volumeMounts:
- name: plugins-config-volume
mountPath: "/config"
volumes:
- name: plugins-config-volume
configMap:
name: {{ include "gateway-api-inference-extension.name" . }}
20 changes: 20 additions & 0 deletions config/charts/inferencepool/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,26 @@ inferenceExtension:
extProcPort: 9002
env: {}
enablePprof: true # Enable pprof handlers for profiling and debugging
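# Name of the plugins configuration file the endpoint picker loads at startup.
# Must match a key in the EPP ConfigMap: "default-plugins.yaml", "plugins-v2.yaml",
# or a file name defined under pluginsCustomConfig below.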
pluginsConfigFile: "default-plugins.yaml"
# pluginsCustomConfig:
# custom-plugins.yaml: |
# apiVersion: inference.networking.x-k8s.io/v1alpha1
# kind: EndpointPickerConfig
# plugins:
# - type: custom-scorer
# parameters:
# custom-threshold: 64
# - type: max-score-picker
# - type: single-profile-handler
# schedulingProfiles:
# - name: default
# plugins:
# - pluginRef: custom-scorer
# weight: 1
# - pluginRef: max-score-picker
# weight: 1

# Example environment variables:
# env:
# KV_CACHE_SCORE_WEIGHT: "1"
Expand Down