
Commit b0eebc1

feat: add vLLM remote tokenizer with engine integration (#1328)
Add support for using vLLM's remote tokenizer endpoint to enable tokenization without loading models in gateway plugins. This feature allows the gateway to delegate tokenization to vLLM engine instances, reducing memory usage and improving scalability.

## Key Features

- Integrate vLLM's /tokenize endpoint for remote tokenization
- Implement TokenizerPool for managing per-model tokenizer connections
- Support health checking and automatic failover to the local tokenizer
- Add caching and connection pooling for performance
- Support both vLLM and other inference engines through pod label detection

## Implementation Details

- New remote tokenizer client with retry logic and timeout handling
- TokenizerPool with concurrent access support and automatic cleanup
- Health monitoring with a 5-second timeout for tokenizer endpoints
- Fallback to the local character tokenizer when the remote is unavailable
- Prometheus metrics for monitoring tokenizer pool status

## Configuration

- AIBRIX_ENABLE_VLLM_REMOTE_TOKENIZER: Feature flag (default: false)
- AIBRIX_VLLM_TOKENIZER_ENDPOINT_TEMPLATE: Endpoint format (default: "http://%s:8000")
- AIBRIX_TOKENIZER_HEALTH_CHECK_PERIOD: Health check interval (default: 30s)
- AIBRIX_TOKENIZER_TTL: Unused tokenizer cleanup time (default: 5m)
- AIBRIX_MAX_TOKENIZERS_PER_POOL: Pool size limit (default: 100)

## Review Feedback Addressed

- Changed the default to disabled for production safety
- Fixed race conditions in concurrent access
- Reduced lock contention with double-checked locking
- Added comprehensive test coverage, including benchmarks
- Created a centralized constants package for Kubernetes labels

Tested with vLLM v0.4.0; includes backward-compatibility support.

Signed-off-by: ae86zhizhi <550149470@qq.com>
1 parent 8400d0a commit b0eebc1

File tree

9 files changed: +1471 −21 lines
Lines changed: 68 additions & 0 deletions

# vLLM Remote Tokenizer Feature

This feature enables model-aware remote tokenizer support for vLLM inference engines in the AIBrix gateway.

## Quick Start

Enable the vLLM remote tokenizer with one command:

```bash
kubectl apply -k config/features/vllm-remote-tokenizer/
```

## Configuration

The following environment variables are configured:

| Variable | Default | Description |
|----------|---------|-------------|
| AIBRIX_ENABLE_VLLM_REMOTE_TOKENIZER | false | Enable remote tokenizer feature |
| AIBRIX_VLLM_TOKENIZER_ENDPOINT_TEMPLATE | http://%s:8000 | URL template for vLLM endpoints |
| AIBRIX_TOKENIZER_HEALTH_CHECK_PERIOD | 30s | Health check interval |
| AIBRIX_TOKENIZER_TTL | 5m | Tokenizer cache TTL |
| AIBRIX_MAX_TOKENIZERS_PER_POOL | 100 | Maximum tokenizers per pool |
| AIBRIX_TOKENIZER_REQUEST_TIMEOUT | 10s | Request timeout |

## Customization

To use custom values, copy this directory and modify `gateway-plugins-env-patch.yaml`:

```bash
cp -r config/features/vllm-remote-tokenizer/ config/features/my-vllm-config/
# Edit config/features/my-vllm-config/gateway-plugins-env-patch.yaml
kubectl apply -k config/features/my-vllm-config/
```

## Enable the Feature

To enable the vLLM remote tokenizer after installation:

```bash
kubectl set env deployment/gateway-plugins -n aibrix-system AIBRIX_ENABLE_VLLM_REMOTE_TOKENIZER=true
```

Or use a custom Kustomize overlay with the environment variable set to `true`.

## Disable

To disable, set the environment variable to `false`:

```bash
kubectl set env deployment/gateway-plugins -n aibrix-system AIBRIX_ENABLE_VLLM_REMOTE_TOKENIZER=false
```

## Verification

Check whether the feature is enabled:

```bash
kubectl get deployment gateway-plugins -n aibrix-system -o json | \
  jq '.spec.template.spec.containers[0].env[] | select(.name | startswith("AIBRIX_ENABLE_VLLM"))'
```

Check metrics:

```bash
kubectl port-forward -n aibrix-system svc/gateway-plugins 8080:8080
curl http://localhost:8080/metrics | grep aibrix_tokenizer_pool
```
Lines changed: 23 additions & 0 deletions

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gateway-plugins
  namespace: system
spec:
  template:
    spec:
      containers:
        - name: gateway-plugin
          env:
            - name: AIBRIX_ENABLE_VLLM_REMOTE_TOKENIZER
              value: "false"
            - name: AIBRIX_VLLM_TOKENIZER_ENDPOINT_TEMPLATE
              value: "http://%s:8000"
            - name: AIBRIX_TOKENIZER_HEALTH_CHECK_PERIOD
              value: "30s"
            - name: AIBRIX_TOKENIZER_TTL
              value: "5m"
            - name: AIBRIX_MAX_TOKENIZERS_PER_POOL
              value: "100"
            - name: AIBRIX_TOKENIZER_REQUEST_TIMEOUT
              value: "10s"
```
Lines changed: 16 additions & 0 deletions

```yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: aibrix-system

# This overlay enables vLLM remote tokenizer support
# Apply with: kubectl apply -k config/features/vllm-remote-tokenizer/

resources:
  - ../../gateway/gateway-plugin

patches:
  - path: gateway-plugins-env-patch.yaml
    target:
      kind: Deployment
      name: gateway-plugins
```

pkg/apis/constants/labels.go

Lines changed: 54 additions & 0 deletions

```go
/*
Copyright 2024 The Aibrix Team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package constants

// Label keys used by the Aibrix system.
// The format `resource.aibrix.ai/attribute` is the standard.

const (
	// ModelNameLabel is the label for identifying the model name.
	// Example: "model.aibrix.ai/name": "deepseek-llm-7b-chat"
	ModelNameLabel = "model.aibrix.ai/name"

	// ModelEngineLabel is the label for identifying the inference engine.
	// Example: "model.aibrix.ai/engine": "vllm"
	ModelEngineLabel = "model.aibrix.ai/engine"

	// ModelMetricPortLabel is the label for specifying the metrics port.
	// Example: "model.aibrix.ai/metric-port": "8000"
	ModelMetricPortLabel = "model.aibrix.ai/metric-port"

	// ModelPortLabel is the label for specifying the service port.
	// Example: "model.aibrix.ai/port": "8080"
	ModelPortLabel = "model.aibrix.ai/port"
)

// GetModelName retrieves the model name from pod labels.
func GetModelName(labels map[string]string) string {
	if model, ok := labels[ModelNameLabel]; ok {
		return model
	}
	return ""
}

// GetInferenceEngine retrieves the inference engine from pod labels.
func GetInferenceEngine(labels map[string]string) string {
	if engine, ok := labels[ModelEngineLabel]; ok {
		return engine
	}
	return ""
}
```
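Callers pass pod labels straight to these helpers, which return the empty string when a label is absent rather than erroring. A self-contained usage sketch (the constant and lookup are reproduced inline so the snippet runs standalone, mirroring `constants.GetModelName`):

```go
package main

import "fmt"

// modelNameLabel mirrors constants.ModelNameLabel from the new package.
const modelNameLabel = "model.aibrix.ai/name"

// getModelName mirrors constants.GetModelName: it returns the empty
// string when the label is missing, letting callers choose a default.
func getModelName(labels map[string]string) string {
	if model, ok := labels[modelNameLabel]; ok {
		return model
	}
	return ""
}

func main() {
	podLabels := map[string]string{
		"model.aibrix.ai/name":   "deepseek-llm-7b-chat",
		"model.aibrix.ai/engine": "vllm",
	}
	fmt.Println(getModelName(podLabels))           // deepseek-llm-7b-chat
	fmt.Println(getModelName(map[string]string{})) // "" — label missing
}
```

Returning `""` instead of an error keeps call sites simple; the cache code below falls back to a default engine value when the lookup comes back empty.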

pkg/cache/cache_metrics.go

Lines changed: 5 additions & 6 deletions

```diff
@@ -21,6 +21,7 @@ import (

 	prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
 	dto "github.com/prometheus/client_model/go"
+	"github.com/vllm-project/aibrix/pkg/apis/constants"
 	"github.com/vllm-project/aibrix/pkg/metrics"
 	"github.com/vllm-project/aibrix/pkg/utils"
 	"k8s.io/klog/v2"
@@ -29,10 +30,8 @@ import (
 const (
 	// When the engine's HTTP proxy is separated from the engine itself,
 	// the request port and metrics port may differ, so a dedicated metrics port is required.
-	MetricPortLabel = "model.aibrix.ai/metric-port"
-	engineLabel     = "model.aibrix.ai/engine"
-	portLabel       = "model.aibrix.ai/port"
-	modelLabel      = "model.aibrix.ai/name"
+	// Note: Using MetricPortLabel for backward compatibility, but it's the same as constants.ModelMetricPortLabel
+	MetricPortLabel = constants.ModelMetricPortLabel
 	defaultMetricPort                   = 8000
 	defaultEngineLabelValue             = "vllm"
 	defaultPodMetricRefreshIntervalInMS = 50
@@ -337,7 +336,7 @@ func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily,
 		klog.V(4).Infof("Cannot find labelMetricName %v in collected metrics names", labelMetricName)
 		return nil, false
 	}
-	engineType, err := getPodLabel(pod, engineLabel)
+	engineType, err := getPodLabel(pod, constants.ModelEngineLabel)
 	if engineType == "" {
 		klog.V(4).Infof(err.Error())
 		engineType = defaultEngineLabelValue
@@ -363,7 +362,7 @@ func (c *Store) updatePodRecord(pod *Pod, modelName string, metricName string, s
 	} else if scope == metrics.PodModelMetricScope {
 		var err error
 		if modelName == "" {
-			modelName, err = getPodLabel(pod, modelLabel)
+			modelName, err = getPodLabel(pod, constants.ModelNameLabel)
 			if err != nil {
 				return fmt.Errorf("modelName should not be empty for scope %v", scope)
 			}
```

pkg/plugins/gateway/algorithms/prefix_cache.go

Lines changed: 53 additions & 15 deletions

```diff
@@ -20,6 +20,7 @@ import (
 	"math"
 	"math/rand"
 	"sort"
+	"time"

 	"github.com/vllm-project/aibrix/pkg/cache"
 	"github.com/vllm-project/aibrix/pkg/types"
@@ -41,6 +42,14 @@ var (
 	tokenizerType                          = utils.LoadEnv("AIBRIX_PREFIX_CACHE_TOKENIZER_TYPE", "character")
 	podRunningRequestImbalanceAbsCount int = utils.LoadEnvInt("AIBRIX_PREFIX_CACHE_POD_RUNNING_REQUEST_IMBALANCE_ABS_COUNT", defaultPodRunningRequestImbalanceAbsCount)
 	standardDeviationFactor            int = utils.LoadEnvInt("AIBRIX_PREFIX_CACHE_STANDARD_DEVIATION_FACTOR", defaultStandardDeviationFactor)
+
+	// vLLM Remote Tokenizer configuration
+	enableVLLMRemoteTokenizer     = utils.LoadEnvBool("AIBRIX_ENABLE_VLLM_REMOTE_TOKENIZER", false)
+	vllmTokenizerEndpointTemplate = utils.LoadEnv("AIBRIX_VLLM_TOKENIZER_ENDPOINT_TEMPLATE", "http://%s:8000")
+	tokenizerHealthCheckPeriod    = utils.LoadEnvDuration("AIBRIX_TOKENIZER_HEALTH_CHECK_PERIOD", 30*time.Second)
+	tokenizerTTL                  = utils.LoadEnvDuration("AIBRIX_TOKENIZER_TTL", 5*time.Minute)
+	maxTokenizersPerPool          = utils.LoadEnvInt("AIBRIX_MAX_TOKENIZERS_PER_POOL", 100)
+	tokenizerRequestTimeout       = utils.LoadEnvDuration("AIBRIX_TOKENIZER_REQUEST_TIMEOUT", 10*time.Second)
 )

 func init() {
@@ -49,41 +58,60 @@ func init() {

 type prefixCacheRouter struct {
 	cache              cache.Cache
-	tokenizer          tokenizer.Tokenizer
+	tokenizer          tokenizer.Tokenizer // Fallback tokenizer for backward compatibility
+	tokenizerPool      *TokenizerPool      // Model-aware tokenizer pool
 	prefixCacheIndexer *prefixcacheindexer.PrefixHashTable
 }

 func NewPrefixCacheRouter() (types.Router, error) {
-	// Create tokenizer based on type
+	c, err := cache.Get()
+	if err != nil {
+		klog.Error("fail to get cache store in prefix cache router")
+		return nil, err
+	}
+
+	// Create fallback tokenizer based on type
 	// Supported tokenizers: ["character", "tiktoken"]
 	// Default: "character" for any unrecognized type
-	// TODO: Add support for "remote" and "vllm" tokenizer types in a future PR.
-	// This will require proper configuration handling for remote endpoints.
-	var tokenizerObj tokenizer.Tokenizer
+	var fallbackTokenizer tokenizer.Tokenizer
 	if tokenizerType == "tiktoken" {
-		tokenizerObj = tokenizer.NewTiktokenTokenizer()
+		fallbackTokenizer = tokenizer.NewTiktokenTokenizer()
 	} else {
 		// Default to character tokenizer for backward compatibility
 		if tokenizerType != "character" {
 			klog.InfoS("unrecognized tokenizer type, defaulting to character", "type", tokenizerType)
 		}
-		tokenizerObj = tokenizer.NewCharacterTokenizer()
+		fallbackTokenizer = tokenizer.NewCharacterTokenizer()
 	}

-	c, err := cache.Get()
-	if err != nil {
-		klog.Error("fail to get cache store in prefix cache router")
-		return nil, err
+	// Initialize TokenizerPool for vLLM remote tokenizer support
+	poolConfig := TokenizerPoolConfig{
+		EnableVLLMRemote:     enableVLLMRemoteTokenizer,
+		EndpointTemplate:     vllmTokenizerEndpointTemplate,
+		HealthCheckPeriod:    tokenizerHealthCheckPeriod,
+		TokenizerTTL:         tokenizerTTL,
+		MaxTokenizersPerPool: maxTokenizersPerPool,
+		FallbackTokenizer:    fallbackTokenizer,
+		Timeout:              tokenizerRequestTimeout,
+		ModelServiceMap:      make(map[string]string), // Can be populated from config later
 	}

+	pool := NewTokenizerPool(poolConfig, c)
+
 	klog.InfoS("prefix_cache_configurations",
 		"tokenizer_type", tokenizerType,
 		"pod_running_request_imbalance_abs_count", podRunningRequestImbalanceAbsCount,
-		"matched_pods_running_requests_standard_deviation_factor", standardDeviationFactor)
+		"matched_pods_running_requests_standard_deviation_factor", standardDeviationFactor,
+		"enable_vllm_remote_tokenizer", enableVLLMRemoteTokenizer,
+		"vllm_tokenizer_endpoint_template", vllmTokenizerEndpointTemplate,
+		"tokenizer_health_check_period", tokenizerHealthCheckPeriod,
+		"tokenizer_ttl", tokenizerTTL,
+		"max_tokenizers_per_pool", maxTokenizersPerPool)

 	return prefixCacheRouter{
 		cache:              c,
-		tokenizer:          tokenizerObj,
+		tokenizer:          fallbackTokenizer, // Keep for backward compatibility
+		tokenizerPool:      pool,
 		prefixCacheIndexer: prefixcacheindexer.NewPrefixHashTable(),
 	}, nil
 }
@@ -93,12 +121,22 @@ func (p prefixCacheRouter) Route(ctx *types.RoutingContext, readyPodList types.P
 	var matchedPods map[string]int
 	var targetPod *v1.Pod

-	tokens, err := p.tokenizer.TokenizeInputText(ctx.Message)
+	readyPods := readyPodList.All()
+
+	// Get tokenizer - use pool only if vLLM remote tokenizer is enabled
+	var tokenizerObj tokenizer.Tokenizer
+	if enableVLLMRemoteTokenizer {
+		tokenizerObj = p.tokenizerPool.GetTokenizer(ctx.Model, readyPods)
+	} else {
+		// Use the original tokenizer for backward compatibility
+		tokenizerObj = p.tokenizer
+	}
+
+	tokens, err := tokenizerObj.TokenizeInputText(ctx.Message)
 	if err != nil {
 		return "", err
 	}

-	readyPods := readyPodList.All()
 	readyPodsMap := map[string]struct{}{}
 	for _, pod := range readyPods {
 		readyPodsMap[pod.Name] = struct{}{}
```