
Commit d8d6659

feat: Add per server LRU capacity
Signed-off-by: Kfir Toledo <kfir.toledo@ibm.com>
1 parent 616c670 commit d8d6659

6 files changed: +113 −75 lines

cmd/epp/main.go

Lines changed: 1 addition & 2 deletions

@@ -122,9 +122,8 @@ func loadPrefixCacheConfig() prefix.Config {
 
 	return prefix.Config{
 		HashBlockSize:          envutil.GetEnvInt("PREFIX_CACHE_HASH_BLOCK_SIZE", prefix.DefaultHashBlockSize, baseLogger),
-		MaxPodsPerPrefix:       envutil.GetEnvInt("PREFIX_MAX_PODS_PER_PREFIX", prefix.DefaultMaxPodsPerPrefix, baseLogger),
 		MaxPrefixBlocksToMatch: envutil.GetEnvInt("PREFIX_CACHE_MAX_PREFIX_BLOCKS", prefix.DefaultMaxPrefixBlocks, baseLogger),
-		LRUIndexerCapacity:     envutil.GetEnvInt("PREFIX_CACHE_LRU_CAPACITY", prefix.DefaultLRUIndexerCapacity, baseLogger),
+		LRUCapacityPerServer:   envutil.GetEnvInt("PREFIX_CACHE_LRU_CAPACITY_PER_SERVER", prefix.DefaultLRUCapacityPerServer, baseLogger),
 	}
 }
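
The only change here is the renamed field and environment variable. For illustration, a minimal sketch of building the prefix scorer with the same configuration directly (the standalone `main` and the hard-coded capacity are illustrative assumptions; in the EPP binary these values come from the `PREFIX_CACHE_*` environment variables shown above):

```go
package main

import (
	"fmt"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
)

func main() {
	// Illustrative only: construct the prefix scorer with the package defaults,
	// setting the new per-server LRU capacity the same way
	// PREFIX_CACHE_LRU_CAPACITY_PER_SERVER would.
	cfg := prefix.Config{
		HashBlockSize:          prefix.DefaultHashBlockSize,
		MaxPrefixBlocksToMatch: prefix.DefaultMaxPrefixBlocks,
		LRUCapacityPerServer:   31250, // e.g. the per-server estimate from the guide further below
	}
	plugin := prefix.New(cfg)
	fmt.Println("prefix plugin:", plugin.Name(), "capacity per server:", cfg.LRUCapacityPerServer)
}
```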

pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go

Lines changed: 89 additions & 42 deletions

@@ -18,6 +18,7 @@ package prefix
 
 import (
 	"context"
+	"fmt"
 	"sync"
 	"time"
 
@@ -27,32 +28,23 @@ import (
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
-// podSet holds an LRU cache of servers that may have a specific prefix hash.
-type podSet struct {
-	enteries *lru.Cache[ServerID, struct{}] // Can be extended with metadata (e.g., timestamp).
-}
-
 // An indexer maintains an LRU cache of prompt prefix hashes and the server(s) that might have that
-// prefix cached .
+// prefix cached.
 type indexer struct {
-	mu                sync.RWMutex
-	cache             *lru.Cache[BlockHash, *podSet]
-	maxCacheSize      int
-	maxServersToMatch int
+	mu         sync.RWMutex
+	hashToPods map[BlockHash]podSet                       // the lookup data structure to find pods that have the BlockHash cached
+	podToLRU   map[string]*lru.Cache[BlockHash, struct{}] // key is pod namespacedName, value is an LRU cache
+	maxLRUSize int
 }
 
 // newIndexer initializes an indexer with size limits and starts cache size reporting.
-func newIndexer(maxCacheSize, maxServersToMatch int) *indexer {
-	c, err := lru.New[BlockHash, *podSet](maxCacheSize)
-	if err != nil {
-		panic(err)
-	}
+func newIndexer(maxLRUSize int) *indexer {
 	ix := &indexer{
-		cache:             c,
-		maxCacheSize:      maxCacheSize,
-		maxServersToMatch: maxServersToMatch,
+		hashToPods: make(map[BlockHash]podSet),
+		podToLRU:   make(map[string]*lru.Cache[BlockHash, struct{}]),
+		maxLRUSize: maxLRUSize,
 	}
-	go ix.ReportCacheSize(time.Second)
+	go ix.ReportLRUSize(time.Second)
 	return ix
 }
 
@@ -61,51 +53,106 @@ func (i *indexer) Add(hashes []BlockHash, pod ServerID) {
 	if pod.Name == "" {
 		return
 	}
-
 	i.mu.Lock()
-	defer i.mu.Unlock()
+	// Check if an LRU cache already exists for this pod.
+	podName := pod.String()
+	lruForPod, exists := i.podToLRU[podName]
+	if !exists {
+		newLRU, _ := lru.NewWithEvict[BlockHash, struct{}](i.maxLRUSize, i.makeEvictionFn(pod))
+		i.podToLRU[podName] = newLRU
+		lruForPod = newLRU
+	}
+	i.mu.Unlock()
 
+	// Add to LRU (may evict)
 	for _, hash := range hashes {
-		p, ok := i.cache.Get(hash)
-		if !ok {
-			// Create podSet with new LRU
-			podLRU, _ := lru.New[ServerID, struct{}](i.maxServersToMatch)
-			p = &podSet{enteries: podLRU}
-			i.cache.Add(hash, p)
-		}
+		lruForPod.Add(hash, struct{}{})
+	}
 
-		p.enteries.Add(pod, struct{}{})
+	// Update hashToPods once under lock
+	i.mu.Lock()
+	for _, hash := range hashes {
+		pods := i.hashToPods[hash]
+		if pods == nil {
+			pods = make(podSet)
+		}
+		pods[pod] = struct{}{}
+		i.hashToPods[hash] = pods
 	}
+	i.mu.Unlock()
+
 }
 
 // Get returns a set of servers that have the given prefix hash cached.
-func (i *indexer) Get(hash BlockHash) map[ServerID]bool {
+func (i *indexer) Get(hash BlockHash) podSet {
 	i.mu.RLock()
 	defer i.mu.RUnlock()
 
-	res := map[ServerID]bool{}
-	pods, ok := i.cache.Get(hash)
+	res := podSet{}
+	pods, ok := i.hashToPods[hash]
 	if !ok {
 		return res
 	}
-	for _, pod := range pods.enteries.Keys() {
-		res[pod] = true
+
+	return pods
+}
+
+// makeEvictionFn returns a per-pod LRU eviction callback that removes the pod from hashToPods on eviction.
+func (i *indexer) makeEvictionFn(pod ServerID) func(BlockHash, struct{}) {
+	return func(hash BlockHash, _ struct{}) {
+		fmt.Printf("Evicted hash %v from pod %s\n", hash, pod)
+
+		i.mu.Lock()
+		defer i.mu.Unlock()
+		print("enter eviction")
+		// Remove the pod from the hash→pods map
+		if podSet, ok := i.hashToPods[hash]; ok {
+			delete(podSet, pod)
+			if len(podSet) == 0 {
+				delete(i.hashToPods, hash)
+			} else {
+				i.hashToPods[hash] = podSet
+			}
+		}
+		print("After eviction")
 	}
-	return res
 }
 
-// ReportCacheSize starts a goroutine that periodically reports the cache size metric.
-func (i *indexer) ReportCacheSize(interval time.Duration) {
+// ReportLRUSize starts a goroutine that periodically reports the LRU cache size metric.
+func (i *indexer) ReportLRUSize(interval time.Duration) {
 	ticker := time.NewTicker(interval)
 	defer ticker.Stop()
 	for range ticker.C {
 		i.mu.RLock()
-		size := i.cache.Len()
-		metrics.RecordPrefixCacheSize(int64(size))
-		log.FromContext(context.TODO()).V(logutil.TRACE).Info("LRU",
-			"# entries", size,
-			"prefix cache utilization [%]", float64(size)*100/float64(i.maxCacheSize),
+		totalEntries := 0
+		maxPodEntries := 0
+		maxPodName := ""
+
+		for podName, lruCache := range i.podToLRU {
+			size := lruCache.Len()
+			totalEntries += size
+			if size > maxPodEntries {
+				maxPodEntries = size
+				maxPodName = podName
+			}
+		}
+
+		numPods := len(i.podToLRU)
+		avg := 0.0
+		if numPods > 0 {
+			avg = float64(totalEntries) / float64(numPods)
+		}
+
+		metrics.RecordPrefixCacheSize(int64(totalEntries))
+		log.FromContext(context.TODO()).V(logutil.TRACE).Info("Prefix cache state",
+			"total entries", totalEntries,
+			"# pods", numPods,
+			"avg entries per pod", avg,
+			"pod with max cache", maxPodName,
+			"max pod size", maxPodEntries,
+			"global max LRU cache capacity per pod", i.maxLRUSize,
 		)
+
 		i.mu.RUnlock()
 	}
 }
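
In short, the indexer now keeps one bounded LRU of block hashes per pod plus a reverse map from hash to pods, and the per-pod eviction callback keeps the two structures in sync. A minimal, single-goroutine sketch of that pattern (locking and the real BlockHash/ServerID types are elided; the toy names and int/string keys are stand-ins, and it assumes the same hashicorp/golang-lru/v2 package the indexer imports):

```go
package main

import (
	"fmt"

	lru "github.com/hashicorp/golang-lru/v2"
)

// toyIndexer mirrors the new layout: one capped LRU of hashes per pod,
// plus a reverse map from hash to the pods that still hold it.
type toyIndexer struct {
	hashToPods map[int]map[string]struct{}
	podToLRU   map[string]*lru.Cache[int, struct{}]
	capacity   int
}

func newToyIndexer(capacity int) *toyIndexer {
	return &toyIndexer{
		hashToPods: make(map[int]map[string]struct{}),
		podToLRU:   make(map[string]*lru.Cache[int, struct{}]),
		capacity:   capacity,
	}
}

func (t *toyIndexer) add(pod string, hashes ...int) {
	c, ok := t.podToLRU[pod]
	if !ok {
		// The eviction callback keeps the reverse index consistent: when a hash
		// falls out of this pod's LRU, the pod is also dropped from hashToPods.
		c, _ = lru.NewWithEvict[int, struct{}](t.capacity, func(h int, _ struct{}) {
			delete(t.hashToPods[h], pod)
			if len(t.hashToPods[h]) == 0 {
				delete(t.hashToPods, h)
			}
		})
		t.podToLRU[pod] = c
	}
	for _, h := range hashes {
		c.Add(h, struct{}{}) // may trigger the eviction callback above
		if t.hashToPods[h] == nil {
			t.hashToPods[h] = make(map[string]struct{})
		}
		t.hashToPods[h][pod] = struct{}{}
	}
}

func main() {
	ix := newToyIndexer(2)
	ix.add("pod-a", 1, 2, 3)   // capacity 2: hash 1 is evicted from pod-a
	fmt.Println(ix.hashToPods) // map[2:map[pod-a:{}] 3:map[pod-a:{}]]
}
```

The real Add releases the indexer mutex before calling the per-pod LRU's Add for the same reason the sketch hints at: the eviction callback re-acquires that mutex, so holding it across Add would deadlock.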

pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go

Lines changed: 7 additions & 6 deletions

@@ -22,24 +22,25 @@ import (
 )
 
 func TestIndexer_AddAndGet(t *testing.T) {
-	i := newIndexer(2, 2)
+	i := newIndexer(2)
 
 	hash1 := BlockHash(1)
 	server := ServerID{Namespace: "default", Name: "server1"}
-
+	serverName := server.String()
 	// Add an entry to the cache
 	i.Add([]BlockHash{hash1}, server)
-
 	// Retrieve the entry
-	assert.Equal(t, 1, i.cache.Len(), "Cache size should be 1 after adding an entry")
+	assert.Equal(t, 1, i.podToLRU[serverName].Len(), "Cache size should be 1 after adding an entry")
 	servers := i.Get(hash1)
 	assert.Contains(t, servers, server, "Cache should contain the added server")
 
 	// Add another entry to the cache, the cache size should be incremented to 2.
 	i.Add([]BlockHash{BlockHash(2)}, server)
-	assert.Equal(t, 2, i.cache.Len(), "Cache size should be 2 after adding an entry")
+	assert.Equal(t, 2, i.podToLRU[serverName].Len(), "Cache size should be 2 after adding an entry")
 
 	// Add another entry to the cache, which should evict the first one due to max size.
+	print("before Add")
 	i.Add([]BlockHash{BlockHash(3)}, server)
-	assert.Equal(t, 2, i.cache.Len(), "Cache size should still be 2 after adding an entry")
+	print("after ADD")
+	assert.Equal(t, 2, i.podToLRU[serverName].Len(), "Cache size should still be 2 after adding an entry")
 }

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go

Lines changed: 11 additions & 15 deletions

@@ -32,10 +32,6 @@ import (
 
 const (
 	DefaultScorerWeight = 1
-	// DefaultMaxPodsPerPrefix defines the maximum number of pods (servers) to track per prefix hash in the LRU indexer.
-	// This limits the number of recent pods associated with a given prefix to reduce memory usage
-	// and ensure faster lookup. When the limit is reached, the least recently used pod is evicted.
-	DefaultMaxPodsPerPrefix = 4
 	// vLLM default token block size is 16, and a good guess of average characters per token is 4.
 	DefaultHashBlockSize = 64
 	// The maximum number of blocks to match. Two long requests with the same prefix up to this
@@ -44,16 +40,15 @@ const (
 	// accuracy. Use a small value if most requests are short to reduce cache size and speed up the
 	// matching process. Use a large value if most requests are long to increase the matching accuracy.
 	DefaultMaxPrefixBlocks = 256
-	// The indexer is an approximation to the actual prefix cache state on the model servers.
+	// The indexer is an approximation of the actual prefix LRU cache state on each model server (pod).
 	// A small capacity ensures a high accuracy of cache hit on the model server, but it will
 	// increase the chance of false negatives. A high capacity does the opposite.
 	// To properly size this, consider the sum of the total number of cache entries on all model
 	// servers. Consider the llama3 8B model on 8 H100 80GB GPUs. The size of the model weight is
 	// about 16GB. Assume 50% of the remaining HBM is used for caching prefixes, we have 32GB. Each
 	// token is about 128KB in size, so we can cache 250K tokens. Using the default block size of 16
-	// in vLLM, we will have 250K / 16 = 15.6K blocks. In total we have 15.6K * 8 = 124.8K blocks, or
-	// roughly 130K.
-	DefaultLRUIndexerCapacity = 130000
+	// in vLLM, we will have 250K / 16 = 15.6K blocks.
+	DefaultLRUCapacityPerServer = 15000
 )
 
 type Config struct {
@@ -63,19 +58,20 @@ type Config struct {
 	// MaxPrefixBlocksToMatch is the maximum number of prefix blocks to match. Input beyond this limit will
 	// be ignored.
 	MaxPrefixBlocksToMatch int
-	// MaxPodsPerPrefix defines the maximum number of pods (servers) to track per prefix hash in the LRU indexer.
-	MaxPodsPerPrefix int
-	// Max (approximate) size of the LRU indexer in number of entries.
-	LRUIndexerCapacity int
+	// Max (approximate) size of the LRU indexer in number of entries per server (pod).
+	LRUCapacityPerServer int
 }
 
 type Plugin struct {
 	Config
 	indexer Indexer
 }
 
+// podSet holds the set of pods (servers) that may have a specific prefix hash.
+type podSet map[ServerID]struct{}
+
 type Indexer interface {
-	Get(hash BlockHash) map[ServerID]bool
+	Get(hash BlockHash) podSet
 	Add(hashes []BlockHash, server ServerID)
 }
 
@@ -121,7 +117,7 @@ var _ framework.PostCycle = &Plugin{}
 func New(config Config) *Plugin {
 	m := &Plugin{
 		Config:  config,
-		indexer: newIndexer(config.LRUIndexerCapacity, config.MaxPodsPerPrefix),
+		indexer: newIndexer(config.LRUCapacityPerServer),
 	}
 	return m
 }
@@ -135,7 +131,7 @@ func (m *Plugin) Name() string {
 func (m *Plugin) Score(ctx context.Context, request *types.LLMRequest, cycleState *types.CycleState, pods []types.Pod) map[types.Pod]float64 {
 	loggerTrace := log.FromContext(ctx).V(logutil.TRACE)
 	// pre score step, hashing prompt and find longest prefix match.
-	hashes := hashPrompt(ctx, request, m.HashBlockSize, m.MaxPodsPerPrefix)
+	hashes := hashPrompt(ctx, request, m.HashBlockSize, m.MaxPrefixBlocksToMatch)
 	state := &schedulingContextState{
 		PrefixHashes:       hashes,
 		PrefixCacheServers: m.matchLongestPrefix(ctx, hashes),
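
Restating the sizing comment above with the capacity now expressed per server rather than summed across servers (same llama3 8B / H100 assumptions as the comment; the numbers are the comment's own):

```
32GB HBM reserved for prefix cache / 128KB per token ≈ 250K cached tokens per server
250K tokens / 16 tokens per block ≈ 15.6K blocks per server → DefaultLRUCapacityPerServer = 15000
(the old global default, DefaultLRUIndexerCapacity = 130000, was roughly 15.6K * 8 servers)
```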

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go

Lines changed: 2 additions & 4 deletions

@@ -35,8 +35,7 @@ func TestPrefixPlugin(t *testing.T) {
 	config := Config{
 		HashBlockSize:          4,
 		MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
-		LRUIndexerCapacity:     DefaultLRUIndexerCapacity,
-		MaxPodsPerPrefix:       DefaultMaxPodsPerPrefix,
+		LRUCapacityPerServer:   DefaultLRUCapacityPerServer,
 	}
 	plugin := New(config)
 
@@ -150,8 +149,7 @@ func BenchmarkPrefixPluginStress(b *testing.B) {
 	config := Config{
 		HashBlockSize:          blockSize,
 		MaxPrefixBlocksToMatch: maxPrefixBlocks,
-		LRUIndexerCapacity:     DefaultLRUIndexerCapacity,
-		MaxPodsPerPrefix:       DefaultMaxPodsPerPrefix,
+		LRUCapacityPerServer:   DefaultLRUCapacityPerServer,
 	}
 
 	plugin := New(config)

site-src/guides/epp-configuration/prefix-aware.md

Lines changed: 3 additions & 6 deletions

@@ -32,11 +32,9 @@ extremely long inputs.
   128 (or 128*64=8192 characters, or roughly 2048 tokens). This is useful to tradeoff prefix match accuracy
   for performance.
 
-* `PREFIX_CACHE_LRU_CAPACITY`: Maximum capacity the prefix LRU indexer in number of block hashes. Below
+* `PREFIX_CACHE_LRU_CAPACITY_PER_SERVER`: Maximum capacity of the prefix LRU cache, in number of block hashes, per server (pod). Below
   shows a detailed analysis on how to estimate this.
-* `PREFIX_MAX_PODS_PER_PREFIX`: Defines the maximum number of pods (servers) tracked per prefix hash in the internal LRU cache.
-This setting helps optimize memory usage by retaining only the hottest (most recently active) pods for each prefix.
-When the limit is reached, older pods are evicted based on least-recently-used (LRU) order.
+
 
 
 
@@ -68,7 +66,6 @@ When the limit is reached, older pods are evicted based on least-recently-used (LRU) order.
 # assume avg_chars_per_token = 4, prefix_indexer_hash_block_size = 64 (default)
 # each entry is about 358KB, so the memory footrpint is abut 11 MB per server
 lru_indexer_capacity_per_server = 500,000*4/64 = 31250
-lru_indexer_capacity_total = 3 * 31250 = 93750
 ```
 
 See the [Use Helm section](#helm) to install an inferencepool with the environment variables.
@@ -87,7 +84,7 @@ $ helm install triton-llama3-8b-instruct \
   --set provider.name=[none|gke] \
   --set inferenceExtension.env.EXPERIMENTAL_USE_SCHEDULER_V2=true \
   --set inferenceExtension.env.ENABLE_PREFIX_CACHE_SCHEDULING=true \
-  --set inferenceExtension.env.PREFIX_CACHE_LRU_CAPACITY=93750 \
+  --set inferenceExtension.env.PREFIX_CACHE_LRU_CAPACITY_PER_SERVER=31250 \
   --set inferenceExtension.env.PREFIX_CACHE_MAX_PREFIX_BLOCKS=1024 \
   oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
 ```
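
Because the capacity is now per server, the guide's estimate no longer needs to be multiplied by the replica count. Using the guide's own numbers (and the 3-replica example the old total was based on):

```
# old, global:      PREFIX_CACHE_LRU_CAPACITY            = 3 * 31250     = 93750
# new, per server:  PREFIX_CACHE_LRU_CAPACITY_PER_SERVER = 500,000*4/64  = 31250
```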
