
Commit 74b1f53

fix: Fix typos and error handling
Signed-off-by: Kfir Toledo <kfir.toledo@ibm.com>
1 parent 0192528 commit 74b1f53

3 files changed (+31, −33 lines)

pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go

Lines changed: 16 additions & 19 deletions
@@ -32,34 +32,37 @@ import (
 // prefix cached.
 type indexer struct {
     mu         sync.RWMutex
-    hashToPods map[BlockHash]podSet                       // the lookup data structure to find pods that have the BlockHash cached
-    podToLRU   map[string]*lru.Cache[BlockHash, struct{}] // key is pod namespacedName, value is an LRU cache
+    hashToPods map[BlockHash]podSet                         // the lookup data structure to find pods that have the BlockHash cached
+    podToLRU   map[ServerID]*lru.Cache[BlockHash, struct{}] // key is pod namespacedName, value is an LRU cache
     maxLRUSize int
 }

 // newIndexer initializes an indexer with size limits and starts cache size reporting.
 func newIndexer(maxLRUSize int) *indexer {
     ix := &indexer{
         hashToPods: make(map[BlockHash]podSet),
-        podToLRU:   make(map[string]*lru.Cache[BlockHash, struct{}]),
+        podToLRU:   make(map[ServerID]*lru.Cache[BlockHash, struct{}]),
         maxLRUSize: maxLRUSize,
     }
     go ix.ReportLRUSize(time.Second)
     return ix
 }

 // Add adds a list of prefix hashes to the cache, tied to the server.
-func (i *indexer) Add(hashes []BlockHash, pod ServerID) {
+func (i *indexer) Add(hashes []BlockHash, pod ServerID) error {
     if pod.Name == "" {
-        return
+        return fmt.Errorf("pod name is empty")
     }
     i.mu.Lock()
     // Check if the LRU pod exist
-    podName := pod.String()
-    lruForPod, exists := i.podToLRU[podName]
+    lruForPod, exists := i.podToLRU[pod]
     if !exists {
-        newLRU, _ := lru.NewWithEvict[BlockHash, struct{}](i.maxLRUSize, i.makeEvictionFn(pod))
-        i.podToLRU[podName] = newLRU
+        newLRU, err := lru.NewWithEvict[BlockHash, struct{}](i.maxLRUSize, i.makeEvictionFn(pod))
+        if err != nil {
+            i.mu.Unlock()
+            return fmt.Errorf("failed to create LRU for pod %s: %w", pod, err)
+        }
+        i.podToLRU[pod] = newLRU
         lruForPod = newLRU
     }
     i.mu.Unlock()
@@ -80,7 +83,7 @@ func (i *indexer) Add(hashes []BlockHash, pod ServerID) {
         i.hashToPods[hash] = pods
     }
     i.mu.Unlock()
-
+    return nil
 }

 // Get returns a set of servers that have the given prefix hash cached.
@@ -100,21 +103,15 @@ func (i *indexer) Get(hash BlockHash) podSet {
 // makeEvictionFn returns a per-pod LRU eviction callback that removes the pod from hashToPods on eviction.
 func (i *indexer) makeEvictionFn(pod ServerID) func(BlockHash, struct{}) {
     return func(hash BlockHash, _ struct{}) {
-        fmt.Printf("Evicted hash %v from pod %s\n", hash, pod)
-
         i.mu.Lock()
         defer i.mu.Unlock()
-        print("enter eviction")
         // Remove the pod from the hash→pods map
         if podSet, ok := i.hashToPods[hash]; ok {
             delete(podSet, pod)
             if len(podSet) == 0 {
                 delete(i.hashToPods, hash)
-            } else {
-                i.hashToPods[hash] = podSet
             }
         }
-        print("After eviction")
     }
 }

@@ -126,14 +123,14 @@ func (i *indexer) ReportLRUSize(interval time.Duration) {
         i.mu.RLock()
         totalEntries := 0
         maxPodEntries := 0
-        maxPodName := ""
+        maxPodName := ServerID{}

-        for podName, lruCache := range i.podToLRU {
+        for pod, lruCache := range i.podToLRU {
             size := lruCache.Len()
             totalEntries += size
             if size > maxPodEntries {
                 maxPodEntries = size
-                maxPodName = podName
+                maxPodName = pod
             }
         }
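The change above hinges on two details: a struct whose fields are all comparable (ServerID) can be used directly as a Go map key, and lru.NewWithEvict can fail (only for a non-positive size), which is why its error is now propagated instead of being discarded. Below is a minimal standalone sketch of both, not part of this commit; ServerID and BlockHash here are local stand-ins for the package types.

package main

import (
    "fmt"

    lru "github.com/hashicorp/golang-lru/v2"
)

// Local stand-ins mirroring the package types used in the diff above.
type ServerID struct{ Namespace, Name string }
type BlockHash uint64

func main() {
    // ServerID has only comparable fields, so it is a valid map key; the
    // indexer can therefore key podToLRU by ServerID instead of pod.String().
    podToLRU := map[ServerID]*lru.Cache[BlockHash, struct{}]{}

    pod := ServerID{Namespace: "default", Name: "server1"}
    cache, err := lru.NewWithEvict[BlockHash, struct{}](2, func(h BlockHash, _ struct{}) {
        fmt.Printf("evicted hash %v for pod %s/%s\n", h, pod.Namespace, pod.Name)
    })
    if err != nil {
        // NewWithEvict only fails for a non-positive size; the commit now
        // surfaces this error instead of dropping it with `_`.
        panic(err)
    }
    podToLRU[pod] = cache

    for h := BlockHash(1); h <= 3; h++ {
        cache.Add(h, struct{}{}) // the third Add evicts BlockHash(1), firing the callback
    }
    fmt.Println(podToLRU[pod].Len()) // 2
}

Keying by ServerID also keeps podToLRU and hashToPods on the same identity, so the eviction callback can delete the pod from hashToPods without converting between string and struct forms.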

pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go

Lines changed: 3 additions & 6 deletions
@@ -26,21 +26,18 @@ func TestIndexer_AddAndGet(t *testing.T) {

     hash1 := BlockHash(1)
     server := ServerID{Namespace: "default", Name: "server1"}
-    serverName := server.String()
     // Add an entry to the cache
     i.Add([]BlockHash{hash1}, server)
     // Retrieve the entry
-    assert.Equal(t, 1, i.podToLRU[serverName].Len(), "Cache size should be 1 after adding an entry")
+    assert.Equal(t, 1, i.podToLRU[server].Len(), "Cache size should be 1 after adding an entry")
     servers := i.Get(hash1)
     assert.Contains(t, servers, server, "Cache should contain the added server")

     // Add another entry to the cache, the cache size should be incremented to 2.
     i.Add([]BlockHash{BlockHash(2)}, server)
-    assert.Equal(t, 2, i.podToLRU[serverName].Len(), "Cache size should be 2 after adding an entry")
+    assert.Equal(t, 2, i.podToLRU[server].Len(), "Cache size should be 2 after adding an entry")

     // Add another entry to the cache, which should evict the first one due to max size.
-    print("before Add")
     i.Add([]BlockHash{BlockHash(3)}, server)
-    print("after ADD")
-    assert.Equal(t, 2, i.podToLRU[serverName].Len(), "Cache size should still be 2 after adding an entry")
+    assert.Equal(t, 2, i.podToLRU[server].Len(), "Cache size should still be 2 after adding an entry")
 }
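A possible companion test for the new error path, not part of this commit; it assumes the indexer under test is built with newIndexer as in the existing test and that testify's assert is already imported.

func TestIndexer_AddEmptyPodName(t *testing.T) {
    i := newIndexer(2) // assumed to mirror the setup of the existing test

    // Add now reports an error instead of silently returning when the pod name is empty.
    err := i.Add([]BlockHash{BlockHash(1)}, ServerID{Namespace: "default"})
    assert.Error(t, err, "Add should fail when the pod name is empty")
    assert.Empty(t, i.Get(BlockHash(1)), "nothing should be cached for a rejected pod")
}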

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go

Lines changed: 12 additions & 8 deletions
@@ -44,11 +44,11 @@ const (
     // A small capacity ensures a high accuracy of cache hit on the model server, but it will
     // increase the chance of false negatives. A high capacity does the opposite.
     // To properly size this, consider the sum of the total number of cache entries on all model
-    // servers. Consider the llama3 8B model on 8 H100 80GB GPUs. The size of the model weight is
-    // about 16GB. Assume 50% of the remaining HBM is used for caching prefixes, we have 32GB. Each
-    // token is about 128KB in size, so we can cache 250K tokens. Using the default block size of 16
-    // in vLLM, we will have 250K / 16 = 15.6K blocks.
-    DefaultLRUCapacityPerServer = 15000
+    // servers. Consider the llama3 8B model on a single H100 80GB GPU. The size of the model weight is
+    // about 16GB. The remaining HBM used for caching prefixes is 64GB. Each
+    // token is about 128KB in size, so we can cache 500K tokens. Using the default block size of 16
+    // in vLLM, we will have 500K / 16 = 31.25K blocks.
+    DefaultLRUCapacityPerServer = 31250
 )

 type Config struct {
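A quick back-of-the-envelope check of the sizing comment above, as a sketch; the 80GB HBM, 16GB weights, and ~128KB-per-token figures are the assumptions stated in that comment (decimal units), not measured values.

package main

import "fmt"

func main() {
    const (
        hbmGB          = 80  // H100 HBM per GPU (assumption from the comment above)
        weightsGB      = 16  // llama3 8B model weights
        kbPerToken     = 128 // ~128KB of KV cache per token (assumption from the comment)
        tokensPerBlock = 16  // vLLM default block size
    )
    freeKB := (hbmGB - weightsGB) * 1_000_000 // 64GB expressed in KB, decimal units as the comment uses
    tokens := freeKB / kbPerToken             // 500,000 tokens
    fmt.Println(tokens / tokensPerBlock)      // 31250 blocks, matching DefaultLRUCapacityPerServer
}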
@@ -58,7 +58,7 @@ type Config struct {
     // MaxPrefixBlocksToMatch is the maximum number of prefix blocks to match. Input beyond this limit will
     // be ignored.
     MaxPrefixBlocksToMatch int
-    // Max (approximate) size of the LRU indexer in number of entries per server (pod).
+    // Max capacity of the LRU indexer in number of entries per server (pod).
     LRUCapacityPerServer int
 }
@@ -72,7 +72,7 @@ type podSet map[ServerID]struct{}

 type Indexer interface {
     Get(hash BlockHash) podSet
-    Add(hashes []BlockHash, server ServerID)
+    Add(hashes []BlockHash, server ServerID) error
 }

 // BlockHash is a hash of the block of request body.
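Since Add now returns an error, anything implementing the Indexer interface (including test doubles) has to adopt the new signature. A hypothetical minimal fake, not part of this commit, written against the ServerID, BlockHash, and podSet types defined in this package.

// fakeIndexer is a hypothetical test double for the updated Indexer interface.
type fakeIndexer struct {
    added map[ServerID][]BlockHash
    err   error // error to return from Add, if any
}

func (f *fakeIndexer) Get(hash BlockHash) podSet { return podSet{} }

func (f *fakeIndexer) Add(hashes []BlockHash, server ServerID) error {
    if f.err != nil {
        return f.err // simulate an indexer failure, e.g. to exercise PostCycle's error path
    }
    if f.added == nil {
        f.added = make(map[ServerID][]BlockHash)
    }
    f.added[server] = append(f.added[server], hashes...)
    return nil
}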
@@ -165,7 +165,11 @@ func (m *Plugin) PostCycle(ctx context.Context, cycleState *types.CycleState, re
         log.FromContext(ctx).Error(err, "failed to read prefix plugin cycle state")
         return
     }
-    m.indexer.Add(state.PrefixHashes, ServerID(targetPod.NamespacedName))
+    err = m.indexer.Add(state.PrefixHashes, ServerID(targetPod.NamespacedName))
+    if err != nil {
+        log.FromContext(ctx).Error(err, "failed to add prefix hashes to indexer for target pod", "pod", targetPod.NamespacedName)
+        return
+    }
     total := len(state.PrefixHashes)
     matchLen := state.PrefixCacheServers[ServerID(targetPod.NamespacedName)]
     metrics.RecordPrefixCacheMatch(matchLen*m.HashBlockSize, total*m.HashBlockSize)
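The log call above goes through logr (which controller-runtime's log.FromContext returns), whose Error method takes the error, a constant message, and alternating key/value pairs. A standalone sketch of that convention; stdr is used here only to obtain a concrete logr.Logger, and the values are made up for illustration.

package main

import (
    "errors"
    stdlog "log"
    "os"

    "github.com/go-logr/stdr"
)

func main() {
    // stdr.New wraps a standard *log.Logger into a logr.Logger.
    logger := stdr.New(stdlog.New(os.Stderr, "", stdlog.LstdFlags))

    // Error(err, msg, key, value, ...): keep the message constant and attach
    // variable data as key/value pairs rather than concatenating it into the message.
    logger.Error(errors.New("pod name is empty"),
        "failed to add prefix hashes to indexer", "pod", "default/server1")
}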
