robbyt · robbyt · May 28, 2025 · May 28, 2025
diff --git a/runnables/httpcluster/runner.go b/runnables/httpcluster/runner.go
@@ -409,9 +409,13 @@ func (r *Runner) startServers(
 
 		// Wait for server to be ready
 		if !r.waitForIsRunning(ctx, r.deadlineServerStart, runner) {
-			logger.Error("Server failed to become ready")
+			logger.Error("Server failed to become ready",
+				"timeout", r.deadlineServerStart,
+				"state", runner.GetState())
 			// Cancel the server context to clean up
 			serverCancel()
+			// Stop the runner explicitly to ensure cleanup
+			runner.Stop()
 			// Remove entry completely since server failed to start
 			current = current.removeEntry(id)
 			continue

diff --git a/runnables/httpcluster/runner_restart_test.go b/runnables/httpcluster/runner_restart_test.go
@@ -343,13 +343,12 @@ func TestConcurrentRestartStress(t *testing.T) {
 	}, 5*time.Second, 10*time.Millisecond, "Both servers should be running after concurrent restarts")
 }
 
-// BenchmarkServerRestarts measures performance of server restarts
+// BenchmarkServerRestarts measures performance of server restarts with proper delays
 func BenchmarkServerRestarts(b *testing.B) {
 	benchmarks := []struct {
 		name         string
 		restartDelay time.Duration
 	}{
-		{"NoDelay", 0},
 		{"5msDelay", 5 * time.Millisecond},
 		{"10msDelay", 10 * time.Millisecond},
 	}
@@ -390,31 +389,122 @@ func BenchmarkServerRestarts(b *testing.B) {
 
 			b.ResetTimer()
 
+			successfulRestarts := 0
 			// Benchmark restarts
 			for i := 0; i < b.N; i++ {
 				newConfig := createRestartTestHTTPConfig(b, port, fmt.Sprintf("bench-%d", i))
 				runner.configSiphon <- map[string]*httpserver.Config{
 					serverID: newConfig,
 				}
 
-				// Wait for restart to complete
-				for {
+				// Wait for restart to complete with timeout
+				restartSuccess := false
+				for attempts := 0; attempts < 100; attempts++ {
 					runner.mu.RLock()
 					entry := runner.currentEntries.get(serverID)
 					if entry != nil && entry.runner != nil && entry.runner.IsRunning() {
 						runner.mu.RUnlock()
+						restartSuccess = true
 						break
 					}
 					runner.mu.RUnlock()
 					time.Sleep(time.Millisecond)
 				}
+
+				if restartSuccess {
+					successfulRestarts++
+				} else {
+					b.Fatalf("Restart %d failed to complete within timeout", i)
+				}
 			}
 
 			b.StopTimer()
+
+			// Verify all restarts succeeded
+			if successfulRestarts != b.N {
+				b.Fatalf("Expected %d successful restarts, got %d", b.N, successfulRestarts)
+			}
+
+			// Verify final server count
+			if runner.GetServerCount() != 1 {
+				b.Fatalf("Expected 1 server after benchmark, got %d", runner.GetServerCount())
+			}
 		})
 	}
 }
 
+// BenchmarkServerRestartsNoDelay specifically tests rapid restarts with no delay
+// This benchmark expects failures due to port binding conflicts
+func BenchmarkServerRestartsNoDelay(b *testing.B) {
+	runner, err := NewRunner(WithRestartDelay(0))
+	require.NoError(b, err)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go func() {
+		err := runner.Run(ctx)
+		require.NoError(b, err)
+	}()
+
+	// Wait for runner to start
+	require.Eventually(b, func() bool {
+		return runner.IsRunning()
+	}, 2*time.Second, 100*time.Millisecond)
+
+	// Create server
+	port := getRestartTestPort(b)
+	serverID := "bench-server"
+	initialConfig := createRestartTestHTTPConfig(b, port, "bench")
+
+	// Start server
+	runner.configSiphon <- map[string]*httpserver.Config{
+		serverID: initialConfig,
+	}
+
+	require.Eventually(b, func() bool {
+		runner.mu.RLock()
+		defer runner.mu.RUnlock()
+		entry := runner.currentEntries.get(serverID)
+		return entry != nil && entry.runner != nil && entry.runner.IsRunning()
+	}, 5*time.Second, 100*time.Millisecond)
+
+	b.ResetTimer()
+
+	successCount := 0
+	failureCount := 0
+
+	// Benchmark restarts - expect some failures
+	for i := 0; i < b.N; i++ {
+		newConfig := createRestartTestHTTPConfig(b, port, fmt.Sprintf("bench-%d", i))
+		runner.configSiphon <- map[string]*httpserver.Config{
+			serverID: newConfig,
+		}
+
+		// Check result after a brief wait
+		time.Sleep(50 * time.Millisecond)
+
+		runner.mu.RLock()
+		entry := runner.currentEntries.get(serverID)
+		if entry != nil && entry.runner != nil && entry.runner.IsRunning() {
+			successCount++
+		} else {
+			failureCount++
+			// Need to ensure we have a server for next iteration
+			runner.mu.RUnlock()
+			runner.configSiphon <- map[string]*httpserver.Config{
+				serverID: newConfig,
+			}
+			time.Sleep(100 * time.Millisecond)
+			continue
+		}
+		runner.mu.RUnlock()
+	}
+
+	b.StopTimer()
+	b.Logf("NoDelay results: %d successes, %d failures (%.1f%% success rate)",
+		successCount, failureCount, float64(successCount)*100/float64(b.N))
+}
+
 // BenchmarkConcurrentStateChecks benchmarks concurrent state check operations
 func BenchmarkConcurrentStateChecks(b *testing.B) {
 	runner, err := NewRunner(WithRestartDelay(10 * time.Millisecond))