@@ -47,12 +47,23 @@ type Supervisor struct {
4747 inspectionExemptions map [string ]time.Time
4848 inspectionExemptionsMu sync.RWMutex
4949
50+ // Circuit breaker for inspection failures (Phase 2: Issue #105 stability)
51+ inspectionFailures map [string ]* inspectionFailureInfo
52+ inspectionFailuresMu sync.RWMutex
53+
5054 // Lifecycle
5155 ctx context.Context
5256 cancel context.CancelFunc
5357 wg sync.WaitGroup
5458}
5559
60+ // inspectionFailureInfo tracks inspection failures for circuit breaker pattern
61+ type inspectionFailureInfo struct {
62+ consecutiveFailures int
63+ lastFailureTime time.Time
64+ cooldownUntil time.Time
65+ }
66+
5667// UpstreamInterface defines the interface for upstream adapters.
5768type UpstreamInterface interface {
5869 AddServer (name string , cfg * config.ServerConfig ) error
@@ -84,6 +95,7 @@ func New(configSvc *configsvc.Service, upstream UpstreamInterface, logger *zap.L
8495 eventCh : make (chan Event , 500 ), // Phase 6: Increased buffer for async operations
8596 listeners : make ([]chan Event , 0 ),
8697 inspectionExemptions : make (map [string ]time.Time ),
98+ inspectionFailures : make (map [string ]* inspectionFailureInfo ),
8799 ctx : ctx ,
88100 cancel : cancel ,
89101 }
@@ -929,3 +941,119 @@ func (s *Supervisor) IsInspectionExempted(serverName string) bool {
929941
930942 return true
931943}
944+
945+ // ===== Circuit Breaker for Inspection Failures (Issue #105) =====
946+
947+ const (
948+ maxInspectionFailures = 3 // Max consecutive failures before cooldown
949+ inspectionCooldown = 5 * time .Minute // Cooldown duration after max failures
950+ failureResetTimeout = 10 * time .Minute // Reset counter if no failures for this long
951+ )
952+
953+ // CanInspect checks if inspection is allowed for a server (circuit breaker)
954+ // Returns (allowed bool, reason string, cooldownRemaining time.Duration)
955+ func (s * Supervisor ) CanInspect (serverName string ) (bool , string , time.Duration ) {
956+ s .inspectionFailuresMu .RLock ()
957+ defer s .inspectionFailuresMu .RUnlock ()
958+
959+ info , exists := s .inspectionFailures [serverName ]
960+ if ! exists {
961+ // No failure history - allow inspection
962+ return true , "" , 0
963+ }
964+
965+ now := time .Now ()
966+
967+ // Check if cooldown is active
968+ if now .Before (info .cooldownUntil ) {
969+ remaining := info .cooldownUntil .Sub (now )
970+ reason := fmt .Sprintf ("Server '%s' has failed inspection %d times. Circuit breaker active - please wait %v before retrying. This prevents cascading failures with unstable servers (see issue #105)." ,
971+ serverName , info .consecutiveFailures , remaining .Round (time .Second ))
972+ return false , reason , remaining
973+ }
974+
975+ // Check if failures should be reset (no failures for failureResetTimeout)
976+ if now .Sub (info .lastFailureTime ) > failureResetTimeout {
977+ // Failures are old - will be reset on next inspection
978+ return true , "" , 0
979+ }
980+
981+ // Within failure window but not in cooldown
982+ return true , "" , 0
983+ }
984+
985+ // RecordInspectionFailure records an inspection failure for circuit breaker
986+ func (s * Supervisor ) RecordInspectionFailure (serverName string ) {
987+ s .inspectionFailuresMu .Lock ()
988+ defer s .inspectionFailuresMu .Unlock ()
989+
990+ now := time .Now ()
991+
992+ info , exists := s .inspectionFailures [serverName ]
993+ if ! exists {
994+ info = & inspectionFailureInfo {}
995+ s .inspectionFailures [serverName ] = info
996+ }
997+
998+ // Reset counter if last failure was too long ago
999+ if now .Sub (info .lastFailureTime ) > failureResetTimeout {
1000+ info .consecutiveFailures = 0
1001+ }
1002+
1003+ info .consecutiveFailures ++
1004+ info .lastFailureTime = now
1005+
1006+ s .logger .Warn ("Inspection failure recorded" ,
1007+ zap .String ("server" , serverName ),
1008+ zap .Int ("consecutive_failures" , info .consecutiveFailures ),
1009+ zap .Int ("max_before_cooldown" , maxInspectionFailures ))
1010+
1011+ // Activate cooldown if max failures reached
1012+ if info .consecutiveFailures >= maxInspectionFailures {
1013+ info .cooldownUntil = now .Add (inspectionCooldown )
1014+ s .logger .Error ("⚠️ Inspection circuit breaker activated - too many failures" ,
1015+ zap .String ("server" , serverName ),
1016+ zap .Int ("failures" , info .consecutiveFailures ),
1017+ zap .Duration ("cooldown" , inspectionCooldown ),
1018+ zap .Time ("cooldown_until" , info .cooldownUntil ),
1019+ zap .String ("issue" , "#105 - preventing cascading failures" ))
1020+ }
1021+ }
1022+
1023+ // RecordInspectionSuccess records a successful inspection, resetting failure counter
1024+ func (s * Supervisor ) RecordInspectionSuccess (serverName string ) {
1025+ s .inspectionFailuresMu .Lock ()
1026+ defer s .inspectionFailuresMu .Unlock ()
1027+
1028+ info , exists := s .inspectionFailures [serverName ]
1029+ if ! exists {
1030+ return
1031+ }
1032+
1033+ if info .consecutiveFailures > 0 {
1034+ s .logger .Info ("Inspection succeeded - resetting failure counter" ,
1035+ zap .String ("server" , serverName ),
1036+ zap .Int ("previous_failures" , info .consecutiveFailures ))
1037+ }
1038+
1039+ // Reset failure counter
1040+ delete (s .inspectionFailures , serverName )
1041+ }
1042+
1043+ // GetInspectionStats returns inspection failure statistics for a server
1044+ func (s * Supervisor ) GetInspectionStats (serverName string ) (failures int , inCooldown bool , cooldownRemaining time.Duration ) {
1045+ s .inspectionFailuresMu .RLock ()
1046+ defer s .inspectionFailuresMu .RUnlock ()
1047+
1048+ info , exists := s .inspectionFailures [serverName ]
1049+ if ! exists {
1050+ return 0 , false , 0
1051+ }
1052+
1053+ now := time .Now ()
1054+ if now .Before (info .cooldownUntil ) {
1055+ return info .consecutiveFailures , true , info .cooldownUntil .Sub (now )
1056+ }
1057+
1058+ return info .consecutiveFailures , false , 0
1059+ }
0 commit comments