@@ -277,7 +277,7 @@ class GPUMonitor:
277277 def __init__ (self , sample_interval : float = 0.1 , logger : logging .Logger = None ):
278278 self .sample_interval = sample_interval
279279 self .logger = logger or logging .getLogger (__name__ )
280- self .monitoring = False
280+ self .stop_event = threading . Event ()
281281 self .thread = None
282282 self .gpu_utilization = []
283283 self .gpu_memory_used = []
@@ -308,10 +308,12 @@ def start(self):
308308 self .logger .debug ("GPU monitoring disabled: no GPUs available" )
309309 return
310310
311- self .monitoring = True
311+ # Clear the stop event to enable monitoring
312+ self .stop_event .clear ()
312313 self .gpu_utilization = []
313314 self .gpu_memory_used = []
314315 self .timestamps = []
316+ self .warning_logged = False # Reset warning flag for new monitoring session
315317 self .thread = threading .Thread (target = self ._monitor_loop )
316318 self .thread .start ()
317319 self .logger .debug ("GPU monitoring started" )
@@ -324,7 +326,8 @@ def stop_and_collect(self) -> Union[WithGPU, NoGPU]:
324326 gpu_monitoring_reason = "no_gpus_available"
325327 )
326328
327- self .monitoring = False
329+ # Signal the monitoring thread to stop
330+ self .stop_event .set ()
328331 if self .thread :
329332 self .thread .join ()
330333
@@ -348,11 +351,12 @@ def stop_and_collect(self) -> Union[WithGPU, NoGPU]:
348351 )
349352
350353 def _monitor_loop (self ):
351- """Background monitoring loop."""
354+ """Background monitoring loop using threading.Event for communication ."""
352355 consecutive_failures = 0
353356 max_consecutive_failures = 5
354357
355- while self .monitoring :
358+ # Continue monitoring until stop_event is set
359+ while not self .stop_event .is_set ():
356360 try :
357361 gpu_stats = gpustat .GPUStatCollection .new_query ()
358362 if gpu_stats and len (gpu_stats ) > 0 :
@@ -372,8 +376,12 @@ def _monitor_loop(self):
372376 if consecutive_failures >= max_consecutive_failures and not self .warning_logged :
373377 self .logger .warning (f"GPU monitoring failed after { max_consecutive_failures } attempts: { e } " )
374378 self .warning_logged = True
375-
376- time .sleep (self .sample_interval )
379+
380+ # Use Event.wait() with timeout instead of time.sleep()
381+ # This allows for immediate response to stop signal while still maintaining sample interval
382+ if self .stop_event .wait (timeout = self .sample_interval ):
383+ # Event was set, break out of loop immediately
384+ break
377385
378386
379387def get_hardware_info () -> HardwareInfo :
0 commit comments