Skip to content

Commit 9f2fafc

Browse files
committed
Use Events for thread comms
1 parent bf780ac commit 9f2fafc

File tree

1 file changed

+15
-7
lines changed

1 file changed

+15
-7
lines changed

benchmark_v2/benchmark_framework.py

Lines changed: 15 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -277,7 +277,7 @@ class GPUMonitor:
277277
def __init__(self, sample_interval: float = 0.1, logger: logging.Logger = None):
278278
self.sample_interval = sample_interval
279279
self.logger = logger or logging.getLogger(__name__)
280-
self.monitoring = False
280+
self.stop_event = threading.Event()
281281
self.thread = None
282282
self.gpu_utilization = []
283283
self.gpu_memory_used = []
@@ -308,10 +308,12 @@ def start(self):
308308
self.logger.debug("GPU monitoring disabled: no GPUs available")
309309
return
310310

311-
self.monitoring = True
311+
# Clear the stop event to enable monitoring
312+
self.stop_event.clear()
312313
self.gpu_utilization = []
313314
self.gpu_memory_used = []
314315
self.timestamps = []
316+
self.warning_logged = False # Reset warning flag for new monitoring session
315317
self.thread = threading.Thread(target=self._monitor_loop)
316318
self.thread.start()
317319
self.logger.debug("GPU monitoring started")
@@ -324,7 +326,8 @@ def stop_and_collect(self) -> Union[WithGPU, NoGPU]:
324326
gpu_monitoring_reason="no_gpus_available"
325327
)
326328

327-
self.monitoring = False
329+
# Signal the monitoring thread to stop
330+
self.stop_event.set()
328331
if self.thread:
329332
self.thread.join()
330333

@@ -348,11 +351,12 @@ def stop_and_collect(self) -> Union[WithGPU, NoGPU]:
348351
)
349352

350353
def _monitor_loop(self):
351-
"""Background monitoring loop."""
354+
"""Background monitoring loop using threading.Event for communication."""
352355
consecutive_failures = 0
353356
max_consecutive_failures = 5
354357

355-
while self.monitoring:
358+
# Continue monitoring until stop_event is set
359+
while not self.stop_event.is_set():
356360
try:
357361
gpu_stats = gpustat.GPUStatCollection.new_query()
358362
if gpu_stats and len(gpu_stats) > 0:
@@ -372,8 +376,12 @@ def _monitor_loop(self):
372376
if consecutive_failures >= max_consecutive_failures and not self.warning_logged:
373377
self.logger.warning(f"GPU monitoring failed after {max_consecutive_failures} attempts: {e}")
374378
self.warning_logged = True
375-
376-
time.sleep(self.sample_interval)
379+
380+
# Use Event.wait() with timeout instead of time.sleep()
381+
# This allows for immediate response to stop signal while still maintaining sample interval
382+
if self.stop_event.wait(timeout=self.sample_interval):
383+
# Event was set, break out of loop immediately
384+
break
377385

378386

379387
def get_hardware_info() -> HardwareInfo:

0 commit comments

Comments (0)