Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 24 additions & 17 deletions jumper_extension/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@ def _initialize_dataframe(self, level):
columns = [
"time",
"memory",
"io_read_count",
"io_write_count",
"io_read",
"io_write",
"io_read_count_cum",
"io_write_count_cum",
"io_read_cum",
"io_write_cum",
"cpu_util_avg",
"cpu_util_min",
"cpu_util_max",
Expand Down Expand Up @@ -84,10 +84,24 @@ def view(self, level="process", slice_=None, cell_history=None):
"""View data for a specific level with optional slicing."""
self._validate_level(level)
base = (
self.data[level]
self.data[level].copy()
if slice_ is None
else self.data[level].iloc[slice_[0] : slice_[1] + 1]
else self.data[level].iloc[slice_[0] : slice_[1] + 1].copy()
)

# Compute IO rates from cumulative values using diff()
if len(base) > 0:
time_diffs = base["time"].diff()

# Calculate rates (ops/s and MB/s)
base["io_read_count"] = (base["io_read_count_cum"].diff() / time_diffs).clip(lower=0).fillna(0)
base["io_write_count"] = (base["io_write_count_cum"].diff() / time_diffs).clip(lower=0).fillna(0)
base["io_read"] = (base["io_read_cum"].diff() / time_diffs / (1024**2)).clip(lower=0).fillna(0)
base["io_write"] = (base["io_write_cum"].diff() / time_diffs / (1024**2)).clip(lower=0).fillna(0)

# Drop cumulative columns from view
base = base.drop(columns=["io_read_count_cum", "io_write_count_cum", "io_read_cum", "io_write_cum"])

return (
self._attach_cell_index(base, cell_history)
if cell_history is not None
Expand All @@ -110,20 +124,13 @@ def add_sample(
self.num_system_cpus if level == "system" else self.num_cpus
)

last_timestamp = 0
if len(self.data[level]):
last_timestamp = self.data[level].loc[len(self.data[level]) - 1][
"time"
]

cumulative_metrics_ratio = time_mark - last_timestamp
row_data = {
"time": time_mark,
"memory": memory,
"io_read_count": io_counters[0] / cumulative_metrics_ratio,
"io_write_count": io_counters[1] / cumulative_metrics_ratio,
"io_read": io_counters[2] / cumulative_metrics_ratio,
"io_write": io_counters[3] / cumulative_metrics_ratio,
"io_read_count_cum": io_counters[0],
"io_write_count_cum": io_counters[1],
"io_read_cum": io_counters[2],
"io_write_cum": io_counters[3],
"cpu_util_avg": sum(cpu_util_per_core) / effective_num_cpus,
"cpu_util_min": min(cpu_util_per_core),
"cpu_util_max": max(cpu_util_per_core),
Expand Down
41 changes: 41 additions & 0 deletions jumper_extension/monitor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import json
import logging
import os
import threading
import time
import unittest.mock
from pathlib import Path

import psutil

Expand All @@ -19,6 +21,9 @@
detect_memory_limit,
)

# Global path to visualizer configuration file
VISUALIZER_CONFIG_PATH = Path(__file__).parent / "visualizer_config.json"

logger = logging.getLogger("extension")

# GPU monitoring setup
Expand Down Expand Up @@ -366,3 +371,39 @@ def stop(self):
seconds=time.perf_counter() - self.start_time
)
)

def get_visualizer_config(self):
"""Load and patch visualizer configuration."""
with open(VISUALIZER_CONFIG_PATH, 'r') as f:
config = json.load(f)

# Recursively patch template values in the config
def patch_values(obj):
if isinstance(obj, dict):
return {k: patch_values(v) for k, v in obj.items()}
elif isinstance(obj, list):
return [patch_values(item) for item in obj]
elif isinstance(obj, str):
# Replace template variables with actual values
result = obj
result = result.replace('{{num_cpus}}', str(self.num_cpus))
result = result.replace('{{num_gpus}}', str(self.num_gpus))
result = result.replace('{{gpu_memory}}', str(self.gpu_memory))

# Patch memory limits for all levels
for level in ['process', 'user', 'system', 'slurm']:
limit = self.memory_limits.get(level, 0)
result = result.replace(f'{{{{memory_{level}}}}}', str(limit))

# Try to convert back to number if it's a numeric string
try:
if '.' in result:
return float(result)
else:
return int(result)
except ValueError:
return result
else:
return obj

return patch_values(config)
Loading