[Dashboard] [Jobs] dashboard.py memory growth when submitting many jobs #30152
What happened + What you expected to happen
When running a basic script that submits a batch of 4 trivial "echo" jobs every 4 seconds, we observe the dashboard.py process's memory growing by roughly 0.5 GB per hour. The script is taken from the long_running_many_jobs test in #30110.
Versions / Dependencies
Ray master, macOS and Linux, Python 3.8
Reproduction script
Test script:
import argparse
import json
import os
import time
import random
from typing import List, Optional

from ray.dashboard.modules.job.common import JobStatus
from ray.dashboard.modules.job.pydantic_models import JobDetails

import ray
from ray.job_submission import JobSubmissionClient

NUM_CLIENTS = 1
NUM_JOBS_PER_BATCH = 4

SMOKE_TEST_TIMEOUT = 10 * 60  # 10 minutes
FULL_TEST_TIMEOUT = 8 * 60 * 60  # 8 hours


def submit_batch_jobs(
    clients: List[JobSubmissionClient],
    num_jobs: int,
    timeout_s: int = 10 * 60,
    retry_interval_s: int = 1,
) -> bool:
    # Submit `num_jobs` trivial jobs, cycling through the available clients.
    job_ids = []
    for i in range(num_jobs):
        # Cycle through clients arbitrarily
        client = clients[i % len(clients)]
        job_id = client.submit_job(
            entrypoint="echo hello",
        )
        job_ids.append(job_id)
        print(f"submitted job: {job_id}")
        time.sleep(4)
    return True


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing."
    )
    parser.add_argument("--num-clients", type=int, default=NUM_CLIENTS)
    parser.add_argument("--num-jobs-per-batch", type=int, default=NUM_JOBS_PER_BATCH)
    args = parser.parse_args()

    if args.smoke_test:
        print(f"Running smoke test with timeout {SMOKE_TEST_TIMEOUT} seconds")
        timeout = SMOKE_TEST_TIMEOUT
    else:
        print(f"Running full test (timeout: {FULL_TEST_TIMEOUT}s)")
        timeout = FULL_TEST_TIMEOUT

    start = time.time()
    ray.init()
    clients = [JobSubmissionClient() for _ in range(args.num_clients)]

    batch_counter = 0
    while time.time() - start < timeout:
        batch_counter += 1
        print(f"Submitting batch {batch_counter}...")
        # Submit a batch of jobs
        if not submit_batch_jobs(clients, args.num_jobs_per_batch):
            print("FAILED")
            exit(1)
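
The script imports JobStatus but never checks job state after submission. To confirm that each batch of echo jobs actually reaches a terminal state (so the observed growth cannot be attributed to jobs still queued or running), a small polling helper along these lines could be added. This is a minimal sketch, not part of the original test; wait_for_jobs is a hypothetical helper and assumes the same JobSubmissionClient objects as above.

import time
from typing import List

from ray.dashboard.modules.job.common import JobStatus
from ray.job_submission import JobSubmissionClient


def wait_for_jobs(
    client: JobSubmissionClient,
    job_ids: List[str],
    timeout_s: int = 10 * 60,
    retry_interval_s: int = 1,
) -> bool:
    # Block until every submitted job reaches a terminal state,
    # or return False if the timeout expires first.
    terminal = {JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.STOPPED}
    deadline = time.time() + timeout_s
    pending = set(job_ids)
    while pending and time.time() < deadline:
        for job_id in list(pending):
            if client.get_job_status(job_id) in terminal:
                pending.discard(job_id)
        time.sleep(retry_interval_s)
    return not pending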
Memory logging script:
#!/bin/bash
# Print and log the memory usage of dashboard.py every 10 seconds.
# Usage: ./log_memory.sh
# Output is printed to stdout and written to memory.log.
# Get pid of dashboard.py
pid=$(ps aux | grep dashboard.py | grep -v grep | awk '{print $2}')
echo "PID: $pid"
echo "Logging memory usage of process $pid every 10 seconds."
echo "Press Ctrl+C to stop."
echo "Memory (MB) every 10 seconds" > memory.log
while true; do
mem=$(ps -o rss= -p $pid | awk '{print $1/1024}')
echo "$mem" >> memory.log
echo $mem
sleep 10
done
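
If a shell loop is inconvenient, the same RSS sampling can be done from Python with psutil. This is an illustrative alternative, not part of the original report; it assumes psutil is installed and that exactly one dashboard.py process is running on the node.

import time

import psutil  # third-party; install with `pip install psutil`


def find_dashboard_pid() -> int:
    # Assumes a single dashboard.py process; returns the first match found.
    for proc in psutil.process_iter(["pid", "cmdline"]):
        cmdline = proc.info["cmdline"] or []
        if any("dashboard.py" in part for part in cmdline):
            return proc.info["pid"]
    raise RuntimeError("dashboard.py process not found")


if __name__ == "__main__":
    proc = psutil.Process(find_dashboard_pid())
    with open("memory.log", "w") as f:
        f.write("Memory (MB) every 10 seconds\n")
        while True:
            # Resident set size of the dashboard process, in MB.
            mem_mb = proc.memory_info().rss / (1024 * 1024)
            print(f"{mem_mb:.1f}")
            f.write(f"{mem_mb:.1f}\n")
            f.flush()
            time.sleep(10)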
Issue Severity
High: It blocks me from completing my task.