Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from superbench.benchmarks import BenchmarkRegistry, ReturnCode
from superbench.common.devices import GPU
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
from superbench.common.utils import network


class IBBenchmark(MicroBenchmarkWithInvoke):
Expand Down Expand Up @@ -43,6 +44,13 @@ def add_parser_arguments(self):
required=False,
help='The IB device, e.g., mlx5_0, mlx5_$LOCAL_RANK, mlx5_$((LOCAL_RANK/2)), etc.',
)
self._parser.add_argument(
'--set_ib_devices',
action='store_true',
default=False,
help='Set irregular IB devices automatically according to the local rank. \
If IB devices are not able to be probed, use env IB_DEVICES to set them manually.',
)
self._parser.add_argument(
'--gpu_dev',
type=str,
Expand Down Expand Up @@ -282,6 +290,16 @@ def __prepare_general_ib_command_params(self, msg_size, device='cpu'):
return False
# Generate ib command params
command_params = f'-F -n {self._args.iters} -d {self._args.ib_dev} {msg_size} {gpu_dev}'
if self._args.set_ib_devices:
ib_devices = network.get_ib_devices()
local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK', 0))
if local_rank >= len(ib_devices):
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
logger.error(
f'Local rank {local_rank} exceeds IB devices ({len(ib_devices)}) - benchmark: {self._name}'
)
return False
command_params = f'-F -n {self._args.iters} -d {ib_devices[local_rank].split(":")[0]} {msg_size} {gpu_dev}'
command_params = f'{command_params.strip()} --report_gbits'
return command_params

Expand Down
3 changes: 3 additions & 0 deletions superbench/common/utils/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import socket
import re
import os
from pathlib import Path


Expand All @@ -31,6 +32,8 @@ def get_ib_devices():
Return:
ib_devices_port (list): IB devices with available ports in current system.
"""
if os.getenv('IB_DEVICES', None):
return os.getenv('IB_DEVICES').split(',')
devices = list(p.name for p in Path('/sys/class/infiniband').glob('*'))
ib_devices_port_dict = {}
for device in devices:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,20 @@ def test_ib_traffic_performance(self, mock_gpu):
ret = benchmark._preprocess()
assert (ret is True)

os.environ['IB_DEVICES'] = 'mlx5_ibx0,mlx5_ibx1,mlx5_ibx2'
parameters = '--set_ib_devices --iters 2000 --pattern one-to-one --hostfile hostfile'
benchmark = benchmark_class(benchmark_name, parameters=parameters)
ret = benchmark._preprocess()
assert (ret is True)
expect_command = "ib_validation --send_cmd_prefix '" + benchmark._args.bin_dir + \
"/ib_write_bw -F -n 2000 -d mlx5_ibx0 -s 8388608 --report_gbits'" + \
f" --recv_cmd_prefix '{benchmark._args.bin_dir}/ib_write_bw -F -n 2000" + \
" -d mlx5_ibx0 -s 8388608 --report_gbits' " + \
f'--timeout 120 --hostfile hostfile --input_config {os.getcwd()}/config.txt'
command = benchmark._bin_name + benchmark._commands[0].split(benchmark._bin_name)[1]
assert (command == expect_command)
os.environ.pop('IB_DEVICES')

# Generate config
parameters = '--ib_dev "$(echo mlx5_0)" --iters 2000 --msg_size 33554432 --hostfile hostfile'
benchmark = benchmark_class(benchmark_name, parameters=parameters)
Expand Down
Loading