Skip to content

Commit 0c5bf35

Browse files
authored
Merge pull request #54 from MooreThreads/add_ib_test
add ib stream test btween 2 machine
2 parents 12175f0 + 817ffca commit 0c5bf35

File tree

2 files changed

+105
-0
lines changed

2 files changed

+105
-0
lines changed

setup_musa/check/README.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# ib_stream_test.sh
2+
这个脚本主要是用来测试双机IB设备之间的打流,脚本实现的原理很简单:
3+
1. A机器固定一个接收端,B机器遍历所有IB设备进行打流,
4+
2. A机器进入下一个IB设备进行接收,重复上面的动作直到所有IB设备组合都测试完毕
5+
6+
7+
### 补充手动打流方法:
8+
1. 首先查看ib设备与网卡名对应,以及状态
9+
```shell
10+
ibdev2netdev
11+
```
12+
2. 以ib_write_bw命令为例测试打流,所有的网卡全部需要测试,这里仅以其中一个为例
13+
```shell
14+
# server端,业务IP 10.2.38.10,例如测试mlx5_9
15+
ib_write_bw -d mlx5_9
16+
17+
# client端,业务IP 10.2.38.11
18+
ib_write_bw -d mlx5_9 --report_gbits 10.2.38.10
```

setup_musa/check/ib_stream_test.sh

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#!/bin/bash
# Streams ib_write_bw traffic between the InfiniBand devices of two machines:
# machine A listens on one device while machine B sends from each device in
# turn, then A moves on to its next device.
# Passwordless SSH between the two machines is strongly recommended.

# --- Host settings: edit these to match the real environment ---------------
SERVER_A="A_IP"
SERVER_B="B_IP"
USER="actual_username"

# IB devices that are exercised on both machines.
IB_DEVICES=(
  "mlx5_2" "mlx5_3" "mlx5_4" "mlx5_5"
  "mlx5_8" "mlx5_9" "mlx5_10" "mlx5_11"
)

# Failure log; the file name carries today's date (YYYYMMDD).
FAIL_LOG="failures_$(date +%Y%m%d).log"

# --- Terminal colours -------------------------------------------------------
# Stored as literal backslash sequences; they are expanded when printed.
NC='\033[0m' # reset
GREEN='\033[1;32m'
RED='\033[1;31m'

# Device pairs ("<A device>-<B device>") whose test did not pass.
declare -a FAILED_PAIRS
18+
19+
#######################################
# Best-effort removal of leftover ib_write_bw processes on both machines.
# Registered below as the EXIT trap, so it runs whenever the script exits.
# Globals:
#   USER, SERVER_A, SERVER_B (read)
# Arguments:
#   None
# Outputs:
#   Nothing; all ssh output is discarded.
#######################################
cleanup() {
  local host
  for host in "$SERVER_A" "$SERVER_B"; do
    # The target is quoted so it always reaches ssh as a single argument.
    # Output and errors are discarded on purpose: this is best-effort, and
    # pkill exits non-zero when no process matches.
    ssh "${USER}@${host}" "pkill -f 'ib_write_bw -d'" >/dev/null 2>&1
  done
}
trap cleanup EXIT
24+
25+
#######################################
# Evaluates the client log of one device pair and reports PASS or FAIL.
# A run counts as passed when its log contains the text "BW average".
# Globals:
#   RED, GREEN, NC (read)
#   FAIL_LOG       (read; the file is appended to on failure)
#   FAILED_PAIRS   (appended to on failure)
# Arguments:
#   $1 - device used on machine A (receiver)
#   $2 - device used on machine B (sender)
# Outputs:
#   Status line on stdout; on PASS also the last (up to six) lines of the
#   "BW average" match and the lines following it.
#######################################
process_result() {
  local rx_dev="$1"
  local tx_dev="$2"
  local client_log="client_${rx_dev}_${tx_dev}.log"

  if ! grep -q "BW average" "$client_log"; then
    printf '%b\n' "${RED}[FAIL]${NC} $tx_dev -> $rx_dev"
    FAILED_PAIRS+=("$rx_dev-$tx_dev")
    # Keep the full client log of the failed pair for later inspection.
    {
      echo "===== 失败组合: $rx_dev-$tx_dev ====="
      cat "$client_log"
      echo -e "\n"
    } >> "$FAIL_LOG"
    return
  fi

  printf '%b\n' "${GREEN}[PASS]${NC} $tx_dev -> $rx_dev"
  grep -A 5 "BW average" "$client_log" | tail -6
}
43+
44+
# 主测试循环
45+
for ((i=0; i<${#IB_DEVICES[@]}; i++)); do
46+
DEV_A="${IB_DEVICES[$i]}"
47+
echo "[INFO] 机器A启动持续接收服务: ${DEV_A}"
48+
49+
ssh ${USER}@${SERVER_A} "while true; do ib_write_bw -d ${DEV_A}; done" > "server_${DEV_A}.log" &
50+
SERVER_PID=$!
51+
sleep 8
52+
53+
for ((j=0; j<${#IB_DEVICES[@]}; j++)); do
54+
DEV_B="${IB_DEVICES[$j]}"
55+
echo "[TEST] 机器B使用设备: ${DEV_B} -> 机器A设备: ${DEV_A}"
56+
57+
# 执行测试并捕获完整输出
58+
ssh ${USER}@${SERVER_B} "ib_write_bw -d ${DEV_B} ${SERVER_A} -D 5" > "client_${DEV_A}_${DEV_B}.log"
59+
process_result "$DEV_A" "$DEV_B"
60+
sleep 2
61+
done
62+
63+
kill -15 $SERVER_PID 2>/dev/null
64+
wait $SERVER_PID 2>/dev/null
65+
ssh ${USER}@${SERVER_A} "pkill -f 'ib_write_bw -d ${DEV_A}'" >/dev/null 2>&1
66+
done
67+
68+
#######################################
# Prints the final summary of the run.
# With no recorded failures a single success line is printed. Otherwise the
# failed pairs are listed on stdout and a dated summary is appended to the
# failure log.
# Globals:
#   FAILED_PAIRS, RED, GREEN, NC (read)
#   FAIL_LOG                     (read; the file is appended to)
# Arguments:
#   None
#######################################
summarize_failures() {
  local total=${#FAILED_PAIRS[@]}
  local entry

  if (( total > 0 )); then
    printf '%b\n' "\n${RED}===== 失败组合总结 =====${NC}"
    printf '%s\n' "${total} 组失败:"
    for entry in "${FAILED_PAIRS[@]}"; do
      printf '%b\n' "${RED} ${entry}${NC}"
    done

    # Persist the summary next to the per-pair details in the log file.
    {
      printf '%b\n' "\n===== $(date) 失败组合汇总 ====="
      printf '%s\n' "${FAILED_PAIRS[@]}"
    } >> "$FAIL_LOG"
    printf '%b\n' "\n详细日志见: ${RED}$FAIL_LOG${NC}"
  else
    printf '%b\n' "${GREEN}所有组合测试成功!${NC}"
  fi
}

summarize_failures

0 commit comments

Comments
 (0)