Skip to content

Commit e873603

Browse files
committed
add ib stream test between 2 machines
1 parent 12175f0 commit e873603

File tree

1 file changed

+86
-0
lines changed

1 file changed

+86
-0
lines changed

setup_musa/check/ib_stream_test.sh

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#!/bin/bash

# ---------------------------------------------------------------------------
# Machine settings -- edit these to match the real environment.
# ---------------------------------------------------------------------------
SERVER_A="A_IP"
SERVER_B="B_IP"
USER="actual_username"

# InfiniBand device names handed to `ib_write_bw -d`.
IB_DEVICES=(
  "mlx5_2" "mlx5_3" "mlx5_4" "mlx5_5"
  "mlx5_8" "mlx5_9" "mlx5_10" "mlx5_11"
)

# Failure log; the file name carries the current date (YYYYMMDD).
printf -v FAIL_LOG 'failures_%s.log' "$(date +%Y%m%d)"

# Colour escape sequences, stored unexpanded and rendered later by `echo -e`.
NC='\033[0m'       # reset colour
GREEN='\033[1;32m'
RED='\033[1;31m'

# Accumulates the device pairs whose test failed.
declare -a FAILED_PAIRS
17+
18+
#######################################
# Best-effort teardown: kill any leftover `ib_write_bw -d ...` processes on
# both machines. Registered as the EXIT trap, so it runs on every exit path.
# Globals:   USER, SERVER_A, SERVER_B (read)
# Arguments: none
# Outputs:   nothing; ssh/pkill output is discarded on purpose, because
#            "no process matched" is the normal case and not an error.
#######################################
cleanup() {
  local host
  for host in "$SERVER_A" "$SERVER_B"; do
    # An empty host would make ssh target "user@"; skip it instead.
    [[ -n "$host" ]] || continue
    ssh "${USER}@${host}" "pkill -f 'ib_write_bw -d'" >/dev/null 2>&1
  done
}
trap cleanup EXIT
23+
24+
#######################################
# Classify one client run as PASS or FAIL and report it.
#
# A run passes only when its log holds the "BW average" column header AND a
# later line whose first field is purely numeric (a result row). The header
# alone is not enough: a run can print the header and then abort (for example
# "Completion with error at client"), which must count as a failure.
#
# Globals:   GREEN, RED, NC (read); FAILED_PAIRS (appended to on failure);
#            FAIL_LOG (appended to on failure)
# Arguments: $1 - device used on machine A (server side)
#            $2 - device used on machine B (client side)
# Inputs:    reads client_<$1>_<$2>.log from the current directory
# Outputs:   PASS/FAIL line on stdout; on PASS also the header and up to five
#            following lines of the log
#######################################
process_result() {
  local dev_a=$1
  local dev_b=$2
  local log_file="client_${dev_a}_${dev_b}.log"

  if [[ -r "$log_file" ]] && awk '
      /BW average/                   { header_seen = 1; next }
      header_seen && $1 ~ /^[0-9]+$/ { found = 1; exit }
      END                            { exit !found }
    ' "$log_file"; then
    echo -e "${GREEN}[PASS]${NC} $dev_b -> $dev_a"
    grep -A 5 "BW average" "$log_file" | tail -6
  else
    echo -e "${RED}[FAIL]${NC} $dev_b -> $dev_a"
    FAILED_PAIRS+=("$dev_a-$dev_b")
    # Record the detailed failure log.
    {
      echo "===== 失败组合: $dev_a-$dev_b ====="
      if [[ -r "$log_file" ]]; then
        cat -- "$log_file"
      else
        # No log at all (the client never ran); say so instead of letting
        # cat print an error to the terminal.
        echo "(client log not found: $log_file)"
      fi
      echo -e "\n"
    } >> "$FAIL_LOG"
  fi
}
42+
43+
#######################################
# Main test loop: run every (machine-A device, machine-B device) combination.
#
# For each device in IB_DEVICES a receiver is started on machine A inside a
# remote `while true` loop, so it is relaunched after each client run. Every
# device on machine B is then run against it as a client, and
# process_result is called for the pair.
#
# Globals:   USER, SERVER_A, SERVER_B, IB_DEVICES (read)
# Arguments: none
# Outputs:   progress lines on stdout; writes server_<devA>.log and
#            client_<devA>_<devB>.log in the current directory
#######################################
run_ib_matrix() {
  local dev_a dev_b server_pid

  for dev_a in "${IB_DEVICES[@]}"; do
    echo "[INFO] 机器A启动持续接收服务: ${dev_a}"

    ssh "${USER}@${SERVER_A}" "while true; do ib_write_bw -d ${dev_a}; done" > "server_${dev_a}.log" &
    server_pid=$!
    # Fixed delay before the first client connects to the receiver.
    sleep 8

    for dev_b in "${IB_DEVICES[@]}"; do
      echo "[TEST] 机器B使用设备: ${dev_b} -> 机器A设备: ${dev_a}"

      # Run the test and capture the complete output. stderr is merged in so
      # that ssh and ib_write_bw error messages land in the client log (and
      # from there in the failure log) instead of only on the terminal.
      ssh "${USER}@${SERVER_B}" "ib_write_bw -d ${dev_b} ${SERVER_A} -D 5" > "client_${dev_a}_${dev_b}.log" 2>&1
      process_result "$dev_a" "$dev_b"
      # Pause between client runs.
      sleep 2
    done

    # Stop the local ssh first, then kill the remote loop and receiver by
    # command-line match.
    kill -15 "$server_pid" 2>/dev/null
    wait "$server_pid" 2>/dev/null
    ssh "${USER}@${SERVER_A}" "pkill -f 'ib_write_bw -d ${dev_a}'" >/dev/null 2>&1
  done
}

run_ib_matrix
66+
67+
#######################################
# Summary of failed combinations: print it and append it to the failure log.
#
# Globals:   FAILED_PAIRS (read); FAIL_LOG (appended to when pairs failed);
#            GREEN, RED, NC (read)
# Arguments: none
# Outputs:   summary on stdout
# Returns:   0 when no pair failed, 1 otherwise. The call below is the last
#            command of the script, so this value also becomes the script's
#            exit status and callers can detect failed runs.
#######################################
summarize_failures() {
  local pair

  if [[ ${#FAILED_PAIRS[@]} -eq 0 ]]; then
    echo -e "${GREEN}所有组合测试成功!${NC}"
    return 0
  fi

  echo -e "\n${RED}===== 失败组合总结 =====${NC}"
  echo "${#FAILED_PAIRS[@]} 组失败:"
  for pair in "${FAILED_PAIRS[@]}"; do
    echo -e "${RED} $pair${NC}"
  done

  # Record the summary in the log file as well.
  {
    echo -e "\n===== $(date) 失败组合汇总 ====="
    printf "%s\n" "${FAILED_PAIRS[@]}"
  } >> "$FAIL_LOG"
  echo -e "\n详细日志见: ${RED}$FAIL_LOG${NC}"

  return 1
}

summarize_failures

0 commit comments

Comments
 (0)