Skip to content

Commit 4665a02

Browse files
author
Hao Zhi
committed
add GPU temperature and frequency monitoring script
1 parent e41d1fe commit 4665a02

File tree

2 files changed

+91
-0
lines changed

2 files changed

+91
-0
lines changed

monitor/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
## -d deviceId(GPU 设备编号,默认 0)
2+
## -i 刷新时间(单位:秒,默认 1)
3+
## -n 记录次数(默认 -1,表示持续监控直到手动停止)
4+
5+
## 执行脚本命令
6+
./monitor_gpu.sh -d 0 -i 1 -n 10
7+
8+
## 后台运行命令
9+
nohup ./monitor_gpu.sh -d 0 -i 1 -n 10 > /dev/null 2>&1 &
10+
tail -f gpu_monitor_log.txt

monitor/monitor_gpu.sh

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/bin/bash
#
# Periodically samples one GPU through `mthreads-gmi`, appends every sample to
# a log file, and prints a red alert on the terminal whenever the parsed
# temperature exceeds TEMP_LIMIT_C or the parsed graphics clock drops below
# FREQ_FLOOR_MHZ. A summary of alert counts is printed when monitoring ends,
# including when it is interrupted by SIGINT/SIGTERM.
#
# Usage: monitor_gpu.sh [-d device_id] [-i interval_sec] [-n count]
#   -d  device id handed to `mthreads-gmi -i`            (default: 0)
#   -i  delay handed to `sleep` between samples          (default: 1)
#   -n  number of samples; negative = run until stopped  (default: -1)
#
# Output: appends to gpu_monitor_log.txt in the current directory.

set -u

# Default parameters
device_id=0
interval=1
count=-1
output_file="gpu_monitor_log.txt"

# Alert thresholds
readonly TEMP_LIMIT_C=95
readonly FREQ_FLOOR_MHZ=1750

# ANSI colors
RED='\033[0;31m'
NC='\033[0m' # reset color

# Alert counters
high_temp_count=0
low_freq_count=0

# Prints the usage line to stderr and exits with status 1.
usage() {
  echo "Usage: $0 [-d device_id] [-i interval_sec] [-n count]" >&2
  exit 1
}

# Prints the final alert statistics to stdout.
print_summary() {
  echo -e "\n=== 监控结束 ==="
  echo "高温次数: $high_temp_count"
  echo "降频次数: $low_freq_count"
}

# Signal handler: print the statistics, then exit with the status given in $1.
on_signal() {
  print_summary
  exit "$1"
}

# Parse command-line options (the output file is intentionally not configurable)
while getopts "d:i:n:" opt; do
  case "$opt" in
    d) device_id=$OPTARG ;;
    i) interval=$OPTARG ;;
    n) count=$OPTARG ;;
    *) usage ;;
  esac
done

# Reject values that would otherwise break the loop condition or make `sleep`
# fail on every iteration (which would turn the loop into a busy spin).
if ! [[ "$count" =~ ^-?[0-9]+$ ]]; then
  echo "Error: -n expects an integer, got '$count'" >&2
  usage
fi
if ! [[ "$interval" =~ ^([0-9]+([.][0-9]*)?|[.][0-9]+)[smhd]?$ ]]; then
  echo "Error: -i expects a non-negative number, got '$interval'" >&2
  usage
fi

if ! command -v mthreads-gmi >/dev/null 2>&1; then
  echo "Error: mthreads-gmi not found in PATH" >&2
  exit 127
fi

# With the default count (-1) the loop never ends by itself, so the summary
# must also be emitted when the script is interrupted.
trap 'on_signal 130' INT
trap 'on_signal 143' TERM

# Write the start banner only when the log file does not exist yet
if [ ! -f "$output_file" ]; then
  {
    echo "=== GPU Monitor Started at $(date) ==="
    echo "Device: $device_id, Interval: $interval sec"
    echo "----------------------------------------"
  } >> "$output_file"
fi

# Monitoring loop
i=0
# `[ ]` (not `(( ))`) keeps the comparison decimal even for values like "010".
while [ "$count" -lt 0 ] || [ "$i" -lt "$count" ]; do
  timestamp=$(date "+%Y-%m-%d %H:%M:%S")

  # Query the tool once so both readings come from the same snapshot.
  snapshot=$(mthreads-gmi -i "$device_id" -q)
  temp_line=$(grep "GPU Current Temp" <<<"$snapshot")
  freq_line=$(grep "Graphics" <<<"$snapshot")

  # Extract the first "<digits>C" / "<digits>MHz" occurrence; stays empty when
  # the tool failed or its output does not contain the expected text.
  temp=""
  freq=""
  if [[ "$temp_line" =~ ([0-9]+)C ]]; then
    temp=${BASH_REMATCH[1]}
  fi
  if [[ "$freq_line" =~ ([0-9]+)MHz ]]; then
    freq=${BASH_REMATCH[1]}
  fi

  if [ -z "$temp" ] || [ -z "$freq" ]; then
    echo "[$timestamp] 无法解析 mthreads-gmi 输出 (temp='${temp}', freq='${freq}')" >&2
  fi

  plain_msg=""
  color_msg=""

  # High temperature check (skipped when no reading could be parsed)
  if [ -n "$temp" ] && [ "$temp" -gt "$TEMP_LIMIT_C" ]; then
    plain_msg+="高温报警(${temp}C)"
    color_msg+="${RED}高温报警(${temp}C)${NC}"
    high_temp_count=$((high_temp_count + 1))
  fi

  # Throttling check (skipped when no reading could be parsed)
  if [ -n "$freq" ] && [ "$freq" -lt "$FREQ_FLOOR_MHZ" ]; then
    # Insert a separator only when a previous alert is already present.
    plain_msg+="${plain_msg:+ }降频警告(${freq}MHz)"
    color_msg+="${color_msg:+ }${RED}降频警告(${freq}MHz)${NC}"
    low_freq_count=$((low_freq_count + 1))
  fi

  {
    echo "[$timestamp]"
    echo "$temp_line"
    echo "$freq_line"
    if [ -n "$plain_msg" ]; then
      echo "$plain_msg"
    fi
    echo "------------------------------------"
  } >> "$output_file"

  # Print alerts on the terminal in red
  if [ -n "$plain_msg" ]; then
    echo -e "[$timestamp] $color_msg"
  fi

  sleep "$interval"
  i=$((i + 1))
done

# Print the statistics
print_summary
81+

0 commit comments

Comments
 (0)