From 3ce6248f8ceca322fcd6eabcadbe05b5444eec91 Mon Sep 17 00:00:00 2001 From: Sagar Dhawan Date: Thu, 14 Nov 2019 20:36:34 -0800 Subject: [PATCH] Add CPU and RAM usage to Metrics (#6968) * Add CPU usage to Metrics * Add RAM usage and rename to system-stats * Shellcheck * Remove SC exception * Address review comments --- .../dashboards/testnet-monitor.json | 164 ++++++++++++++++++ net/remote/remote-node.sh | 2 + scripts/system-stats.sh | 24 +++ 3 files changed, 190 insertions(+) create mode 100644 scripts/system-stats.sh diff --git a/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json b/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json index ec08547564d498..c7fc10b4362b38 100644 --- a/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json +++ b/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json @@ -9603,6 +9603,170 @@ "align": false, "alignLevel": null } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 109 + }, + "id": 74, + "panels": [], + "title": "Resources", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 110 + }, + "id": 70, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"cpu_usage\") as \"cpu_usage\" FROM \"$testnet\".\"autogen\".\"system-stats\" WHERE 
hostname =~ /$hostid/ AND $timeFilter GROUP BY time(5s) fill(null)\n", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT max(\"ram_usage\") as \"ram_usage\" FROM \"$testnet\".\"autogen\".\"system-stats\" WHERE hostname =~ /$hostid/ AND $timeFilter GROUP BY time(1s) fill(null)\n", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Resource Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "refresh": "60s", diff --git a/net/remote/remote-node.sh b/net/remote/remote-node.sh index c355fac241c24a..7a05dc918e78c0 100755 --- a/net/remote/remote-node.sh +++ b/net/remote/remote-node.sh @@ -139,6 +139,8 @@ cat >> ~/solana/on-reboot < net-stats.pid scripts/iftop.sh > iftop.log 2>&1 & echo \$! > iftop.pid + scripts/system-stats.sh > system-stats.log 2>&1 & + echo \$! 
> system-stats.pid
 
     if ${GPU_CUDA_OK} && [[ -e /dev/nvidia0 ]]; then
       echo Selecting solana-validator-cuda
diff --git a/scripts/system-stats.sh b/scripts/system-stats.sh
new file mode 100755
index 00000000000000..2410ade72d7034
--- /dev/null
+++ b/scripts/system-stats.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+#
+# Reports cpu and ram usage statistics
+#
+# Once per second, takes one batch-mode `top` snapshot and submits a single
+# `system-stats` datapoint, tagged with this machine's hostname, through
+# scripts/metrics-write-datapoint.sh. The datapoint carries two fields:
+#   cpu_usage - 100 minus the idle ("id") percentage on top's %Cpu(s) line
+#   ram_usage - "used" as a percentage of "total" on top's "Mem" line
+#
+# Exits 0 immediately on non-Linux hosts. Exits non-zero, with a message on
+# stderr, if top's summary cannot be parsed.
+#
+set -e
+
+[[ $(uname) == Linux ]] || exit 0
+
+# need to cd like this to avoid #SC1091
+cd "$(dirname "$0")/.."
+source scripts/configure-metrics.sh
+
+# A plain non-negative decimal number, e.g. 42 or 97.3
+number_re='^[0-9]+([.][0-9]+)?$'
+
+while true; do
+  # Run top once per iteration so that both figures come from the same
+  # snapshot. LC_ALL=C keeps '.' as the decimal separator for the parsing below.
+  top_output=$(LC_ALL=C top -bn1)
+
+  # collect the idle cpu percentage; usage is 100% minus this value
+  cpu_idle=$(sed -n 's/.*%Cpu(s):.*, *\([0-9.]*\)%* id.*/\1/p' <<<"$top_output")
+
+  # collect total and used memory. Depending on its version and configuration
+  # top labels this line KiB/MiB/GiB/... Mem; the unit cancels out in the ratio.
+  ram_total_and_used=$(
+    sed -n 's/.*[KMGTPE]iB Mem *: *\([0-9.]*\)%* total.*, *\([0-9.]*\)%* used.*/\1 \2/p' \
+      <<<"$top_output"
+  )
+  read -r total used <<<"$ram_total_and_used"
+
+  if ! [[ $cpu_idle =~ $number_re && $total =~ $number_re && $used =~ $number_re ]]; then
+    echo "Error: unable to parse cpu/ram usage from top output" >&2
+    exit 1
+  fi
+
+  # Hand the values to awk with -v instead of splicing them into the program
+  # text, and refuse to divide by a zero total.
+  report=$(
+    awk -v idle="$cpu_idle" -v total="$total" -v used="$used" 'BEGIN {
+      if (total <= 0) {
+        print "Error: top reported no total memory" > "/dev/stderr"
+        exit 1
+      }
+      print "cpu_usage=" (100 - idle) ",ram_usage=" (used / total * 100)
+    }'
+  )
+
+  ./scripts/metrics-write-datapoint.sh "system-stats,hostname=$HOSTNAME $report"
+  sleep 1
+done