From 79cc67d35306b9583dc7fea749825da7ea1a4292 Mon Sep 17 00:00:00 2001 From: Pierre Krieger Date: Tue, 8 Sep 2020 11:59:37 +0200 Subject: [PATCH] Update the service tasks Grafana dashboard (#7038) --- .../substrate-service-tasks.json | 400 +++++++----------- 1 file changed, 143 insertions(+), 257 deletions(-) diff --git a/.maintain/monitoring/grafana-dashboards/substrate-service-tasks.json b/.maintain/monitoring/grafana-dashboards/substrate-service-tasks.json index 245071c210bfc..539fdec086a37 100644 --- a/.maintain/monitoring/grafana-dashboards/substrate-service-tasks.json +++ b/.maintain/monitoring/grafana-dashboards/substrate-service-tasks.json @@ -44,14 +44,14 @@ { "datasource": "$data_source", "enable": true, - "expr": "increase(${metric_namespace}_tasks_ended_total{reason=\"panic\", instance=~\"${nodename}\"}[5m])", + "expr": "increase(${metric_namespace}_tasks_ended_total{reason=\"panic\", instance=~\"${nodename}\"}[10m])", "hide": true, "iconColor": "rgba(255, 96, 96, 1)", "limit": 100, "name": "Task panics", "rawQuery": "SELECT\n extract(epoch from time_column) AS time,\n text_column as text,\n tags_column as tags\nFROM\n metric_table\nWHERE\n $__timeFilter(time_column)\n", "showIn": 0, - "step": "", + "step": "10m", "tags": [], "textFormat": "{{instance}} - {{task_name}}", "titleFormat": "Panic!", @@ -60,12 +60,12 @@ { "datasource": "$data_source", "enable": true, - "expr": "changes(${metric_namespace}_process_start_time_seconds{instance=~\"${nodename}\"}[5m])", + "expr": "changes(${metric_namespace}_process_start_time_seconds{instance=~\"${nodename}\"}[10m])", "hide": false, "iconColor": "#8AB8FF", "name": "Node reboots", "showIn": 0, - "step": "", + "step": "10m", "textFormat": "{{instance}}", "titleFormat": "Reboots" } @@ -75,7 +75,7 @@ "gnetId": null, "graphTooltip": 0, "id": null, - "iteration": 1594822742772, + "iteration": 1599471940817, "links": [], "panels": [ { @@ -87,9 +87,9 @@ "x": 0, "y": 0 }, - "id": 25, + "id": 29, "panels": [], - "title": "CPU & Memory", + "title": "Tasks", "type": "row" }, { @@ -107,130 +107,23 @@ "y": 1 }, "hiddenSeries": false, - "id": 9, + "id": 11, + "interval": "1m", "legend": { - "avg": false, + "alignAsTable": true, + "avg": true, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "stddev-above", - "fillBelowTo": "stddev-below", - "hideTooltip": true, - "lines": false - }, - { - "alias": "stddev-below", - "hideTooltip": true, - "lines": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "avg(${metric_namespace}_cpu_usage_percentage{instance=~\"${nodename}\"})", - "interval": "", - "legendFormat": "cpu-usage", - "refId": "A" - }, - { - "expr": "avg(${metric_namespace}_cpu_usage_percentage{instance=~\"${nodename}\"}) - stddev(${metric_namespace}_cpu_usage_percentage{instance=~\"${nodename}\"})", - "interval": "", - "legendFormat": "stddev-below", - "refId": "B" - }, - { - "expr": "avg(${metric_namespace}_cpu_usage_percentage{instance=~\"${nodename}\"}) + stddev(${metric_namespace}_cpu_usage_percentage{instance=~\"${nodename}\"})", - "interval": "", - "legendFormat": "stddev-above", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Average CPU usage and standard deviation", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, + "rightSide": true, "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$data_source", - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 6, - "w": 24, - "x": 0, - "y": 7 - }, - "hiddenSeries": false, - "id": 20, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, "total": false, - "values": false + "values": true }, "lines": true, - "linewidth": 1, + "linewidth": 2, "nullPointMode": "null", "options": { "dataLinks": [] @@ -242,12 +135,12 @@ "seriesOverrides": [], "spaceLength": 10, "stack": false, - "steppedLine": false, + "steppedLine": true, "targets": [ { - "expr": "${metric_namespace}_memory_usage_bytes{instance=~\"${nodename}\"}", + "expr": "avg(irate(${metric_namespace}_tasks_polling_duration_sum{instance=~\"${nodename}\"}[10m])) by (task_name)", "interval": "", - "legendFormat": "{{instance}}", + "legendFormat": "{{task_name}}", "refId": "A" } ], @@ -255,7 +148,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Memory usage", + "title": "CPU time spent on each task (average per node)", "tooltip": { "shared": true, "sort": 2, @@ -271,7 +164,7 @@ }, "yaxes": [ { - "format": "decbytes", + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -292,20 +185,6 @@ "alignLevel": null } }, - { - "collapsed": false, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 13 - }, - "id": 29, - "panels": [], - "title": "Tasks", - "type": "row" - }, { "aliasColors": {}, "bars": false, @@ -318,19 +197,23 @@ "h": 6, "w": 24, "x": 0, - "y": 14 + "y": 7 }, "hiddenSeries": false, - "id": 11, - "interval": "1m", + "id": 30, + "interval": "", "legend": { - "avg": false, + "alignAsTable": true, + "avg": true, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, - "show": false, + "rightSide": true, + "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 2, @@ -348,7 +231,7 @@ "steppedLine": true, "targets": [ { - "expr": "avg(increase(${metric_namespace}_tasks_polling_duration_sum{instance=~\"${nodename}\"}[$__interval])) by (task_name) * 1000 / $__interval_ms", + "expr": "avg(irate(${metric_namespace}_tasks_polling_duration_count{instance=~\"${nodename}\"}[10m])) by (task_name)", "interval": "", "legendFormat": "{{task_name}}", "refId": "A" @@ -358,7 +241,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "CPU time spent on each task (average per node)", + "title": "Task polling rate per second (average per node)", "tooltip": { "shared": true, "sort": 2, @@ -374,7 +257,7 @@ }, "yaxes": [ { - "format": "percentunit", + "format": "cps", "label": null, "logBase": 1, "max": null, @@ -407,16 +290,16 @@ "h": 6, "w": 24, "x": 0, - "y": 20 + "y": 13 }, "hiddenSeries": false, - "id": 30, + "id": 31, "interval": "", "legend": { "alignAsTable": true, - "avg": true, + "avg": false, "current": false, - "max": false, + "max": true, "min": false, "rightSide": true, "show": true, @@ -439,7 +322,7 @@ "steppedLine": true, "targets": [ { - "expr": "avg(rate(${metric_namespace}_tasks_polling_duration_count{instance=~\"${nodename}\"}[5m])) by (task_name)", + "expr": "max(irate(${metric_namespace}_tasks_polling_duration_count{instance=~\"${nodename}\"}[10m])) by (task_name)", "interval": "", "legendFormat": "{{task_name}}", "refId": "A" @@ -449,7 +332,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Task polling rate per second (average per node)", + "title": "Task polling rate per second (maximum per node)", "tooltip": { "shared": true, "sort": 2, @@ -498,16 +381,16 @@ "h": 6, "w": 24, "x": 0, - "y": 26 + "y": 19 }, "hiddenSeries": false, - "id": 31, + "id": 15, "interval": "", "legend": { "alignAsTable": true, - "avg": false, + "avg": true, "current": false, - "max": true, + "max": false, "min": false, "rightSide": true, "show": true, @@ -515,8 +398,8 @@ "values": true }, "lines": true, - "linewidth": 2, - "nullPointMode": "null", + "linewidth": 1, + "nullPointMode": "null as zero", "options": { "dataLinks": [] }, @@ -530,7 +413,7 @@ "steppedLine": true, "targets": [ { - "expr": "max(rate(${metric_namespace}_tasks_polling_duration_count{instance=~\"${nodename}\"}[5m])) by (task_name)", + "expr": "avg by(task_name) (irate(${metric_namespace}_tasks_spawned_total{instance=~\"${nodename}\"}[10m]))", "interval": "", "legendFormat": "{{task_name}}", "refId": "A" @@ -540,7 +423,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Task polling rate per second (maximum per node)", + "title": "Number of tasks started per second (average per node)", "tooltip": { "shared": true, "sort": 2, @@ -556,11 +439,11 @@ }, "yaxes": [ { - "format": "cps", + "format": "short", "label": null, - "logBase": 1, + "logBase": 10, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -569,7 +452,7 @@ "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { @@ -589,21 +472,21 @@ "h": 6, "w": 24, "x": 0, - "y": 32 + "y": 25 }, "hiddenSeries": false, - "id": 15, + "id": 16, "interval": "", "legend": { - "alignAsTable": false, + "alignAsTable": true, "avg": false, "current": false, - "max": false, + "max": true, "min": false, - "rightSide": false, - "show": false, + "rightSide": true, + "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -621,7 +504,7 @@ "steppedLine": true, "targets": [ { - "expr": "avg by(task_name) (irate(${metric_namespace}_tasks_spawned_total{instance=~\"${nodename}\"}[5m]))", + "expr": "max by(task_name) (irate(${metric_namespace}_tasks_spawned_total{instance=~\"${nodename}\"}[10m]))", "interval": "", "legendFormat": "{{task_name}}", "refId": "A" @@ -631,7 +514,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Number of tasks started per second (average per node)", + "title": "Number of tasks started per second (maximum over all nodes)", "tooltip": { "shared": true, "sort": 2, @@ -680,21 +563,21 @@ "h": 6, "w": 24, "x": 0, - "y": 38 + "y": 31 }, "hiddenSeries": false, - "id": 16, + "id": 2, "interval": "", "legend": { - "alignAsTable": false, - "avg": false, + "alignAsTable": true, + "avg": true, "current": false, "max": false, "min": false, - "rightSide": false, - "show": false, + "rightSide": true, + "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -712,7 +595,7 @@ "steppedLine": true, "targets": [ { - "expr": "max by(task_name) (irate(${metric_namespace}_tasks_spawned_total{instance=~\"${nodename}\"}[5m]))", + "expr": "avg by(task_name) (${metric_namespace}_tasks_spawned_total{instance=~\"${nodename}\"} - sum(${metric_namespace}_tasks_ended_total{instance=~\"${nodename}\"}) without(reason))", "interval": "", "legendFormat": "{{task_name}}", "refId": "A" @@ -722,7 +605,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Number of tasks started per second (maximum over all nodes)", + "title": "Number of tasks running (average per node)", "tooltip": { "shared": true, "sort": 2, @@ -771,21 +654,21 @@ "h": 6, "w": 24, "x": 0, - "y": 44 + "y": 37 }, "hiddenSeries": false, - "id": 2, + "id": 3, "interval": "", "legend": { - "alignAsTable": false, + "alignAsTable": true, "avg": false, "current": false, - "max": false, + "max": true, "min": false, - "rightSide": false, - "show": false, + "rightSide": true, + "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -803,7 +686,7 @@ "steppedLine": true, "targets": [ { - "expr": "avg by(task_name) (${metric_namespace}_tasks_spawned_total{instance=~\"${nodename}\"} - sum(${metric_namespace}_tasks_ended_total{instance=~\"${nodename}\"}) without(reason))", + "expr": "max by(task_name) (${metric_namespace}_tasks_spawned_total{instance=~\"${nodename}\"} - sum(${metric_namespace}_tasks_ended_total{instance=~\"${nodename}\"}) without(reason))", "interval": "", "legendFormat": "{{task_name}}", "refId": "A" @@ -813,7 +696,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Number of tasks running (average per node)", + "title": "Number of tasks running (maximum over all nodes)", "tooltip": { "shared": true, "sort": 2, @@ -856,27 +739,30 @@ "dashLength": 10, "dashes": false, "datasource": "$data_source", + "decimals": null, "fill": 0, "fillGradient": 0, "gridPos": { "h": 6, "w": 24, "x": 0, - "y": 50 + "y": 43 }, "hiddenSeries": false, - "id": 3, + "id": 7, "interval": "", "legend": { - "alignAsTable": false, - "avg": false, + "alignAsTable": true, + "avg": true, "current": false, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, - "rightSide": false, - "show": false, + "rightSide": true, + "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -890,11 +776,11 @@ "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, - "stack": false, + "stack": true, "steppedLine": true, "targets": [ { - "expr": "max by(task_name) (${metric_namespace}_tasks_spawned_total{instance=~\"${nodename}\"} - sum(${metric_namespace}_tasks_ended_total{instance=~\"${nodename}\"}) without(reason))", + "expr": "avg(\n irate(${metric_namespace}_tasks_polling_duration_bucket{instance=~\"${nodename}\", le=\"+Inf\"}[10m])\n - ignoring(le)\n irate(${metric_namespace}_tasks_polling_duration_bucket{instance=~\"${nodename}\", le=\"1.024\"}[10m])\n) by (task_name) > 0", "interval": "", "legendFormat": "{{task_name}}", "refId": "A" @@ -904,11 +790,11 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Number of tasks running (maximum over all nodes)", + "title": "Calls to `Future::poll` that took more than one second (average per node)", "tooltip": { "shared": true, "sort": 2, - "value_type": "individual" + "value_type": "cumulative" }, "type": "graph", "xaxis": { @@ -920,9 +806,10 @@ }, "yaxes": [ { - "format": "short", - "label": null, - "logBase": 10, + "decimals": null, + "format": "cps", + "label": "Calls to `Future::poll`/second", + "logBase": 1, "max": null, "min": "0", "show": true @@ -933,7 +820,7 @@ "logBase": 1, "max": null, "min": null, - "show": true + "show": false } ], "yaxis": { @@ -941,6 +828,20 @@ "alignLevel": null } }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 49 + }, + "id": 27, + "panels": [], + "title": "Misc", + "type": "row" + }, { "aliasColors": {}, "bars": false, @@ -950,21 +851,18 @@ "fill": 0, "fillGradient": 0, "gridPos": { - "h": 6, + "h": 7, "w": 24, "x": 0, - "y": 56 + "y": 50 }, "hiddenSeries": false, - "id": 7, - "interval": "", + "id": 32, "legend": { "alignAsTable": true, "avg": true, "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, + "max": true, "min": false, "rightSide": true, "show": true, @@ -973,7 +871,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null as zero", + "nullPointMode": "null", "options": { "dataLinks": [] }, @@ -983,25 +881,25 @@ "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, - "stack": true, - "steppedLine": true, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "avg(\n rate(${metric_namespace}_tasks_polling_duration_bucket{instance=~\"${nodename}\", le=\"+Inf\"}[1m])\n - ignoring(le)\n rate(${metric_namespace}_tasks_polling_duration_bucket{instance=~\"${nodename}\", le=\"1.024\"}[1m])\n) by (task_name) > 0", + "expr": "avg(${metric_namespace}_unbounded_channel_len{instance=~\"${nodename}\", action = \"send\"} - ignoring(action) ${metric_namespace}_unbounded_channel_len{instance=~\"${nodename}\", action = \"received\"}) by (entity)", "interval": "", - "legendFormat": "{{task_name}}", - "refId": "A" + "legendFormat": "{{entity}}", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Calls to `Future::poll` that took more than one second (average per node)", + "title": "Unbounded channels size (average per node)", "tooltip": { "shared": true, "sort": 2, - "value_type": "cumulative" + "value_type": "individual" }, "type": "graph", "xaxis": { @@ -1013,11 +911,11 @@ }, "yaxes": [ { - "format": "cps", - "label": "Calls to `Future::poll`/second", + "format": "short", + "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { @@ -1034,20 +932,6 @@ "alignLevel": null } }, - { - "collapsed": false, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 62 - }, - "id": 27, - "panels": [], - "title": "Misc", - "type": "row" - }, { "aliasColors": {}, "bars": false, @@ -1060,18 +944,20 @@ "h": 7, "w": 24, "x": 0, - "y": 63 + "y": 57 }, "hiddenSeries": false, - "id": 23, + "id": 33, "legend": { - "avg": false, + "alignAsTable": true, + "avg": true, "current": false, "max": false, "min": false, - "show": false, + "rightSide": true, + "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -1089,17 +975,17 @@ "steppedLine": false, "targets": [ { - "expr": "${metric_namespace}_threads{instance=~\"${nodename}\"}", + "expr": "avg(irate(${metric_namespace}_unbounded_channel_len{instance=~\"${nodename}\", action = \"send\"}[10m])) by (entity)", "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" + "legendFormat": "{{entity}}", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Number of threads", + "title": "Unbounded channels rate (average per node)", "tooltip": { "shared": true, "sort": 2, @@ -1115,7 +1001,7 @@ }, "yaxes": [ { - "format": "short", + "format": "cps", "label": null, "logBase": 1, "max": null, @@ -1137,7 +1023,7 @@ } } ], - "refresh": "30s", + "refresh": false, "schemaVersion": 22, "style": "dark", "tags": [], @@ -1147,7 +1033,7 @@ "allValue": null, "current": {}, "datasource": "$data_source", - "definition": "${metric_namespace}_cpu_usage_percentage", + "definition": "${metric_namespace}_process_start_time_seconds", "hide": 0, "includeAll": true, "index": -1, @@ -1155,7 +1041,7 @@ "multi": true, "name": "nodename", "options": [], - "query": "${metric_namespace}_cpu_usage_percentage", + "query": "${metric_namespace}_process_start_time_seconds", "refresh": 1, "regex": "/instance=\"(.*?)\"/", "skipUrlSync": false, @@ -1228,5 +1114,5 @@ "variables": { "list": [] }, - "version": 44 + "version": 52 }