Skip to content

Commit

Permalink
metrics: Support multi k8s in grafana dashboards (#32546)
Browse files Browse the repository at this point in the history
close #32593
  • Loading branch information
just1900 authored Feb 25, 2022
1 parent 53c9add commit cc3c905
Show file tree
Hide file tree
Showing 9 changed files with 685 additions and 538 deletions.
112 changes: 66 additions & 46 deletions br/metrics/grafana/br.json

Large diffs are not rendered by default.

84 changes: 52 additions & 32 deletions br/metrics/grafana/lightning.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion metrics/grafana/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ Why jsonnet?

1. Modify the jsonnet files (e.g. tidb_summary.jsonnet).
2. Run `generate_json.sh` to generate the json files by the jsonnet files.
3. Commit the modifications.
3. Commit the modifications.
220 changes: 120 additions & 100 deletions metrics/grafana/overview.json

Large diffs are not rendered by default.

138 changes: 79 additions & 59 deletions metrics/grafana/performance_overview.json

Large diffs are not rendered by default.

411 changes: 216 additions & 195 deletions metrics/grafana/tidb.json

Large diffs are not rendered by default.

101 changes: 58 additions & 43 deletions metrics/grafana/tidb_runtime.json

Large diffs are not rendered by default.

82 changes: 51 additions & 31 deletions metrics/grafana/tidb_summary.json

Large diffs are not rendered by default.

73 changes: 42 additions & 31 deletions metrics/grafana/tidb_summary.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,17 @@ local newDash = dashboard.new(
pluginId='prometheus',
pluginName='Prometheus',
)
.addTemplate(
template.new(
datasource=myDS,
hide=2,
label='K8s-cluster',
name='k8s_cluster',
query='label_values(pd_cluster_status, k8s_cluster)',
refresh='time',
sort=1,
)
)
.addTemplate(
// Default template for tidb-cloud
template.new(
Expand All @@ -48,7 +59,7 @@ local newDash = dashboard.new(
label='tidb_cluster',
multi=false,
name='tidb_cluster',
query='label_values(pd_cluster_status, tidb_cluster)',
query='label_values(pd_cluster_status{k8s_cluster="$k8s_cluster"}, tidb_cluster)',
refresh='time',
regex='',
sort=1,
Expand All @@ -67,7 +78,7 @@ local uptimeP = graphPanel.new(
)
.addTarget(
prometheus.target(
'time() - process_start_time_seconds{tidb_cluster="$tidb_cluster", job="tidb"}',
'time() - process_start_time_seconds{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", job="tidb"}',
legendFormat='{{instance}}',
)
);
Expand All @@ -82,13 +93,13 @@ local connectionP = graphPanel.new(
)
.addTarget(
prometheus.target(
'tidb_server_connections{tidb_cluster="$tidb_cluster"}',
'tidb_server_connections{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}',
legendFormat='{{instance}}',
)
)
.addTarget(
prometheus.target(
'sum(tidb_server_connections{tidb_cluster="$tidb_cluster"})',
'sum(tidb_server_connections{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"})',
legendFormat='total',
)
);
Expand All @@ -102,7 +113,7 @@ local cpuP = graphPanel.new(
)
.addTarget(
prometheus.target(
'rate(process_cpu_seconds_total{tidb_cluster="$tidb_cluster", job="tidb"}[1m])',
'rate(process_cpu_seconds_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", job="tidb"}[1m])',
legendFormat='{{instance}}',
)
);
Expand All @@ -116,13 +127,13 @@ local memP = graphPanel.new(
)
.addTarget(
prometheus.target(
'process_resident_memory_bytes{tidb_cluster="$tidb_cluster", job="tidb"}',
'process_resident_memory_bytes{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", job="tidb"}',
legendFormat='process-{{instance}}',
)
)
.addTarget(
prometheus.target(
'go_memstats_heap_inuse_bytes{tidb_cluster="$tidb_cluster", job="tidb"}',
'go_memstats_heap_inuse_bytes{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", job="tidb"}',
legendFormat='HeapInuse-{{instance}}',
)
);
Expand All @@ -138,19 +149,19 @@ local durationP = graphPanel.new(
)
.addTarget(
prometheus.target(
'histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket{tidb_cluster="$tidb_cluster", sql_type!="internal"}[1m])) by (le))',
'histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", sql_type!="internal"}[1m])) by (le))',
legendFormat='99',
)
)
.addTarget(
prometheus.target(
'histogram_quantile(0.95, sum(rate(tidb_server_handle_query_duration_seconds_bucket{tidb_cluster="$tidb_cluster", sql_type!="internal"}[1m])) by (le))',
'histogram_quantile(0.95, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", sql_type!="internal"}[1m])) by (le))',
legendFormat='95',
)
)
.addTarget(
prometheus.target(
'sum(rate(tidb_server_handle_query_duration_seconds_sum{tidb_cluster="$tidb_cluster", sql_type!="internal"}[30s])) / sum(rate(tidb_server_handle_query_duration_seconds_count{tidb_cluster="$tidb_cluster", sql_type!="internal"}[30s]))',
'sum(rate(tidb_server_handle_query_duration_seconds_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", sql_type!="internal"}[30s])) / sum(rate(tidb_server_handle_query_duration_seconds_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", sql_type!="internal"}[30s]))',
legendFormat='avg',
)
);
Expand All @@ -164,7 +175,7 @@ local failedP = graphPanel.new(
)
.addTarget(
prometheus.target(
'sum(increase(tidb_server_execute_error_total{tidb_cluster="$tidb_cluster"}[1m])) by (type, instance)',
'sum(increase(tidb_server_execute_error_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (type, instance)',
legendFormat='{{type}}-{{instance}}',
)
);
Expand All @@ -178,20 +189,20 @@ local cpsP = graphPanel.new(
)
.addTarget(
prometheus.target(
'sum(rate(tidb_server_query_total{tidb_cluster="$tidb_cluster"}[1m])) by (result)',
'sum(rate(tidb_server_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (result)',
legendFormat='query {{result}}',
)
)
.addTarget(
prometheus.target(
'sum(rate(tidb_server_query_total{tidb_cluster="$tidb_cluster", result="OK"}[1m] offset 1d))',
'sum(rate(tidb_server_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", result="OK"}[1m] offset 1d))',
legendFormat='yesterday',
hide=true,
)
)
.addTarget(
prometheus.target(
'sum(tidb_server_connections{tidb_cluster="$tidb_cluster"}) * sum(rate(tidb_server_handle_query_duration_seconds_count{tidb_cluster="$tidb_cluster"}[1m])) / sum(rate(tidb_server_handle_query_duration_seconds_sum{tidb_cluster="$tidb_cluster"}[1m]))',
'sum(tidb_server_connections{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}) * sum(rate(tidb_server_handle_query_duration_seconds_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) / sum(rate(tidb_server_handle_query_duration_seconds_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))',
legendFormat='ideal CPS',
hide=true,
)
Expand All @@ -206,7 +217,7 @@ local cpsByInstP = graphPanel.new(
)
.addTarget(
prometheus.target(
'sum(rate(tidb_server_query_total{tidb_cluster="$tidb_cluster"}[1m])) by (instance)',
'sum(rate(tidb_server_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (instance)',
legendFormat='{{instance}}',
)
);
Expand All @@ -220,13 +231,13 @@ local qpsP = graphPanel.new(
)
.addTarget(
prometheus.target(
'sum(rate(tidb_executor_statement_total{tidb_cluster="$tidb_cluster"}[1m])) by (type)',
'sum(rate(tidb_executor_statement_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (type)',
legendFormat='{{type}}',
)
)
.addTarget(
prometheus.target(
'sum(rate(tidb_executor_statement_total{tidb_cluster="$tidb_cluster"}[1m]))',
'sum(rate(tidb_executor_statement_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))',
legendFormat='total',
)
);
Expand All @@ -240,7 +251,7 @@ local cpsByCMDP = graphPanel.new(
)
.addTarget(
prometheus.target(
'sum(rate(tidb_server_query_total{tidb_cluster="$tidb_cluster"}[1m])) by (type)',
'sum(rate(tidb_server_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (type)',
legendFormat='{{type}}',
)
);
Expand All @@ -256,13 +267,13 @@ local parseP = graphPanel.new(
)
.addTarget(
prometheus.target(
'histogram_quantile(0.99, sum(rate(tidb_session_parse_duration_seconds_bucket{tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le))',
'histogram_quantile(0.99, sum(rate(tidb_session_parse_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le))',
legendFormat='99',
)
)
.addTarget(
prometheus.target(
'histogram_quantile(0.95, sum(rate(tidb_session_parse_duration_seconds_bucket{tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le))',
'histogram_quantile(0.95, sum(rate(tidb_session_parse_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le))',
legendFormat='95',
)
);
Expand All @@ -276,13 +287,13 @@ local compileP = graphPanel.new(
)
.addTarget(
prometheus.target(
'histogram_quantile(0.99, sum(rate(tidb_session_compile_duration_seconds_bucket{tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le))',
'histogram_quantile(0.99, sum(rate(tidb_session_compile_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le))',
legendFormat='99',
)
)
.addTarget(
prometheus.target(
'histogram_quantile(0.95, sum(rate(tidb_session_compile_duration_seconds_bucket{tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le))',
'histogram_quantile(0.95, sum(rate(tidb_session_compile_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le))',
legendFormat='95',
)
);
Expand All @@ -296,13 +307,13 @@ local exeP = graphPanel.new(
)
.addTarget(
prometheus.target(
'histogram_quantile(0.99, sum(rate(tidb_session_execute_duration_seconds_bucket{tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le))',
'histogram_quantile(0.99, sum(rate(tidb_session_execute_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le))',
legendFormat='99',
)
)
.addTarget(
prometheus.target(
'histogram_quantile(0.95, sum(rate(tidb_session_execute_duration_seconds_bucket{tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le))',
'histogram_quantile(0.95, sum(rate(tidb_session_execute_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le))',
legendFormat='95',
)
);
Expand All @@ -316,7 +327,7 @@ local planCacheP = graphPanel.new(
)
.addTarget(
prometheus.target(
'sum(rate(tidb_server_plan_cache_total{tidb_cluster="$tidb_cluster"}[1m])) by (type)',
'sum(rate(tidb_server_plan_cache_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (type)',
legendFormat='{{type}}',
)
);
Expand All @@ -332,7 +343,7 @@ local tpsP = graphPanel.new(
)
.addTarget(
prometheus.target(
'sum(rate(tidb_session_transaction_duration_seconds_count{tidb_cluster="$tidb_cluster"}[1m])) by (type, txn_mode)',
'sum(rate(tidb_session_transaction_duration_seconds_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (type, txn_mode)',
legendFormat='{{type}}-{{txn_mode}}',
)
);
Expand All @@ -346,19 +357,19 @@ local txnDurationP = graphPanel.new(
)
.addTarget(
prometheus.target(
'histogram_quantile(0.99, sum(rate(tidb_session_transaction_duration_seconds_bucket{tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le, txn_mode))',
'histogram_quantile(0.99, sum(rate(tidb_session_transaction_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le, txn_mode))',
legendFormat='99-{{txn_mode}}',
)
)
.addTarget(
prometheus.target(
'histogram_quantile(0.95, sum(rate(tidb_session_transaction_duration_seconds_bucket{tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le, txn_mode))',
'histogram_quantile(0.95, sum(rate(tidb_session_transaction_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le, txn_mode))',
legendFormat='95-{{txn_mode}}',
)
)
.addTarget(
prometheus.target(
'histogram_quantile(0.80, sum(rate(tidb_session_transaction_duration_seconds_bucket{tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le, txn_mode))',
'histogram_quantile(0.80, sum(rate(tidb_session_transaction_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", sql_type="general"}[1m])) by (le, txn_mode))',
legendFormat='80-{{txn_mode}}',
)
);
Expand All @@ -372,7 +383,7 @@ local maxTxnStmtP = graphPanel.new(
)
.addTarget(
prometheus.target(
'histogram_quantile(1, sum(rate(tidb_session_transaction_statement_num_bucket{tidb_cluster="$tidb_cluster"}[30s])) by (le))',
'histogram_quantile(1, sum(rate(tidb_session_transaction_statement_num_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[30s])) by (le))',
legendFormat='max',
)
);
Expand All @@ -386,7 +397,7 @@ local maxTxnRetryP = graphPanel.new(
)
.addTarget(
prometheus.target(
'histogram_quantile(1.0, sum(rate(tidb_session_retry_num_bucket{tidb_cluster="$tidb_cluster"}[30s])) by (le))',
'histogram_quantile(1.0, sum(rate(tidb_session_retry_num_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[30s])) by (le))',
legendFormat='max',
)
);
Expand Down

0 comments on commit cc3c905

Please sign in to comment.