Skip to content

Commit

Permalink
Display ZGC collector stats in report (#1502)
Browse files Browse the repository at this point in the history
* Add ZGC metrics to reporter.

* Update tests for GlobalStatsCalculator.

* Added test for reporter.

* Lint.

* Support plotting zgc in analyze script.

* Lint.

* Fixed a diffeent analyze script error.

* Revert "Fixed a diffeent analyze script error."

This reverts commit 32da26f.

* Docs.
  • Loading branch information
j-bennet authored Jun 2, 2022
1 parent 3a1c993 commit ecaa57e
Show file tree
Hide file tree
Showing 6 changed files with 149 additions and 28 deletions.
4 changes: 4 additions & 0 deletions docs/metrics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,10 @@ Rally stores the following metrics:
* ``node_total_young_gen_gc_count``: The total number of young generation garbage collections across the whole cluster as reported by the node stats API.
* ``node_total_old_gen_gc_time``: The total runtime of the old generation garbage collector across the whole cluster as reported by the node stats API.
* ``node_total_old_gen_gc_count``: The total number of old generation garbage collections across the whole cluster as reported by the node stats API.
* ``node_total_zgc_cycles_gc_time``: The total time spent doing GC by the ZGC garbage collector across the whole cluster as reported by the node stats API.
* ``node_total_zgc_cycles_gc_count``: The total number of garbage collections performed by ZGC across the whole cluster as reported by the node stats API.
* ``node_total_zgc_pauses_gc_time``: The total time spent in Stop-The-World pauses by the ZGC garbage collector across the whole cluster as reported by the node stats API.
* ``node_total_zgc_pauses_gc_count``: The total number of Stop-The-World pauses performed by ZGC across the whole cluster as reported by the node stats API.
* ``segments_count``: Total number of segments as reported by the index stats API.
* ``segments_memory_in_bytes``: Number of bytes used for segments as reported by the index stats API.
* ``segments_doc_values_memory_in_bytes``: Number of bytes used for doc values as reported by the index stats API.
Expand Down
24 changes: 24 additions & 0 deletions docs/summary_report.rst
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,30 @@ Total Old Gen GC count
* **Definition**: The total number of old generation garbage collections across the whole cluster as reported by the node stats API.
* **Corresponding metrics key**: ``node_total_old_gen_gc_count``

Total ZGC Cycles GC time
------------------------

* **Definition**: The total time spent doing GC by the ZGC garbage collector across the whole cluster as reported by the node stats API.
* **Corresponding metrics key**: ``node_total_zgc_cycles_gc_time``

Total ZGC Cycles GC count
-------------------------

* **Definition**: The total number of garbage collections performed by ZGC across the whole cluster as reported by the node stats API.
* **Corresponding metrics key**: ``node_total_zgc_cycles_gc_count``

Total ZGC Pauses GC time
------------------------

* **Definition**: The total time spent in Stop-The-World pauses by the ZGC garbage collector across the whole cluster as reported by the node stats API.
* **Corresponding metrics key**: ``node_total_zgc_pauses_gc_time``

Total ZGC Pauses GC count
-------------------------

* **Definition**: The total number of Stop-The-World pauses performed by ZGC across the whole cluster as reported by the node stats API.
* **Corresponding metrics key**: ``node_total_zgc_pauses_gc_count``

Store size
----------

Expand Down
8 changes: 8 additions & 0 deletions esrally/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1790,6 +1790,10 @@ def __call__(self):
result.young_gc_count = self.sum("node_total_young_gen_gc_count")
result.old_gc_time = self.sum("node_total_old_gen_gc_time")
result.old_gc_count = self.sum("node_total_old_gen_gc_count")
result.zgc_cycles_gc_time = self.sum("node_total_zgc_cycles_gc_time")
result.zgc_cycles_gc_count = self.sum("node_total_zgc_cycles_gc_count")
result.zgc_pauses_gc_time = self.sum("node_total_zgc_pauses_gc_time")
result.zgc_pauses_gc_count = self.sum("node_total_zgc_pauses_gc_count")

self.logger.debug("Gathering segment memory metrics.")
result.memory_segments = self.median("segments_memory_in_bytes")
Expand Down Expand Up @@ -1973,6 +1977,10 @@ def __init__(self, d=None):
self.young_gc_count = self.v(d, "young_gc_count")
self.old_gc_time = self.v(d, "old_gc_time")
self.old_gc_count = self.v(d, "old_gc_count")
self.zgc_cycles_gc_time = self.v(d, "zgc_cycles_gc_time")
self.zgc_cycles_gc_count = self.v(d, "zgc_cycles_gc_count")
self.zgc_pauses_gc_time = self.v(d, "zgc_pauses_gc_time")
self.zgc_pauses_gc_count = self.v(d, "zgc_pauses_gc_count")

self.memory_segments = self.v(d, "memory_segments")
self.memory_doc_values = self.v(d, "memory_doc_values")
Expand Down
57 changes: 29 additions & 28 deletions esrally/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,10 @@ def _report_gc_metrics(self, stats):
self._line("Total Young Gen GC count", "", stats.young_gc_count, ""),
self._line("Total Old Gen GC time", "", stats.old_gc_time, "s", convert.ms_to_seconds),
self._line("Total Old Gen GC count", "", stats.old_gc_count, ""),
self._line("Total ZGC Cycles GC time", "", stats.zgc_cycles_gc_time, "s", convert.ms_to_seconds),
self._line("Total ZGC Cycles GC count", "", stats.zgc_cycles_gc_count, ""),
self._line("Total ZGC Pauses GC time", "", stats.zgc_pauses_gc_time, "s", convert.ms_to_seconds),
self._line("Total ZGC Pauses GC count", "", stats.zgc_pauses_gc_count, ""),
)

def _report_disk_usage(self, stats):
Expand Down Expand Up @@ -849,41 +853,38 @@ def _report_total_count(self, name, baseline_total, contender_total):
)

def _report_gc_metrics(self, baseline_stats, contender_stats):
return self._join(
self._line(
"Total Young Gen GC time",
baseline_stats.young_gc_time,
contender_stats.young_gc_time,
"",
"s",
treat_increase_as_improvement=False,
formatter=convert.ms_to_seconds,
),
self._line(
"Total Young Gen GC count",
baseline_stats.young_gc_count,
contender_stats.young_gc_count,
"",
"",
treat_increase_as_improvement=False,
),
self._line(
"Total Old Gen GC time",
baseline_stats.old_gc_time,
contender_stats.old_gc_time,
line_method = self._line

def _time_metric(metric_prefix, description):
return line_method(
f"Total {description} GC time",
getattr(baseline_stats, f"{metric_prefix}_gc_time"),
getattr(contender_stats, f"{metric_prefix}_gc_time"),
"",
"s",
treat_increase_as_improvement=False,
formatter=convert.ms_to_seconds,
),
self._line(
"Total Old Gen GC count",
baseline_stats.old_gc_count,
contender_stats.old_gc_count,
)

def _count_metric(metric_prefix, description):
return line_method(
f"Total {description} GC count",
getattr(baseline_stats, f"{metric_prefix}_gc_count"),
getattr(contender_stats, f"{metric_prefix}_gc_count"),
"",
"",
treat_increase_as_improvement=False,
),
)

return self._join(
_time_metric("young", "Young Gen"),
_count_metric("young", "Young Gen"),
_time_metric("old", "Old Gen"),
_count_metric("old", "Old Gen"),
_time_metric("zgc_cycles", "ZGC Cycles"),
_count_metric("zgc_cycles", "ZGC Cycles"),
_time_metric("zgc_pauses", "ZGC Pauses"),
_count_metric("zgc_pauses", "ZGC Pauses"),
)

def _report_disk_usage(self, baseline_stats, contender_stats):
Expand Down
45 changes: 45 additions & 0 deletions tests/metrics_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1690,6 +1690,12 @@ def test_calculate_global_stats(self):
store.put_value_cluster_level("latency", 220, unit="ms", task="index #1", operation_type=track.OperationType.Bulk)
store.put_value_cluster_level("latency", 225, unit="ms", task="index #1", operation_type=track.OperationType.Bulk)

for collector in ("young_gen", "old_gen", "zgc_cycles", "zgc_pauses"):
store.put_value_node_level("rally-node-0", f"node_{collector}_gc_time", 100, "ms")
store.put_value_node_level("rally-node-0", f"node_{collector}_gc_count", 1)
store.put_value_cluster_level(f"node_total_{collector}_gc_time", 100, "ms")
store.put_value_cluster_level(f"node_total_{collector}_gc_count", 1)

store.put_value_cluster_level(
"service_time",
250,
Expand Down Expand Up @@ -1811,6 +1817,15 @@ def test_calculate_global_stats(self):
]
assert opm2["duration"] == 600 * 1000

assert stats.young_gc_time == 100
assert stats.young_gc_count == 1
assert stats.old_gc_time == 100
assert stats.old_gc_count == 1
assert stats.zgc_cycles_gc_time == 100
assert stats.zgc_cycles_gc_count == 1
assert stats.zgc_pauses_gc_time == 100
assert stats.zgc_pauses_gc_count == 1

def test_calculate_system_stats(self):
cfg = config.Config()
cfg.add(config.Scope.application, "system", "env.name", "unittest")
Expand Down Expand Up @@ -1982,6 +1997,10 @@ def test_as_flat_list(self):
"young_gc_count": 7,
"old_gc_time": 0,
"old_gc_count": 0,
"zgc_cycles_gc_time": 100,
"zgc_cycles_gc_count": 1,
"zgc_pauses_gc_time": 50,
"zgc_pauses_gc_count": 1,
"merge_time": 3702,
"merge_time_per_shard": {
"min": 40,
Expand Down Expand Up @@ -2153,6 +2172,32 @@ def test_as_flat_list(self):
},
}

assert select(metric_list, "zgc_cycles_gc_time") == {
"name": "zgc_cycles_gc_time",
"value": {
"single": 100,
},
}
assert select(metric_list, "zgc_cycles_gc_count") == {
"name": "zgc_cycles_gc_count",
"value": {
"single": 1,
},
}

assert select(metric_list, "zgc_pauses_gc_time") == {
"name": "zgc_pauses_gc_time",
"value": {
"single": 50,
},
}
assert select(metric_list, "zgc_pauses_gc_count") == {
"name": "zgc_pauses_gc_count",
"value": {
"single": 1,
},
}

assert select(metric_list, "merge_time") == {
"name": "merge_time",
"value": {
Expand Down
39 changes: 39 additions & 0 deletions tests/reporter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from unittest import mock

from esrally import config, reporter
from esrally.metrics import GlobalStats
from esrally.utils import convert


Expand Down Expand Up @@ -77,3 +78,41 @@ def test_diff_percent_divide_by_zero(self):
def test_diff_percent_ignore_formatter(self):
formatted = self.reporter._diff(1, 0, False, formatter=convert.factor(100.0), as_percentage=True)
assert formatted == "-100.00%"

def test_report_gc_metrics(self):
r1 = GlobalStats(
{
"young_gc_time": 100,
"young_gc_count": 1,
"old_gc_time": 200,
"old_gc_count": 1,
"zgc_cycles_gc_time": 300,
"zgc_cycles_gc_count": 1,
"zgc_pauses_gc_time": 400,
"zgc_pauses_gc_count": 1,
}
)

r2 = GlobalStats(
{
"young_gc_time": 200,
"young_gc_count": 2,
"old_gc_time": 300,
"old_gc_count": 2,
"zgc_cycles_gc_time": 400,
"zgc_cycles_gc_count": 2,
"zgc_pauses_gc_time": 500,
"zgc_pauses_gc_count": 2,
}
)
metrics = self.reporter._report_gc_metrics(r1, r2)
assert metrics == [
["Total Young Gen GC time", "", 0.1, 0.2, "+0.10000", "s", "+100.00%"],
["Total Young Gen GC count", "", 1, 2, "+1.00000", "", "+100.00%"],
["Total Old Gen GC time", "", 0.2, 0.3, "+0.10000", "s", "+50.00%"],
["Total Old Gen GC count", "", 1, 2, "+1.00000", "", "+100.00%"],
["Total ZGC Cycles GC time", "", 0.3, 0.4, "+0.10000", "s", "+33.33%"],
["Total ZGC Cycles GC count", "", 1, 2, "+1.00000", "", "+100.00%"],
["Total ZGC Pauses GC time", "", 0.4, 0.5, "+0.10000", "s", "+25.00%"],
["Total ZGC Pauses GC count", "", 1, 2, "+1.00000", "", "+100.00%"],
]

0 comments on commit ecaa57e

Please sign in to comment.