Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 108 additions & 3 deletions tests/test_json_output_integrity.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,14 @@ def _assert_time_series_consistent(env, section_name, section):
avg = float(bucket["Average Latency"])
acc = float(bucket["Accumulated Latency"])
derived = acc / count
tolerance = max(ABS_TOLERANCE_MS, REL_TOLERANCE * derived)
# Tolerance must include the quantization error on Accumulated Latency:
# it's emitted with %lld (1 ms precision), so the reported value can
# differ from the true sum by up to 1 ms. The derived avg is therefore
# off by up to (1 / count) ms. For low-count buckets — e.g. the trailing
# second of a short --test-time run with --reconnect-interval=1 — that
# term dominates and the static 2% relative + 0.01 ms absolute floor
# would false-positive (RED-197205 regression-test CI surfaced this).
tolerance = max(ABS_TOLERANCE_MS, REL_TOLERANCE * derived, 1.0 / count)
if acc > 0:
env.assertGreater(
avg, 0.0,
Expand All @@ -132,9 +139,13 @@ def _validate_run_section(env, run_label, run_section):


def _build_benchmark(env, test_dir, extra_args, threads=2, clients=5,
requests=5000):
requests=5000, test_time=None):
# memtier doesn't accept both --test-time and --requests; when the test
# is time-bounded, drop the request count.
if test_time is not None:
requests = None
config = get_default_memtier_config(threads=threads, clients=clients,
requests=requests)
requests=requests, test_time=test_time)
benchmark_specs = {"name": env.testName, "args": extra_args}
addTLSArgs(benchmark_specs, env)
add_required_env_arguments(benchmark_specs, config, env,
Expand Down Expand Up @@ -242,6 +253,100 @@ def test_json_totals_latency_matches_accumulated_get_only(env):
pass


def test_reconnect_interval_does_not_zero_out_rates(env):
"""RED-197205: --reconnect-interval N must not zero out throughput rates.

On v2.2.1 (and earlier), running with --reconnect-interval=1 produced a
final summary where Ops/sec, Hits/sec, Misses/sec and KB/sec all rendered
as 0.00 even though Count and Latency were correct. Root cause:
client_group::merge_run_stats() iterated all clients (including those
that were prepare()-d but never reached set_start_time()) and merged
their zeroed m_start_time into the factorial average, dragging the
aggregate m_start_time toward the epoch. With m_end_time correct but
m_start_time near 0, test_duration_usec became a huge value and every
`ops / test_duration_usec * 1000000` division collapsed to ~0.

Fix (master, PR #350 / commit 0228bac): a run_stats::m_started atomic
flag plus a has_started() guard in merge_run_stats() and
aggregate_inst_histogram() that skips clients that never started.

This test exercises the exact failing configuration from the bug report
(multiple threads, multiple clients per thread, --reconnect-interval=1,
--ratio=1:1) and asserts the rate fields are strictly positive. The
Latency invariants are also re-checked in case a future fix in this
area accidentally re-introduces the integer-truncation pattern.
"""
# memtier refuses --reconnect-interval together with --cluster-mode:
# "error: cluster mode dose not support reconnect-interval option".
# The RED-197205 regression is not cluster-specific, so we only run
# this check in single-endpoint mode.
if env.isCluster():
env.skip()
return

test_dir = tempfile.mkdtemp()
try:
# NB: the bug fires reliably with --test-time, not with -n. With
# -n, all clients run to completion and set_start_time was reached
# on every one of them, so the zero-state merge never happened.
# With --test-time, the wall-clock cutoff occasionally leaves
# late-spawning / reconnect-stuck clients in the zero-init state,
# which is exactly the path PR #350 fixed.
benchmark, run_config = _build_benchmark(
env, test_dir,
extra_args=["--ratio=1:1", "--key-pattern=R:R", "--pipeline=1",
"--reconnect-interval=1"],
threads=4, clients=4, requests=None, test_time=5)
ok = benchmark.run()
failed_asserts = env.getNumberOfFailedAssertion()
try:
env.assertTrue(ok, message="memtier_benchmark exited non-zero")
results = _read_json(run_config, env)
env.assertTrue("ALL STATS" in results,
message="Expected 'ALL STATS' in JSON output")
run = results["ALL STATS"]
_validate_run_section(env, "ALL STATS", run)

# Core regression assertions: with --reconnect-interval=1, the
# rate fields must be > 0 (not the all-zero footprint of
# RED-197205). We assert against the Totals section, which is
# what the bug report cited verbatim.
totals = run["Totals"]
env.assertGreater(
totals["Count"], 0,
message="Totals.Count must be > 0 with --reconnect-interval=1")
env.assertGreater(
totals["Ops/sec"], 0.0,
message="RED-197205 regression: Totals.Ops/sec is 0 with "
"--reconnect-interval=1 (Count={}, Latency={})".format(
totals.get("Count"), totals.get("Latency")))
env.assertGreater(
totals["KB/sec"], 0.0,
message="RED-197205 regression: Totals.KB/sec is 0 with "
"--reconnect-interval=1")
# With ratio=1:1 there's at least one GET per SET, so Misses/sec
# must also be > 0 (keys are random and the SUT starts empty).
env.assertGreater(
totals["Misses/sec"], 0.0,
message="RED-197205 regression: Totals.Misses/sec is 0 with "
"--reconnect-interval=1 and ratio=1:1")

# And the per-command sections should also have non-zero rates,
# since the bug zeroed everything uniformly via the shared
# test_duration_usec.
for cmd in ("Sets", "Gets"):
if cmd in run and run[cmd].get("Count", 0) > 0:
env.assertGreater(
run[cmd]["Ops/sec"], 0.0,
message="RED-197205 regression: {}.Ops/sec is 0 with "
"--reconnect-interval=1".format(cmd))
finally:
if env.getNumberOfFailedAssertion() > failed_asserts:
debugPrintMemtierOnError(run_config, env)
finally:
pass


def test_json_multi_run_aggregated_sections_consistent(env):
"""With --run-count>1 we get BEST/WORST/AGGREGATED sections — each one
must satisfy the same Latency invariants as ALL STATS does for a single
Expand Down
Loading