
Fix rate limit metrics registering twice and misreporting #13649

Merged
1 change: 1 addition & 0 deletions changelog.d/13649.bugfix
@@ -0,0 +1 @@
Fix rate limit gauge metrics registering twice and misreporting (`synapse_rate_limit_sleep_affected_hosts`, `synapse_rate_limit_reject_affected_hosts`).
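
For background, a minimal sketch of the failure mode, using a plain prometheus_client Gauge rather than Synapse's LaterGauge (illustrative only; not code from this PR):

from prometheus_client import Gauge

# Creating a second collector under the same metric name is rejected by the
# default prometheus_client registry with
#   ValueError: Duplicated timeseries in CollectorRegistry
# A wrapper that instead unregisters the old collector and registers the new
# one avoids the error, but then only the most recently constructed instance
# is ever scraped, hence gauges that misreport when the rate limiter is
# instantiated more than once.
first = Gauge("synapse_rate_limit_sleep_affected_hosts", "hosts put to sleep")
second = Gauge("synapse_rate_limit_sleep_affected_hosts", "hosts put to sleep")  # raises ValueError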
4 changes: 3 additions & 1 deletion synapse/server.py
@@ -756,7 +756,9 @@ def get_replication_streams(self) -> Dict[str, Stream]:
@cache_in_self
def get_federation_ratelimiter(self) -> FederationRateLimiter:
return FederationRateLimiter(
self.get_clock(), config=self.config.ratelimiting.rc_federation
self.get_clock(),
config=self.config.ratelimiting.rc_federation,
metrics_name="federation_servlets",
)

@cache_in_self
155 changes: 124 additions & 31 deletions synapse/util/ratelimitutils.py
@@ -15,10 +15,23 @@
import collections
import contextlib
import logging
import threading
import typing
from typing import Any, DefaultDict, Iterator, List, Set
from typing import (
Any,
Callable,
DefaultDict,
Dict,
Iterator,
List,
Mapping,
Optional,
Set,
Tuple,
)

from prometheus_client.core import Counter
from typing_extensions import ContextManager

from twisted.internet import defer

@@ -40,12 +53,20 @@


# Track how much the ratelimiter is affecting requests
rate_limit_sleep_counter = Counter("synapse_rate_limit_sleep", "")
rate_limit_reject_counter = Counter("synapse_rate_limit_reject", "")
rate_limit_sleep_counter = Counter(
"synapse_rate_limit_sleep",
"Number of requests slept by the rate limiter",
["rate_limiter_name"],
)
rate_limit_reject_counter = Counter(
"synapse_rate_limit_reject",
"Number of requests rejected by the rate limiter",
["rate_limiter_name"],
)
queue_wait_timer = Histogram(
"synapse_rate_limit_queue_wait_time_seconds",
"sec",
[],
"Amount of time spent waiting for the rate limiter to let our request through.",
["rate_limiter_name"],
buckets=(
0.005,
0.01,
@@ -65,35 +86,92 @@
)


_rate_limiter_instances: Set["FederationRateLimiter"] = set()
# Protects the _rate_limiter_instances set from concurrent access
_rate_limiter_instances_lock = threading.Lock()


def _get_counts_from_rate_limiter_instance(
count_func: Callable[["FederationRateLimiter"], int]
) -> Mapping[Tuple[str, ...], int]:
"""Returns a count of something (slept/rejected hosts) by (metrics_name)"""
# Cast to a list to prevent it changing while the Prometheus
# thread is collecting metrics
with _rate_limiter_instances_lock:
rate_limiter_instances = list(_rate_limiter_instances)

# Map from (metrics_name,) -> int, the number of something like slept hosts
# or rejected hosts. The key type is Tuple[str], but we leave the length
# unspecified for compatability with LaterGauge's annotations.
counts: Dict[Tuple[str, ...], int] = {}
for rate_limiter_instance in rate_limiter_instances:
# Only track metrics if they provided a `metrics_name` to
# differentiate this instance of the rate limiter.
if rate_limiter_instance.metrics_name:
key = (rate_limiter_instance.metrics_name,)
counts[key] = count_func(rate_limiter_instance)

return counts
Comment from the PR author on lines +89 to +114:
This is the same pattern as

_in_flight_requests: Set["RequestMetrics"] = set()

# Protects the _in_flight_requests set from concurrent access
_in_flight_requests_lock = threading.Lock()


def _get_in_flight_counts() -> Mapping[Tuple[str, ...], int]:
    """Returns a count of all in flight requests by (method, server_name)"""
    # Cast to a list to prevent it changing while the Prometheus
    # thread is collecting metrics
    with _in_flight_requests_lock:
        reqs = list(_in_flight_requests)

    for rm in reqs:
        rm.update_metrics()

    # Map from (method, name) -> int, the number of in flight requests of that
    # type. The key type is Tuple[str, str], but we leave the length unspecified
    # for compatability with LaterGauge's annotations.
    counts: Dict[Tuple[str, ...], int] = {}
    for rm in reqs:
        key = (rm.method, rm.name)
        counts[key] = counts.get(key, 0) + 1

    return counts


LaterGauge(
    "synapse_http_server_in_flight_requests_count",
    "",
    ["method", "servlet"],
    _get_in_flight_counts,
)



# We track the number of affected hosts per time-period so we can
# differentiate one really noisy homeserver from a general
# ratelimit tuning problem across the federation.
LaterGauge(
"synapse_rate_limit_sleep_affected_hosts",
"Number of hosts that had requests put to sleep",
["rate_limiter_name"],
Comment from the PR author:
I've made the label rate_limiter_name instead of name because name seemed pretty ambiguous in the context of synapse_rate_limit_sleep_affected_hosts. Trying to clarify the question "name of what?"

lambda: _get_counts_from_rate_limiter_instance(
lambda rate_limiter_instance: sum(
ratelimiter.should_sleep()
for ratelimiter in rate_limiter_instance.ratelimiters.values()
)
),
)
LaterGauge(
"synapse_rate_limit_reject_affected_hosts",
"Number of hosts that had requests rejected",
["rate_limiter_name"],
lambda: _get_counts_from_rate_limiter_instance(
lambda rate_limiter_instance: sum(
ratelimiter.should_reject()
for ratelimiter in rate_limiter_instance.ratelimiters.values()
)
),
)
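
To make the exported shape concrete, here is a small self-contained sketch of what the counting callback yields and how the LaterGauge above would render it. The class, helper, and values below are hypothetical stand-ins, not part of the diff:

from typing import Callable, Dict, List, Mapping, Optional, Tuple


class FakeRateLimiter:
    # Stand-in for FederationRateLimiter with only the fields the helper reads.
    def __init__(self, metrics_name: Optional[str], sleeping_hosts: int) -> None:
        self.metrics_name = metrics_name
        self.sleeping_hosts = sleeping_hosts


def get_counts(
    instances: List[FakeRateLimiter],
    count_func: Callable[[FakeRateLimiter], int],
) -> Mapping[Tuple[str, ...], int]:
    counts: Dict[Tuple[str, ...], int] = {}
    for instance in instances:
        # Instances constructed without a metrics_name are skipped, as in the diff.
        if instance.metrics_name:
            counts[(instance.metrics_name,)] = count_func(instance)
    return counts


print(get_counts([FakeRateLimiter("federation_servlets", 2)], lambda i: i.sleeping_hosts))
# {('federation_servlets',): 2}
# which the LaterGauge would expose on /metrics roughly as:
# synapse_rate_limit_sleep_affected_hosts{rate_limiter_name="federation_servlets"} 2.0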


class FederationRateLimiter:
def __init__(self, clock: Clock, config: FederationRatelimitSettings):
"""Used to rate limit request per-host."""

def __init__(
self,
clock: Clock,
config: FederationRatelimitSettings,
metrics_name: Optional[str] = None,
):
"""
Args:
clock
config
metrics_name: The name of the rate limiter so we can differentiate it
from the rest in the metrics. If `None`, we don't track metrics
for this rate limiter.

"""
self.metrics_name = metrics_name

def new_limiter() -> "_PerHostRatelimiter":
return _PerHostRatelimiter(clock=clock, config=config)
return _PerHostRatelimiter(
clock=clock, config=config, metrics_name=metrics_name
)

self.ratelimiters: DefaultDict[
str, "_PerHostRatelimiter"
] = collections.defaultdict(new_limiter)

# We track the number of affected hosts per time-period so we can
# differentiate one really noisy homeserver from a general
# ratelimit tuning problem across the federation.
LaterGauge(
"synapse_rate_limit_sleep_affected_hosts",
"Number of hosts that had requests put to sleep",
[],
lambda: sum(
ratelimiter.should_sleep() for ratelimiter in self.ratelimiters.values()
),
)
LaterGauge(
"synapse_rate_limit_reject_affected_hosts",
"Number of hosts that had requests rejected",
[],
lambda: sum(
ratelimiter.should_reject()
for ratelimiter in self.ratelimiters.values()
),
)
with _rate_limiter_instances_lock:
_rate_limiter_instances.add(self)

def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]":
"""Used to ratelimit an incoming request from a given host
Expand All @@ -114,13 +192,23 @@ def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]


class _PerHostRatelimiter:
def __init__(self, clock: Clock, config: FederationRatelimitSettings):
def __init__(
self,
clock: Clock,
config: FederationRatelimitSettings,
metrics_name: Optional[str] = None,
):
"""
Args:
clock
config
metrics_name: The name of the rate limiter so we can differentiate it
from the rest in the metrics. If `None`, we don't track metrics
for this rate limiter.
from the rest in the metrics
"""
self.clock = clock
self.metrics_name = metrics_name

self.window_size = config.window_size
self.sleep_limit = config.sleep_limit
@@ -178,7 +266,10 @@ def should_sleep(self) -> bool:
return len(self.request_times) > self.sleep_limit

async def _on_enter_with_tracing(self, request_id: object) -> None:
with start_active_span("ratelimit wait"), queue_wait_timer.time():
maybe_metrics_cm: ContextManager = contextlib.nullcontext()
if self.metrics_name:
maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time()
with start_active_span("ratelimit wait"), maybe_metrics_cm:
Comment on lines +269 to +272, from @MadLittleMods (PR author), Aug 26, 2022:
Slightly worried about context issues here again just because it's going a bit off the beaten path (please double-check) 😰

I've made metrics_name optional because I'm not concerned about tracking the UsernameAvailabilityRestServlet which is used to rate limit people by IP. I only care about tracking the federation servlets which FederationRateLimiter was made for.

We could make metrics_name required and record metrics for the username availability servlet to get rid of this complexity though.

await self._on_enter(request_id)

def _on_enter(self, request_id: object) -> "defer.Deferred[None]":
@@ -193,7 +284,8 @@ def _on_enter(self, request_id: object) -> "defer.Deferred[None]":
# sleeping or in the ready queue).
if self.should_reject():
logger.debug("Ratelimiter(%s): rejecting request", self.host)
rate_limit_reject_counter.inc()
if self.metrics_name:
rate_limit_reject_counter.labels(self.metrics_name).inc()
raise LimitExceededError(
retry_after_ms=int(self.window_size / self.sleep_limit)
)
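
The guarded counter here and the nullcontext-wrapped timer in _on_enter_with_tracing above both follow the same "only record metrics when a metrics_name was given" pattern. A standalone sketch of that pattern, with illustrative metric and function names rather than Synapse's own:

import contextlib
from typing import ContextManager, Optional

from prometheus_client import Histogram

wait_timer = Histogram("example_queue_wait_seconds", "Time spent waiting", ["name"])


def wait_timer_cm(metrics_name: Optional[str]) -> ContextManager:
    # Time the wait only when a metrics name was supplied; otherwise return a
    # no-op context manager so the `with` statement stays unconditional.
    if metrics_name:
        return wait_timer.labels(metrics_name).time()
    return contextlib.nullcontext()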
@@ -228,7 +320,8 @@ def queue_request() -> "defer.Deferred[None]":
id(request_id),
self.sleep_sec,
)
rate_limit_sleep_counter.inc()
if self.metrics_name:
rate_limit_sleep_counter.labels(self.metrics_name).inc()
ret_defer = run_in_background(self.clock.sleep, self.sleep_sec)

self.sleeping_requests.add(request_id)