Skip to content

Commit 2c236be

Browse files
Refactor Counter metrics to be homeserver-scoped (#18656)
Bulk refactor `Counter` metrics to be homeserver-scoped. We also add lints to make sure that new `Counter` metrics don't sneak in without using the `server_name` label (`SERVER_NAME_LABEL`). All of the "Fill in" commits are just bulk refactor. Part of #18592 ### Testing strategy 1. Add the `metrics` listener in your `homeserver.yaml` ```yaml listeners: # This is just showing how to configure metrics either way # # `http` `metrics` resource - port: 9322 type: http bind_addresses: ['127.0.0.1'] resources: - names: [metrics] compress: false # `metrics` listener - port: 9323 type: metrics bind_addresses: ['127.0.0.1'] ``` 1. Start the homeserver: `poetry run synapse_homeserver --config-path homeserver.yaml` 1. Fetch `http://localhost:9322/_synapse/metrics` and/or `http://localhost:9323/metrics` 1. Observe response includes the `synapse_user_registrations_total`, `synapse_http_server_response_count_total`, etc metrics with the `server_name` label
1 parent 458e641 commit 2c236be

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+753
-240
lines changed

changelog.d/18656.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Refactor `Counter` metrics to be homeserver-scoped.

scripts-dev/mypy_synapse_plugin.py

Lines changed: 104 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,13 @@
2828
import mypy.types
2929
from mypy.erasetype import remove_instance_last_known_values
3030
from mypy.errorcodes import ErrorCode
31-
from mypy.nodes import ARG_NAMED_OPT, TempNode, Var
32-
from mypy.plugin import FunctionSigContext, MethodSigContext, Plugin
31+
from mypy.nodes import ARG_NAMED_OPT, ListExpr, NameExpr, TempNode, Var
32+
from mypy.plugin import (
33+
FunctionLike,
34+
FunctionSigContext,
35+
MethodSigContext,
36+
Plugin,
37+
)
3338
from mypy.typeops import bind_self
3439
from mypy.types import (
3540
AnyType,
@@ -43,8 +48,26 @@
4348
UnionType,
4449
)
4550

51+
PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode(
52+
"missing-server-name-label",
53+
"`SERVER_NAME_LABEL` required in metric",
54+
category="per-homeserver-tenant-metrics",
55+
)
56+
4657

4758
class SynapsePlugin(Plugin):
59+
def get_function_signature_hook(
60+
self, fullname: str
61+
) -> Optional[Callable[[FunctionSigContext], FunctionLike]]:
62+
if fullname in (
63+
"prometheus_client.metrics.Counter",
64+
# TODO: Add other prometheus_client metrics that need checking as we
65+
# refactor, see https://github.com/element-hq/synapse/issues/18592
66+
):
67+
return check_prometheus_metric_instantiation
68+
69+
return None
70+
4871
def get_method_signature_hook(
4972
self, fullname: str
5073
) -> Optional[Callable[[MethodSigContext], CallableType]]:
@@ -65,6 +88,85 @@ def get_method_signature_hook(
6588
return None
6689

6790

91+
def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType:
92+
"""
93+
Ensure that the `prometheus_client` metrics include the `SERVER_NAME_LABEL` label
94+
when instantiated.
95+
96+
This is important because we support multiple Synapse instances running in the same
97+
process, where all metrics share a single global `REGISTRY`. The `server_name` label
98+
ensures metrics are correctly separated by homeserver.
99+
100+
There are also some metrics that apply at the process level, such as CPU usage,
101+
Python garbage collection, Twisted reactor tick time which shouldn't have the
102+
`SERVER_NAME_LABEL`. In those cases, use use a type ignore comment to disable the
103+
check, e.g. `# type: ignore[missing-server-name-label]`.
104+
"""
105+
# The true signature, this isn't being modified so this is what will be returned.
106+
signature: CallableType = ctx.default_signature
107+
108+
# Sanity check the arguments are still as expected in this version of
109+
# `prometheus_client`. ex. `Counter(name, documentation, labelnames, ...)`
110+
#
111+
# `signature.arg_names` should be: ["name", "documentation", "labelnames", ...]
112+
if len(signature.arg_names) < 3 or signature.arg_names[2] != "labelnames":
113+
ctx.api.fail(
114+
f"Expected the 3rd argument of {signature.name} to be 'labelnames', but got "
115+
f"{signature.arg_names[2]}",
116+
ctx.context,
117+
)
118+
return signature
119+
120+
# Ensure mypy is passing the correct number of arguments because we are doing some
121+
# dirty indexing into `ctx.args` later on.
122+
assert len(ctx.args) == len(signature.arg_names), (
123+
f"Expected the list of arguments in the {signature.name} signature ({len(signature.arg_names)})"
124+
f"to match the number of arguments from the function signature context ({len(ctx.args)})"
125+
)
126+
127+
# Check if the `labelnames` argument includes `SERVER_NAME_LABEL`
128+
#
129+
# `ctx.args` should look like this:
130+
# ```
131+
# [
132+
# [StrExpr("name")],
133+
# [StrExpr("documentation")],
134+
# [ListExpr([StrExpr("label1"), StrExpr("label2")])]
135+
# ...
136+
# ]
137+
# ```
138+
labelnames_arg_expression = ctx.args[2][0] if len(ctx.args[2]) > 0 else None
139+
if isinstance(labelnames_arg_expression, ListExpr):
140+
# Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`).
141+
for labelname_expression in labelnames_arg_expression.items:
142+
if (
143+
isinstance(labelname_expression, NameExpr)
144+
and labelname_expression.fullname == "synapse.metrics.SERVER_NAME_LABEL"
145+
):
146+
# Found the `SERVER_NAME_LABEL`, all good!
147+
break
148+
else:
149+
ctx.api.fail(
150+
f"Expected {signature.name} to include `SERVER_NAME_LABEL` in the list of labels. "
151+
"If this is a process-level metric (vs homeserver-level), use a type ignore comment "
152+
"to disable this check.",
153+
ctx.context,
154+
code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
155+
)
156+
else:
157+
ctx.api.fail(
158+
f"Expected the `labelnames` argument of {signature.name} to be a list of label names "
159+
f"(including `SERVER_NAME_LABEL`), but got {labelnames_arg_expression}. "
160+
"If this is a process-level metric (vs homeserver-level), use a type ignore comment "
161+
"to disable this check.",
162+
ctx.context,
163+
code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
164+
)
165+
return signature
166+
167+
return signature
168+
169+
68170
def _get_true_return_type(signature: CallableType) -> mypy.types.Type:
69171
"""
70172
Get the "final" return type of a callable which might return an Awaitable/Deferred.

synapse/appservice/api.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
from synapse.events.utils import SerializeEventConfig, serialize_event
4949
from synapse.http.client import SimpleHttpClient, is_unknown_endpoint
5050
from synapse.logging import opentracing
51+
from synapse.metrics import SERVER_NAME_LABEL
5152
from synapse.types import DeviceListUpdates, JsonDict, JsonMapping, ThirdPartyInstanceID
5253
from synapse.util.caches.response_cache import ResponseCache
5354

@@ -59,29 +60,31 @@
5960
sent_transactions_counter = Counter(
6061
"synapse_appservice_api_sent_transactions",
6162
"Number of /transactions/ requests sent",
62-
["service"],
63+
labelnames=["service", SERVER_NAME_LABEL],
6364
)
6465

6566
failed_transactions_counter = Counter(
6667
"synapse_appservice_api_failed_transactions",
6768
"Number of /transactions/ requests that failed to send",
68-
["service"],
69+
labelnames=["service", SERVER_NAME_LABEL],
6970
)
7071

7172
sent_events_counter = Counter(
72-
"synapse_appservice_api_sent_events", "Number of events sent to the AS", ["service"]
73+
"synapse_appservice_api_sent_events",
74+
"Number of events sent to the AS",
75+
labelnames=["service", SERVER_NAME_LABEL],
7376
)
7477

7578
sent_ephemeral_counter = Counter(
7679
"synapse_appservice_api_sent_ephemeral",
7780
"Number of ephemeral events sent to the AS",
78-
["service"],
81+
labelnames=["service", SERVER_NAME_LABEL],
7982
)
8083

8184
sent_todevice_counter = Counter(
8285
"synapse_appservice_api_sent_todevice",
8386
"Number of todevice messages sent to the AS",
84-
["service"],
87+
labelnames=["service", SERVER_NAME_LABEL],
8588
)
8689

8790
HOUR_IN_MS = 60 * 60 * 1000
@@ -382,6 +385,7 @@ async def push_bulk(
382385
"left": list(device_list_summary.left),
383386
}
384387

388+
labels = {"service": service.id, SERVER_NAME_LABEL: self.server_name}
385389
try:
386390
args = None
387391
if self.config.use_appservice_legacy_authorization:
@@ -399,10 +403,10 @@ async def push_bulk(
399403
service.url,
400404
[event.get("event_id") for event in events],
401405
)
402-
sent_transactions_counter.labels(service.id).inc()
403-
sent_events_counter.labels(service.id).inc(len(serialized_events))
404-
sent_ephemeral_counter.labels(service.id).inc(len(ephemeral))
405-
sent_todevice_counter.labels(service.id).inc(len(to_device_messages))
406+
sent_transactions_counter.labels(**labels).inc()
407+
sent_events_counter.labels(**labels).inc(len(serialized_events))
408+
sent_ephemeral_counter.labels(**labels).inc(len(ephemeral))
409+
sent_todevice_counter.labels(**labels).inc(len(to_device_messages))
406410
return True
407411
except CodeMessageException as e:
408412
logger.warning(
@@ -421,7 +425,7 @@ async def push_bulk(
421425
ex.args,
422426
exc_info=logger.isEnabledFor(logging.DEBUG),
423427
)
424-
failed_transactions_counter.labels(service.id).inc()
428+
failed_transactions_counter.labels(**labels).inc()
425429
return False
426430

427431
async def claim_client_keys(

synapse/federation/federation_client.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
from synapse.http.client import is_unknown_endpoint
7575
from synapse.http.types import QueryParams
7676
from synapse.logging.opentracing import SynapseTags, log_kv, set_tag, tag_args, trace
77+
from synapse.metrics import SERVER_NAME_LABEL
7778
from synapse.types import JsonDict, StrCollection, UserID, get_domain_from_id
7879
from synapse.types.handlers.policy_server import RECOMMENDATION_OK, RECOMMENDATION_SPAM
7980
from synapse.util.async_helpers import concurrently_execute
@@ -85,7 +86,9 @@
8586

8687
logger = logging.getLogger(__name__)
8788

88-
sent_queries_counter = Counter("synapse_federation_client_sent_queries", "", ["type"])
89+
sent_queries_counter = Counter(
90+
"synapse_federation_client_sent_queries", "", labelnames=["type", SERVER_NAME_LABEL]
91+
)
8992

9093

9194
PDU_RETRY_TIME_MS = 1 * 60 * 1000
@@ -209,7 +212,10 @@ async def make_query(
209212
Returns:
210213
The JSON object from the response
211214
"""
212-
sent_queries_counter.labels(query_type).inc()
215+
sent_queries_counter.labels(
216+
type=query_type,
217+
**{SERVER_NAME_LABEL: self.server_name},
218+
).inc()
213219

214220
return await self.transport_layer.make_query(
215221
destination,
@@ -231,7 +237,10 @@ async def query_client_keys(
231237
Returns:
232238
The JSON object from the response
233239
"""
234-
sent_queries_counter.labels("client_device_keys").inc()
240+
sent_queries_counter.labels(
241+
type="client_device_keys",
242+
**{SERVER_NAME_LABEL: self.server_name},
243+
).inc()
235244
return await self.transport_layer.query_client_keys(
236245
destination, content, timeout
237246
)
@@ -242,7 +251,10 @@ async def query_user_devices(
242251
"""Query the device keys for a list of user ids hosted on a remote
243252
server.
244253
"""
245-
sent_queries_counter.labels("user_devices").inc()
254+
sent_queries_counter.labels(
255+
type="user_devices",
256+
**{SERVER_NAME_LABEL: self.server_name},
257+
).inc()
246258
return await self.transport_layer.query_user_devices(
247259
destination, user_id, timeout
248260
)
@@ -264,7 +276,10 @@ async def claim_client_keys(
264276
Returns:
265277
The JSON object from the response
266278
"""
267-
sent_queries_counter.labels("client_one_time_keys").inc()
279+
sent_queries_counter.labels(
280+
type="client_one_time_keys",
281+
**{SERVER_NAME_LABEL: self.server_name},
282+
).inc()
268283

269284
# Convert the query with counts into a stable and unstable query and check
270285
# if attempting to claim more than 1 OTK.

synapse/federation/federation_server.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
tag_args,
8383
trace,
8484
)
85+
from synapse.metrics import SERVER_NAME_LABEL
8586
from synapse.metrics.background_process_metrics import wrap_as_background_process
8687
from synapse.replication.http.federation import (
8788
ReplicationFederationSendEduRestServlet,
@@ -104,12 +105,18 @@
104105

105106
logger = logging.getLogger(__name__)
106107

107-
received_pdus_counter = Counter("synapse_federation_server_received_pdus", "")
108+
received_pdus_counter = Counter(
109+
"synapse_federation_server_received_pdus", "", labelnames=[SERVER_NAME_LABEL]
110+
)
108111

109-
received_edus_counter = Counter("synapse_federation_server_received_edus", "")
112+
received_edus_counter = Counter(
113+
"synapse_federation_server_received_edus", "", labelnames=[SERVER_NAME_LABEL]
114+
)
110115

111116
received_queries_counter = Counter(
112-
"synapse_federation_server_received_queries", "", ["type"]
117+
"synapse_federation_server_received_queries",
118+
"",
119+
labelnames=["type", SERVER_NAME_LABEL],
113120
)
114121

115122
pdu_process_time = Histogram(
@@ -434,7 +441,9 @@ async def _handle_pdus_in_txn(
434441
report back to the sending server.
435442
"""
436443

437-
received_pdus_counter.inc(len(transaction.pdus))
444+
received_pdus_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc(
445+
len(transaction.pdus)
446+
)
438447

439448
origin_host, _ = parse_server_name(origin)
440449

@@ -553,7 +562,7 @@ async def _handle_edus_in_txn(self, origin: str, transaction: Transaction) -> No
553562
"""Process the EDUs in a received transaction."""
554563

555564
async def _process_edu(edu_dict: JsonDict) -> None:
556-
received_edus_counter.inc()
565+
received_edus_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc()
557566

558567
edu = Edu(
559568
origin=origin,
@@ -668,7 +677,10 @@ async def on_pdu_request(
668677
async def on_query_request(
669678
self, query_type: str, args: Dict[str, str]
670679
) -> Tuple[int, Dict[str, Any]]:
671-
received_queries_counter.labels(query_type).inc()
680+
received_queries_counter.labels(
681+
type=query_type,
682+
**{SERVER_NAME_LABEL: self.server_name},
683+
).inc()
672684
resp = await self.registry.on_query(query_type, args)
673685
return 200, resp
674686

synapse/federation/sender/__init__.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@
160160
from synapse.federation.units import Edu
161161
from synapse.logging.context import make_deferred_yieldable, run_in_background
162162
from synapse.metrics import (
163+
SERVER_NAME_LABEL,
163164
LaterGauge,
164165
event_processing_loop_counter,
165166
event_processing_loop_room_count,
@@ -189,11 +190,13 @@
189190
sent_pdus_destination_dist_count = Counter(
190191
"synapse_federation_client_sent_pdu_destinations_count",
191192
"Number of PDUs queued for sending to one or more destinations",
193+
labelnames=[SERVER_NAME_LABEL],
192194
)
193195

194196
sent_pdus_destination_dist_total = Counter(
195197
"synapse_federation_client_sent_pdu_destinations",
196198
"Total number of PDUs queued for sending across all destinations",
199+
labelnames=[SERVER_NAME_LABEL],
197200
)
198201

199202
# Time (in s) to wait before trying to wake up destinations that have
@@ -708,13 +711,19 @@ async def handle_room_events(events: List[EventBase]) -> None:
708711
"federation_sender"
709712
).set(ts)
710713

711-
events_processed_counter.inc(len(event_entries))
714+
events_processed_counter.labels(
715+
**{SERVER_NAME_LABEL: self.server_name}
716+
).inc(len(event_entries))
712717

713-
event_processing_loop_room_count.labels("federation_sender").inc(
714-
len(events_by_room)
715-
)
718+
event_processing_loop_room_count.labels(
719+
name="federation_sender",
720+
**{SERVER_NAME_LABEL: self.server_name},
721+
).inc(len(events_by_room))
716722

717-
event_processing_loop_counter.labels("federation_sender").inc()
723+
event_processing_loop_counter.labels(
724+
name="federation_sender",
725+
**{SERVER_NAME_LABEL: self.server_name},
726+
).inc()
718727

719728
synapse.metrics.event_processing_positions.labels(
720729
"federation_sender"
@@ -735,8 +744,12 @@ async def _send_pdu(self, pdu: EventBase, destinations: Iterable[str]) -> None:
735744
if not destinations:
736745
return
737746

738-
sent_pdus_destination_dist_total.inc(len(destinations))
739-
sent_pdus_destination_dist_count.inc()
747+
sent_pdus_destination_dist_total.labels(
748+
**{SERVER_NAME_LABEL: self.server_name}
749+
).inc(len(destinations))
750+
sent_pdus_destination_dist_count.labels(
751+
**{SERVER_NAME_LABEL: self.server_name}
752+
).inc()
740753

741754
assert pdu.internal_metadata.stream_ordering
742755

0 commit comments

Comments
 (0)