Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/18932.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Disconnect background process work from request trace.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Recorded the flaky tests on #18537

❌ TestThreadReceiptsInSyncMSC4102 ->
https://github.com/element-hq/synapse/actions/runs/18021157721/job/51278706782

1 change: 0 additions & 1 deletion synapse/events/auto_accept_invites.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ async def on_new_event(self, event: EventBase, *args: Any) -> None:
event.state_key,
event.room_id,
"join",
bg_start_span=False,
)

if is_direct_message:
Expand Down
11 changes: 11 additions & 0 deletions synapse/logging/opentracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,9 @@ def start_active_span_follows_from(
operation_name: str,
contexts: Collection,
child_of: Optional[Union["opentracing.Span", "opentracing.SpanContext"]] = None,
tags: Optional[Dict[str, str]] = None,
start_time: Optional[float] = None,
ignore_active_span: bool = False,
*,
inherit_force_tracing: bool = False,
tracer: Optional["opentracing.Tracer"] = None,
Expand All @@ -591,9 +593,16 @@ def start_active_span_follows_from(
span will be the parent. (If there is no currently active span, the first
span in `contexts` will be the parent.)

tags: an optional dictionary of span tags. The caller gives up ownership of that
dictionary, because the :class:`Tracer` may use it as-is to avoid extra data
copying.

start_time: optional override for the start time of the created span. Seconds
since the epoch.

ignore_active_span: an explicit flag that ignores the current active
scope and creates a root span.

inherit_force_tracing: if set, and any of the previous contexts have had tracing
forced, the new span will also have tracing forced.
tracer: override the opentracing tracer. By default the global tracer is used.
Expand All @@ -606,7 +615,9 @@ def start_active_span_follows_from(
operation_name,
child_of=child_of,
references=references,
tags=tags,
start_time=start_time,
ignore_active_span=ignore_active_span,
tracer=tracer,
)

Expand Down
105 changes: 97 additions & 8 deletions synapse/metrics/background_process_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,17 @@

import logging
import threading
from contextlib import nullcontext
from contextlib import contextmanager, nullcontext
from functools import wraps
from types import TracebackType
from typing import (
TYPE_CHECKING,
Any,
Awaitable,
Callable,
ContextManager,
Dict,
Generator,
Iterable,
Optional,
Protocol,
Expand All @@ -49,7 +51,12 @@
LoggingContext,
PreserveLoggingContext,
)
from synapse.logging.opentracing import SynapseTags, start_active_span
from synapse.logging.opentracing import (
SynapseTags,
active_span,
start_active_span,
start_active_span_follows_from,
)
from synapse.metrics import SERVER_NAME_LABEL
from synapse.metrics._types import Collector

Expand Down Expand Up @@ -264,15 +271,97 @@ async def run() -> Optional[R]:

with BackgroundProcessLoggingContext(
name=desc, server_name=server_name, instance_id=count
) as context:
) as logging_context:
try:
if bg_start_span:
ctx = start_active_span(
f"bgproc.{desc}", tags={SynapseTags.REQUEST_ID: str(context)}
)
original_active_tracing_span = active_span()

# If there is already an active span (e.g. because this background
# process was started as part of handling a request for example),
# because this is a long-running background task that may serve a
# broader purpose than the request that kicked it off, we don't want
# it to be a direct child of the currently active trace connected to
# the request. We only want a loose reference to jump between the
# traces.
#
# For example, when making a `/messages` request, when approaching a
# gap, we may kick off a background process to fetch missing events
# from federation. The `/messages` request trace should't include
# the entire time taken and details around fetching the missing
# events since the request doesn't rely on the result, it was just
# part of the heuristic to initiate things.
#
# We don't care about the value from the context manager as it's not
# used (so we just use `Any` for the type). Ideally, we'd be able to
# mark this as unused like an `assert_never` of sorts.
tracing_scope: ContextManager[Any]
if original_active_tracing_span is not None:
# With the OpenTracing client that we're using, it's impossible to
# create a disconnected root span while also providing `references`
# so we first create a bare root span, then create a child span that
# includes the references that we want.
root_tracing_scope = start_active_span(
f"bgproc.{desc}",
tags={SynapseTags.REQUEST_ID: str(logging_context)},
# Create a root span for the background process (disconnected
# from other spans)
ignore_active_span=True,
)

# Also add a span in the original request trace that cross-links
# to background process trace. We immediately finish the span as
# this is just a marker to follow where the real work is being
# done.
#
# In OpenTracing, `FOLLOWS_FROM` indicates parent-child
# relationship whereas we just want a cross-link to the
# downstream trace. This is a bit hacky, but the closest we
# can get to in OpenTracing land. If we ever migrate to
# OpenTelemetry, we should use a normal `Link` for this.
with start_active_span_follows_from(
f"start_bgproc.{desc}",
child_of=original_active_tracing_span,
ignore_active_span=True,
# Points to the background process span.
contexts=[root_tracing_scope.span.context],
):
pass

# Then start the tracing scope that we're going to use for
# the duration of the background process within the root
# span we just created.
child_tracing_scope = start_active_span_follows_from(
f"bgproc_child.{desc}",
child_of=root_tracing_scope.span,
ignore_active_span=True,
tags={SynapseTags.REQUEST_ID: str(logging_context)},
# Create the `FOLLOWS_FROM` reference to the request's
# span so there is a loose coupling between the two
# traces and it's easy to jump between.
contexts=[original_active_tracing_span.context],
)

# For easy usage down below, we create a context manager that
# combines both scopes.
@contextmanager
def combined_context_manager() -> Generator[None, None, None]:
with root_tracing_scope, child_tracing_scope:
yield

tracing_scope = combined_context_manager()

else:
# Otherwise, when there is no active span, we will be creating
# a disconnected root span already and we don't have to
# worry about cross-linking to anything.
tracing_scope = start_active_span(
f"bgproc.{desc}",
tags={SynapseTags.REQUEST_ID: str(logging_context)},
)
else:
ctx = nullcontext() # type: ignore[assignment]
with ctx:
tracing_scope = nullcontext()

with tracing_scope:
return await func(*args, **kwargs)
except Exception:
logger.exception(
Expand Down
7 changes: 7 additions & 0 deletions synapse/replication/tcp/redis.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,13 @@ def send_command(self, cmd: Command) -> None:
self.server_name,
self._async_send_command,
cmd,
# We originally started tracing background processes to avoid `There was no
# active span` errors but this change meant we started generating 15x the
# number of spans than before (this is one of the most heavily called
# instances of `run_as_background_process`).
#
# Since we don't log or tag a tracing span in the downstream
# code, we can safely disable this.
bg_start_span=False,
)

Expand Down
Loading