Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.
Closed
1 change: 1 addition & 0 deletions changelog.d/14531.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Remove database differentiation (SQLite vs Postgres specific code) in `_get_state_groups_from_groups_txn`.
128 changes: 64 additions & 64 deletions synapse/storage/databases/state/bg_updates.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
LoggingDatabaseConnection,
LoggingTransaction,
)
from synapse.storage.engines import PostgresEngine
from synapse.storage.engines import PostgresEngine, Sqlite3Engine
from synapse.types import MutableStateMap, StateMap
from synapse.types.state import StateFilter
from synapse.util.caches import intern_string
Expand Down Expand Up @@ -103,11 +103,9 @@ def _get_state_groups_from_groups_txn(
# against `state_groups_state` to fetch the latest state.
# It assumes that previous state groups are always numerically
# lesser.
# This may return multiple rows per (type, state_key), but last_value
# should be the same.
sql = """
WITH RECURSIVE sgs(state_group) AS (
VALUES(?::bigint)
VALUES(CAST(? AS bigint))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only change to this code here is changing the cast to be compatible between SQLite and Postgres.

SQLite chokes on the previous ?::bigint syntax

UNION ALL
SELECT prev_state_group FROM state_group_edges e, sgs s
WHERE s.state_group = e.state_group
Expand All @@ -125,8 +123,7 @@ def _get_state_groups_from_groups_txn(
not state_filter.include_others and not state_filter.is_full()
)
state_filter_condition_combos: List[Tuple[str, Optional[str]]] = []
# We don't need to caclculate this list if we're not using the condition
# optimization
# We only need to caclculate this list if we're using the condition optimization
if use_condition_optimization:
for etype, state_keys in state_filter.types.items():
if state_keys is None:
Expand All @@ -150,20 +147,56 @@ def _get_state_groups_from_groups_txn(
where_clause = "(type = ? AND state_key = ?)"
overall_select_query_args.extend([etype, skey])

# Small helper function to wrap the union clause in parenthesis if we're
# using postgres. This is because SQLite doesn't allow `LIMIT`/`ORDER`
# clauses in the union subquery but postgres does as long as they are
# wrapped in parenthesis which this function handles the complexity of
# handling.
def wrap_union_if_postgres(
union_clause: str, extra_order_or_limit_clause: str = ""
) -> str:
if isinstance(self.database_engine, PostgresEngine):
return f"""({union_clause} {extra_order_or_limit_clause})"""

return union_clause
Comment on lines +150 to +161
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be better just to define this function somewhere above once but thought it lost the specific usage context when we put it somewhere else. Feel free to poke ⏩


# We could use `SELECT DISTINCT ON` here to align with the query below
# but that isn't compatible with SQLite and we can get away with `LIMIT
# 1` here instead because the `WHERE` clause will only ever match and
# target one event; and is simpler anyway. And it's better to use
# something that's simpler and compatible with both Database engines.
select_clause_list.append(
wrap_union_if_postgres(
# We only select `state_group` here for use in the `ORDER`
# clause later after the `UNION`
f"""
(
SELECT DISTINCT ON (type, state_key)
type, state_key, event_id
SELECT type, state_key, event_id, state_group
FROM state_groups_state
INNER JOIN sgs USING (state_group)
WHERE {where_clause}
""",
# The `ORDER BY`/`LIMIT` is an extra nicety that saves us from
# having to ferry a bunch of duplicate state pairs back from the
# database since we only need the one with the greatest
# state_group (most recent). Since this only applies to
# postgres, we do have to be careful to take care of the
# duplicate pairs in the downstream code when running with
# SQLite.
"""
ORDER BY type, state_key, state_group DESC
LIMIT 1
""",
)
"""
)

overall_select_clause = " UNION ".join(select_clause_list)

if isinstance(self.database_engine, Sqlite3Engine):
# We `ORDER` after the union results because it's compatible with both
# Postgres and SQLite. And we need the rows to by ordered by
# `state_group` so the greatest state_group pairs are first and we only
# care about the first distinct (type, state_key) pair later on.
overall_select_clause += " ORDER BY type, state_key, state_group DESC"
else:
Copy link
Contributor Author

@MadLittleMods MadLittleMods Nov 23, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The diff is a bit weird to follow but we're just removing the whole SQLite specific else and unindenting the Postgres one

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where_clause, where_args = state_filter.make_sql_filter_clause()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this worth resurrecting?

@DMRobertson It seems possible to accomplish but there are a few more hurdles. If you're keen to take it over, feel free. I won't be looking at it until I get more backend facing again.

# Unless the filter clause is empty, we're going to append it after an
Expand All @@ -173,6 +206,7 @@ def _get_state_groups_from_groups_txn(

overall_select_query_args.extend(where_args)

if isinstance(self.database_engine, PostgresEngine):
overall_select_clause = f"""
SELECT DISTINCT ON (type, state_key)
type, state_key, event_id
Expand All @@ -182,69 +216,35 @@ def _get_state_groups_from_groups_txn(
) {where_clause}
ORDER BY type, state_key, state_group DESC
"""
else:
# SQLite doesn't support `SELECT DISTINCT ON`, so we have to just get
# some potential duplicate (type, state_key) pairs and then only use the
# first of each kind we see.
overall_select_clause = f"""
SELECT type, state_key, event_id
FROM state_groups_state
WHERE state_group IN (
SELECT state_group FROM sgs
) {where_clause}
ORDER BY type, state_key, state_group DESC
"""
Comment on lines +220 to +230
Copy link
Contributor Author

@MadLittleMods MadLittleMods May 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This query is exactly the same as the Postgres version above except for the DISTINCT ON (type, state_key) part at the beginning.

Any interest in sharing the bulk of the query here? Not sure it would be more clear


for group in groups:
args: List[Union[int, str]] = [group]
args.extend(overall_select_query_args)

txn.execute(sql % (overall_select_clause,), args)
for row in txn:
typ, state_key, event_id = row
# The `*_` rest syntax is to ignore the `state_group` column which we
# only select in the optimized case
typ, state_key, event_id, *_ = row
key = (intern_string(typ), intern_string(state_key))
# Deal with the potential duplicate (type, state_key) pairs from the
# SQLite specific query above. We only want to use the first row which
# is from the greatest state group (most-recent) because that is that
# applicable state in that state group.
if key not in results[group]:
results[group][key] = event_id
else:
max_entries_returned = state_filter.max_entries_returned()

where_clause, where_args = state_filter.make_sql_filter_clause()
# Unless the filter clause is empty, we're going to append it after an
# existing where clause
if where_clause:
where_clause = " AND (%s)" % (where_clause,)

# We don't use WITH RECURSIVE on sqlite3 as there are distributions
# that ship with an sqlite3 version that doesn't support it (e.g. wheezy)
for group in groups:
next_group: Optional[int] = group

while next_group:
# We did this before by getting the list of group ids, and
# then passing that list to sqlite to get latest event for
# each (type, state_key). However, that was terribly slow
# without the right indices (which we can't add until
# after we finish deduping state, which requires this func)
args = [next_group]
args.extend(where_args)

txn.execute(
"SELECT type, state_key, event_id FROM state_groups_state"
" WHERE state_group = ? " + where_clause,
args,
)
results[group].update(
((typ, state_key), event_id)
for typ, state_key, event_id in txn
if (typ, state_key) not in results[group]
)

# If the number of entries in the (type,state_key)->event_id dict
# matches the number of (type,state_keys) types we were searching
# for, then we must have found them all, so no need to go walk
# further down the tree... UNLESS our types filter contained
# wildcards (i.e. Nones) in which case we have to do an exhaustive
# search
if (
max_entries_returned is not None
and len(results[group]) == max_entries_returned
):
break
Comment on lines -229 to -239
Copy link
Contributor Author

@MadLittleMods MadLittleMods May 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the current consolidation, we're losing this shortcut when running with SQLite (and the shortcut is probably pretty useful) 😬

And as mentioned in another comment, also conscious that some of these subtle hacks to consolidate might not be better than the big if-else distinction between Postgres and SQLite we had before 😬

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Closing as I'm not convinced this PR is better now that we've reached this point where things work. It was looking very promising before these latest changes necessitated by SQLite not supporting SELECT DISTINCT ON syntax.

It is less code especially considering comments added but wouldn't consider it easier to reason about and we lose the SQLite optimization mentioned above.


next_group = self.db_pool.simple_select_one_onecol_txn(
txn,
table="state_group_edges",
keyvalues={"state_group": next_group},
retcol="prev_state_group",
allow_none=True,
)

# The results shouldn't be considered mutable.
return results
Expand Down