From abf658c712f932563cd4f667464dccb27d6f81cc Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Thu, 18 Sep 2025 14:33:11 +0100 Subject: [PATCH 01/34] WIP MSC4354; stub out soft-failure code for now --- .../conf/workers-shared-extra.yaml.j2 | 2 + synapse/config/experimental.py | 3 + synapse/federation/federation_base.py | 2 + synapse/rest/client/room.py | 22 ++ synapse/storage/databases/main/events.py | 227 ++++++++++++++++++ 5 files changed, 256 insertions(+) diff --git a/docker/complement/conf/workers-shared-extra.yaml.j2 b/docker/complement/conf/workers-shared-extra.yaml.j2 index 94e74df9d11..cb963c04ec5 100644 --- a/docker/complement/conf/workers-shared-extra.yaml.j2 +++ b/docker/complement/conf/workers-shared-extra.yaml.j2 @@ -135,6 +135,8 @@ experimental_features: msc4155_enabled: true # Thread Subscriptions msc4306_enabled: true + # Sticky Events + msc4354_enabled: true server_notices: system_mxid_localpart: _server diff --git a/synapse/config/experimental.py b/synapse/config/experimental.py index d086deab3f4..2cff80c753d 100644 --- a/synapse/config/experimental.py +++ b/synapse/config/experimental.py @@ -592,3 +592,6 @@ def read_config( # MSC4306: Thread Subscriptions # (and MSC4308: Thread Subscriptions extension to Sliding Sync) self.msc4306_enabled: bool = experimental.get("msc4306_enabled", False) + + # MSC4354: Sticky Events + self.msc4354_enabled: bool = experimental.get("msc4354_enabled", False) diff --git a/synapse/federation/federation_base.py b/synapse/federation/federation_base.py index a1c9c286ac7..c71def8c765 100644 --- a/synapse/federation/federation_base.py +++ b/synapse/federation/federation_base.py @@ -195,6 +195,8 @@ async def _check_sigs_and_hash( # using the event in prev_events). redacted_event = prune_event(pdu) redacted_event.internal_metadata.soft_failed = True + # Mark this as spam so we don't re-evaluate soft-failure status. 
+ pdu.internal_metadata.policy_server_spammy = True return redacted_event return pdu diff --git a/synapse/rest/client/room.py b/synapse/rest/client/room.py index 64deae76507..01790bc1e43 100644 --- a/synapse/rest/client/room.py +++ b/synapse/rest/client/room.py @@ -82,6 +82,9 @@ logger = logging.getLogger(__name__) +MSC4354_STICKY_DURATION_QUERY_PARAM = "msc4354_stick_duration_ms" +MSC4354_STICKY_EVENT_KEY = "msc4354_sticky" + class _RoomSize(Enum): """ @@ -206,6 +209,7 @@ def __init__(self, hs: "HomeServer"): self.clock = hs.get_clock() self._max_event_delay_ms = hs.config.server.max_event_delay_ms self._spam_checker_module_callbacks = hs.get_module_api_callbacks().spam_checker + self.msc4354_enabled = hs.config.experimental.msc4354_enabled def register(self, http_server: HttpServer) -> None: # /rooms/$roomid/state/$eventtype @@ -364,6 +368,14 @@ async def on_PUT( "room_id": room_id, "sender": requester.user.to_string(), } + if self.msc4354_enabled: + sticky_duration_ms = parse_integer( + request, MSC4354_STICKY_DURATION_QUERY_PARAM + ) + if sticky_duration_ms is not None: + event_dict[MSC4354_STICKY_EVENT_KEY] = { + "duration_ms": sticky_duration_ms, + } if state_key is not None: event_dict["state_key"] = state_key @@ -396,6 +408,7 @@ def __init__(self, hs: "HomeServer"): self.delayed_events_handler = hs.get_delayed_events_handler() self.auth = hs.get_auth() self._max_event_delay_ms = hs.config.server.max_event_delay_ms + self.msc4354_enabled = hs.config.experimental.msc4354_enabled def register(self, http_server: HttpServer) -> None: # /rooms/$roomid/send/$event_type[/$txn_id] @@ -442,6 +455,15 @@ async def _do( if origin_server_ts is not None: event_dict["origin_server_ts"] = origin_server_ts + if self.msc4354_enabled: + sticky_duration_ms = parse_integer( + request, MSC4354_STICKY_DURATION_QUERY_PARAM + ) + if sticky_duration_ms is not None: + event_dict[MSC4354_STICKY_EVENT_KEY] = { + "duration_ms": sticky_duration_ms, + } + try: ( event, diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index a50e889b9df..b9542d9f220 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -22,6 +22,7 @@ import collections import itertools import logging +import time from collections import OrderedDict from typing import ( TYPE_CHECKING, @@ -1174,6 +1175,13 @@ def _persist_events_txn( self._update_sliding_sync_tables_with_new_persisted_events_txn( txn, room_id, events_and_contexts ) + # process events which are sticky as well as re-evaluate soft-failed sticky events. + self._handle_sticky_events_txn( + txn, + room_id, + events_and_contexts, + state_delta_for_room, + ) def _persist_event_auth_chain_txn( self, @@ -2921,6 +2929,225 @@ def _store_redaction(self, txn: LoggingTransaction, event: EventBase) -> None: }, ) + def _handle_sticky_events_txn( + self, + txn: LoggingTransaction, + room_id: str, + events_and_contexts: List[EventPersistencePair], + state_delta_for_room: Optional[DeltaState], + ) -> None: + """ + Update the sticky events table, used in MSC4354. + + This function assumes that `_store_event_txn()` (to persist the event) and + `_update_current_state_txn(...)` (so the current state has taken the events into account) + have already been run. + + "Handling" sticky events is broken into two phases: + - for each sticky event in events_and_contexts, mark them as sticky in the sticky events table. + - for each still-sticky soft-failed event in the room, re-evaluate soft-failedness. 
+ + Args: + txn + room_id: The room that all of the events belong to + events_and_contexts: The events being persisted. + state_delta_for_room: The changes to the current state, used to detect if we need to + re-evaluate soft-failed sticky events. + """ + if len(events_and_contexts) == 0: + return + + # TODO: finish the impl + # fetch soft failed sticky events to recheck now, before we insert new sticky events, else + # we could incorrectly re-evaluate new sticky events + # event_ids_to_check = self._get_soft_failed_sticky_events_to_recheck(txn, room_id, state_delta_for_room) + # logger.info(f"_get_soft_failed_sticky_events_to_recheck => {event_ids_to_check}") + # recheck them and update any that now pass soft-fail checks. + # self._recheck_soft_failed_events(txn, room_id, event_ids_to_check) + + # insert brand new sticky events. + self._insert_sticky_events_txn(txn, events_and_contexts) + + def _insert_sticky_events_txn( + self, + txn: LoggingTransaction, + events_and_contexts: List[EventPersistencePair], + ) -> None: + sticky_events: List[EventBase] = [] + for ev, _ in events_and_contexts: + # MSC: Note: policy servers and other similar antispam techniques still apply to these events. + if ev.internal_metadata.policy_server_spammy: + continue + # We shouldn't be passed rejected events, but if we do, we filter them out too. + if ev.rejected_reason is not None: + continue + # MSC: The presence of sticky.duration_ms with a valid value makes the event “sticky” + sticky_obj = ev.get("sticky", None) + if type(sticky_obj) is dict: + sticky_duration_ms = sticky_obj.get("duration_ms", None) + # MSC: Valid values are the integer range 0-3600000 (1 hour). + if ( + type(sticky_duration_ms) is int + and sticky_duration_ms >= 0 + and sticky_duration_ms <= 3600000 + ): + sticky_events.append(ev) + + if len(sticky_events) == 0: + return + now_ms = round(time.time() * 1000) + self.db_pool.simple_insert_many_txn( + txn, + "sticky_events", + keys=("room_id", "event_id", "sender", "expires_at", "soft_failed"), + values=[ + ( + ev.room_id, + ev.event_id, + ev.sender, + # MSC: The start time is min(now, origin_server_ts). + # This ensures that malicious origin timestamps cannot specify start times in the future. + # Calculate the end time as start_time + min(sticky.duration_ms, 3600000). + min(ev.origin_server_ts, now_ms) + + min(ev.get_dict()["sticky"]["duration_ms"], 3600000), + ev.internal_metadata.soft_failed, + ) + for ev in sticky_events + ], + ) + + def _get_soft_failed_sticky_events_to_recheck( + self, + txn: LoggingTransaction, + room_id: str, + state_delta_for_room: Optional[DeltaState], + ) -> List[str]: + """Fetch soft-failed sticky events which should be rechecked against the current state. + + Soft-failed events are not rejected, so they pass auth at the state before + the event and at the auth_events in the event. Instead, soft-failed events failed auth at + the _current state of the room_. We only need to recheck soft failure if we have a reason to + believe the event may pass that check now. + + Note that we don't bother rechecking accepted events that may now be soft-failed, because + by that point it's too late as we've already sent the event to clients. + + Returns: + A list of event IDs to recheck + """ + + if state_delta_for_room is None: + # No change to current state => no way soft failure status could be different. + return [] + + # any change to critical auth events may change soft failure status. This means any changes + # to join rules, power levels or member events. 
If the state has changed but it isn't one + # of those events, we don't need to recheck. + critical_auth_types = ( + EventTypes.JoinRules, + EventTypes.PowerLevels, + EventTypes.Member, + ) + critical_auth_types_changed = set() + critical_auth_types_changed.update( + [ + typ + for typ, _ in state_delta_for_room.to_delete + if typ in critical_auth_types + ] + ) + critical_auth_types_changed.update( + [ + typ + for typ, _ in state_delta_for_room.to_insert + if typ in critical_auth_types + ] + ) + if len(critical_auth_types_changed) == 0: + # No change to critical auth events => no way soft failure status could be different. + return [] + + if critical_auth_types_changed == {EventTypes.Member}: + # the final case we want to catch is when unprivileged users join/leave rooms. These users cause + # changes in the critical auth types (the member event) but ultimately have no effect on soft + # failure status for anyone but that user themselves. + # + # Grab the set of senders that have been modified and see if any of them sent a soft-failed + # sticky event. If they did, then we need to re-evaluate. If they didn't, then we don't need to. + new_membership_changes = set( + [ + skey + for typ, skey in state_delta_for_room.to_insert + if typ == EventTypes.Member + ] + + [ + skey + for typ, skey in state_delta_for_room.to_delete + if typ == EventTypes.Member + ] + ) + # pull out senders of sticky events in this room + events_to_recheck: List[Tuple[str]] = self.db_pool.simple_select_many_txn( + txn, + table="sticky_events", + column="sender", + iterable=new_membership_changes, + keyvalues={ + "room_id": room_id, + "soft_failed": True, + }, + retcols=("event_id"), + ) + return [event_id for (event_id,) in events_to_recheck] + + # otherwise one of the following must be true: + # - there was a change in PL or join rules + # - there was a change in the membership of a sender of a soft-failed sticky event. + # In both of these cases we want to re-evaluate soft failure status. + # + # NB: event auth checks are NOT recursive. We don't need to specifically handle the case where + # an admin user's membership changes which causes a PL event to be allowed, as when the PL event + # gets allowed we will re-evaluate anyway. E.g: + # + # PL(send_event=0, sender=Admin) + # ^ ^_____________________ + # | | + # . PL(send_event=50, sender=Mod) sticky event (sender=User) + # + # In this scenario, the sticky event is soft-failed due to the Mod updating the PL event to + # set send_event=50, which User does not have. If we learn of an event which makes Mod's PL + # event invalid (say, Mod was banned by Admin concurrently to Mod setting the PL event), then + # the act of seeing the ban event will cause the old PL event to be in the state delta, meaning + # we will re-evaluate the sticky event due to the PL changing. We don't need to specially handle case.a + events_to_recheck = self.db_pool.simple_select_list_txn( + txn, + table="sticky_events", + keyvalues={ + "room_id": room_id, + "soft_failed": True, + }, + retcols=("event_id"), + ) + return [event_id for (event_id,) in events_to_recheck] + + def _recheck_soft_failed_events( + self, + txn: LoggingTransaction, + room_id: str, + event_ids: List[str], + ) -> None: + """ + Recheck authorised but soft-failed events. The provided event IDs must have already passed + all auth checks (so the event isn't rejected) but soft-failure checks. + + Args: + txn: The SQL transaction + room_id: The room the event IDs are in. + event_ids: The soft-failed events to re-evaluate. 
+ """ + # We know the events are otherwise authorised, so we only need to load the current state + # and check if the events pass auth at the current state. + def insert_labels_for_event_txn( self, txn: LoggingTransaction, From 869953456a50310eff7106a9b762cb7e32fcbcf3 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Thu, 18 Sep 2025 16:45:10 +0100 Subject: [PATCH 02/34] Persist sticky events in sticky_events table This only works on postgres for now --- synapse/api/constants.py | 11 +++++++- synapse/events/builder.py | 7 ++++- synapse/rest/client/room.py | 15 ++++------- synapse/storage/databases/main/events.py | 16 +++++++++--- synapse/storage/schema/__init__.py | 2 +- .../schema/main/delta/93/01_sticky_events.sql | 26 +++++++++++++++++++ 6 files changed, 61 insertions(+), 16 deletions(-) create mode 100644 synapse/storage/schema/main/delta/93/01_sticky_events.sql diff --git a/synapse/api/constants.py b/synapse/api/constants.py index 7a8f546d6bf..5f2b1a4b5ac 100644 --- a/synapse/api/constants.py +++ b/synapse/api/constants.py @@ -24,7 +24,7 @@ """Contains constants from the specification.""" import enum -from typing import Final +from typing import Final, TypedDict # the max size of a (canonical-json-encoded) event MAX_PDU_SIZE = 65536 @@ -360,3 +360,12 @@ class Direction(enum.Enum): class ProfileFields: DISPLAYNAME: Final = "displayname" AVATAR_URL: Final = "avatar_url" + + +class StickyEventField(TypedDict): + duration_ms: int + + +class StickyEvent: + QUERY_PARAM_NAME: Final = "msc4354_stick_duration_ms" + FIELD_NAME: Final = "msc4354_sticky" diff --git a/synapse/events/builder.py b/synapse/events/builder.py index 5e1913d389e..a1a73de10ee 100644 --- a/synapse/events/builder.py +++ b/synapse/events/builder.py @@ -24,7 +24,7 @@ import attr from signedjson.types import SigningKey -from synapse.api.constants import MAX_DEPTH, EventTypes +from synapse.api.constants import MAX_DEPTH, EventTypes, StickyEvent, StickyEventField from synapse.api.room_versions import ( KNOWN_EVENT_FORMAT_VERSIONS, EventFormatVersions, @@ -89,6 +89,7 @@ class EventBuilder: content: JsonDict = attr.Factory(dict) unsigned: JsonDict = attr.Factory(dict) + sticky: Optional[StickyEventField] = None # These only exist on a subset of events, so they raise AttributeError if # someone tries to get them when they don't exist. 
@@ -269,6 +270,9 @@ async def build( if self._origin_server_ts is not None: event_dict["origin_server_ts"] = self._origin_server_ts + if self.sticky is not None: + event_dict[StickyEvent.FIELD_NAME] = self.sticky + return create_local_event_from_event_dict( clock=self._clock, hostname=self._hostname, @@ -318,6 +322,7 @@ def for_room_version( unsigned=key_values.get("unsigned", {}), redacts=key_values.get("redacts", None), origin_server_ts=key_values.get("origin_server_ts", None), + sticky=key_values.get(StickyEvent.FIELD_NAME, None), ) diff --git a/synapse/rest/client/room.py b/synapse/rest/client/room.py index 01790bc1e43..3e0f1b32188 100644 --- a/synapse/rest/client/room.py +++ b/synapse/rest/client/room.py @@ -33,7 +33,7 @@ from twisted.web.server import Request from synapse import event_auth -from synapse.api.constants import Direction, EventTypes, Membership +from synapse.api.constants import Direction, EventTypes, Membership, StickyEvent from synapse.api.errors import ( AuthError, Codes, @@ -82,9 +82,6 @@ logger = logging.getLogger(__name__) -MSC4354_STICKY_DURATION_QUERY_PARAM = "msc4354_stick_duration_ms" -MSC4354_STICKY_EVENT_KEY = "msc4354_sticky" - class _RoomSize(Enum): """ @@ -370,10 +367,10 @@ async def on_PUT( } if self.msc4354_enabled: sticky_duration_ms = parse_integer( - request, MSC4354_STICKY_DURATION_QUERY_PARAM + request, StickyEvent.QUERY_PARAM_NAME ) if sticky_duration_ms is not None: - event_dict[MSC4354_STICKY_EVENT_KEY] = { + event_dict[StickyEvent.FIELD_NAME] = { "duration_ms": sticky_duration_ms, } @@ -456,11 +453,9 @@ async def _do( event_dict["origin_server_ts"] = origin_server_ts if self.msc4354_enabled: - sticky_duration_ms = parse_integer( - request, MSC4354_STICKY_DURATION_QUERY_PARAM - ) + sticky_duration_ms = parse_integer(request, StickyEvent.QUERY_PARAM_NAME) if sticky_duration_ms is not None: - event_dict[MSC4354_STICKY_EVENT_KEY] = { + event_dict[StickyEvent.FIELD_NAME] = { "duration_ms": sticky_duration_ms, } diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index b9542d9f220..37742a0b1c2 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -49,6 +49,7 @@ EventTypes, Membership, RelationTypes, + StickyEvent, ) from synapse.api.errors import PartialStateConflictError from synapse.api.room_versions import RoomVersions @@ -2982,7 +2983,7 @@ def _insert_sticky_events_txn( if ev.rejected_reason is not None: continue # MSC: The presence of sticky.duration_ms with a valid value makes the event “sticky” - sticky_obj = ev.get("sticky", None) + sticky_obj = ev.get_dict().get(StickyEvent.FIELD_NAME, None) if type(sticky_obj) is dict: sticky_duration_ms = sticky_obj.get("duration_ms", None) # MSC: Valid values are the integer range 0-3600000 (1 hour). @@ -2993,8 +2994,15 @@ def _insert_sticky_events_txn( ): sticky_events.append(ev) + # TODO: filter out already expired sticky events. + if len(sticky_events) == 0: return + logger.info( + "inserting %d sticky events in room %s", + len(sticky_events), + sticky_events[0].room_id, + ) now_ms = round(time.time() * 1000) self.db_pool.simple_insert_many_txn( txn, @@ -3009,8 +3017,10 @@ def _insert_sticky_events_txn( # This ensures that malicious origin timestamps cannot specify start times in the future. # Calculate the end time as start_time + min(sticky.duration_ms, 3600000). 
min(ev.origin_server_ts, now_ms) - + min(ev.get_dict()["sticky"]["duration_ms"], 3600000), - ev.internal_metadata.soft_failed, + + min( + ev.get_dict()[StickyEvent.FIELD_NAME]["duration_ms"], 3600000 + ), + ev.internal_metadata.is_soft_failed(), ) for ev in sticky_events ], diff --git a/synapse/storage/schema/__init__.py b/synapse/storage/schema/__init__.py index 3c3b13437ef..81e231cd9ce 100644 --- a/synapse/storage/schema/__init__.py +++ b/synapse/storage/schema/__init__.py @@ -19,7 +19,7 @@ # # -SCHEMA_VERSION = 92 # remember to update the list below when updating +SCHEMA_VERSION = 93 # remember to update the list below when updating """Represents the expectations made by the codebase about the database schema This should be incremented whenever the codebase changes its requirements on the diff --git a/synapse/storage/schema/main/delta/93/01_sticky_events.sql b/synapse/storage/schema/main/delta/93/01_sticky_events.sql new file mode 100644 index 00000000000..e3288968de7 --- /dev/null +++ b/synapse/storage/schema/main/delta/93/01_sticky_events.sql @@ -0,0 +1,26 @@ +-- +-- This file is licensed under the Affero General Public License (AGPL) version 3. +-- +-- Copyright (C) 2025 New Vector, Ltd +-- +-- This program is free software: you can redistribute it and/or modify +-- it under the terms of the GNU Affero General Public License as +-- published by the Free Software Foundation, either version 3 of the +-- License, or (at your option) any later version. +-- +-- See the GNU Affero General Public License for more details: +-- . + +CREATE SEQUENCE IF NOT EXISTS sticky_events_seq; + +CREATE TABLE IF NOT EXISTS sticky_events( + id BIGINT PRIMARY KEY DEFAULT nextval('sticky_events_seq'), + room_id TEXT NOT NULL, + event_id TEXT NOT NULL, + sender TEXT NOT NULL, + expires_at BIGINT NOT NULL, + soft_failed BOOLEAN NOT NULL +); + +-- for pulling out soft failed events by room +CREATE INDEX IF NOT EXISTS sticky_events_room_idx ON sticky_events(room_id, soft_failed); From 7801e68a33b174e42ec3ad4bf56fb21330e452e0 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 19 Sep 2025 10:09:18 +0100 Subject: [PATCH 03/34] Use multi-writer streams for sticky events --- synapse/app/generic_worker.py | 2 + synapse/config/workers.py | 2 +- synapse/replication/tcp/handler.py | 7 + synapse/replication/tcp/streams/__init__.py | 3 + synapse/replication/tcp/streams/_base.py | 42 +++ synapse/storage/databases/main/__init__.py | 2 + synapse/storage/databases/main/events.py | 247 +----------- .../storage/databases/main/sticky_events.py | 351 ++++++++++++++++++ .../schema/main/delta/93/01_sticky_events.sql | 5 +- .../93/01_sticky_events_seq.sql.postgres | 19 + 10 files changed, 439 insertions(+), 241 deletions(-) create mode 100644 synapse/storage/databases/main/sticky_events.py create mode 100644 synapse/storage/schema/main/delta/93/01_sticky_events_seq.sql.postgres diff --git a/synapse/app/generic_worker.py b/synapse/app/generic_worker.py index 4f5bea6bd67..742b2af0813 100644 --- a/synapse/app/generic_worker.py +++ b/synapse/app/generic_worker.py @@ -23,6 +23,7 @@ import sys from typing import Dict, List +from synapse.storage.databases.main.sticky_events import StickyEventsWorkerStore from twisted.web.resource import Resource import synapse @@ -136,6 +137,7 @@ class GenericWorkerStore( RoomWorkerStore, DirectoryWorkerStore, ThreadSubscriptionsWorkerStore, + StickyEventsWorkerStore, PushRulesWorkerStore, ApplicationServiceTransactionWorkerStore, ApplicationServiceWorkerStore, 
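To make the validity window computed by _insert_sticky_events_txn above concrete, the rule reduces to the standalone sketch below (helper name and plain-int timestamps are illustrative, not code from the patch): the start time is min(now, origin_server_ts) so a malicious origin_server_ts cannot place the window in the future, the duration is capped at one hour, and an event whose window has already closed is not stored at all.

    # Sketch of the MSC4354 expiry rule; assumes millisecond integer timestamps.
    import time
    from typing import Optional

    MAX_STICKY_DURATION_MS = 3_600_000  # 1 hour cap from the MSC


    def sticky_expires_at(
        origin_server_ts: int, duration_ms: int, now_ms: int
    ) -> Optional[int]:
        """Return the expiry timestamp in ms, or None if the event should not be stored."""
        if not (0 <= duration_ms <= MAX_STICKY_DURATION_MS):
            return None  # invalid duration => the event simply isn't sticky
        start_ms = min(now_ms, origin_server_ts)
        expires_at = start_ms + min(duration_ms, MAX_STICKY_DURATION_MS)
        if expires_at <= now_ms:
            return None  # window already closed, don't insert it
        return expires_at


    now_ms = round(time.time() * 1000)
    # An event with an honest timestamp stays sticky for the full minute:
    print(sticky_expires_at(now_ms, 60_000, now_ms))
    # A backdated event whose window has already elapsed is dropped:
    print(sticky_expires_at(now_ms - 120_000, 60_000, now_ms))
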
diff --git a/synapse/config/workers.py b/synapse/config/workers.py index 825ba784820..05f854e640a 100644 --- a/synapse/config/workers.py +++ b/synapse/config/workers.py @@ -127,7 +127,7 @@ class WriterLocations: """Specifies the instances that write various streams. Attributes: - events: The instances that write to the event and backfill streams. + events: The instances that write to the event, backfill and sticky events streams. typing: The instances that write to the typing stream. Currently can only be a single instance. to_device: The instances that write to the to_device stream. Currently diff --git a/synapse/replication/tcp/handler.py b/synapse/replication/tcp/handler.py index dd7e38dd781..4b01eac09d9 100644 --- a/synapse/replication/tcp/handler.py +++ b/synapse/replication/tcp/handler.py @@ -74,6 +74,7 @@ ) from synapse.replication.tcp.streams._base import ( DeviceListsStream, + StickyEventsStream, ThreadSubscriptionsStream, ) @@ -224,6 +225,12 @@ def __init__(self, hs: "HomeServer"): continue + if isinstance(stream, StickyEventsStream): + if hs.get_instance_name() in hs.config.worker.writers.events: + self._streams_to_replicate.append(stream) + + continue + if isinstance(stream, DeviceListsStream): if hs.get_instance_name() in hs.config.worker.writers.device_lists: self._streams_to_replicate.append(stream) diff --git a/synapse/replication/tcp/streams/__init__.py b/synapse/replication/tcp/streams/__init__.py index 25c15e5d486..96dddcc5562 100644 --- a/synapse/replication/tcp/streams/__init__.py +++ b/synapse/replication/tcp/streams/__init__.py @@ -40,6 +40,7 @@ PushersStream, PushRulesStream, ReceiptsStream, + StickyEventsStream, Stream, ThreadSubscriptionsStream, ToDeviceStream, @@ -68,6 +69,7 @@ ToDeviceStream, FederationStream, AccountDataStream, + StickyEventsStream, ThreadSubscriptionsStream, UnPartialStatedRoomStream, UnPartialStatedEventStream, @@ -88,6 +90,7 @@ "DeviceListsStream", "ToDeviceStream", "AccountDataStream", + "StickyEventsStream", "ThreadSubscriptionsStream", "UnPartialStatedRoomStream", "UnPartialStatedEventStream", diff --git a/synapse/replication/tcp/streams/_base.py b/synapse/replication/tcp/streams/_base.py index ec7e935d6a3..bdc74e4b6a5 100644 --- a/synapse/replication/tcp/streams/_base.py +++ b/synapse/replication/tcp/streams/_base.py @@ -766,3 +766,45 @@ async def _update_function( return [], to_token, False return rows, rows[-1][0], len(updates) == limit + + +class StickyEventsStream(_StreamFromIdGen): + """A sticky event was changed.""" + + @attr.s(slots=True, auto_attribs=True) + class StickyEventsStreamRow: + """Stream to inform workers about changes to sticky events.""" + + room_id: str + event_id: str # The sticky event ID + + NAME = "sticky_events" + ROW_TYPE = StickyEventsStreamRow + + def __init__(self, hs: "HomeServer"): + self.store = hs.get_datastores().main + super().__init__( + hs.get_instance_name(), + self._update_function, + self.store._sticky_events_id_gen, + ) + + async def _update_function( + self, instance_name: str, from_token: int, to_token: int, limit: int + ) -> StreamUpdateResult: + updates = await self.store.get_updated_sticky_events( + from_id=from_token, to_id=to_token, limit=limit + ) + rows = [ + ( + stream_id, + # These are the args to `StickyEventsStreamRow` + (room_id, event_id), + ) + for stream_id, room_id, event_id in updates + ] + + if not rows: + return [], to_token, False + + return rows, rows[-1][0], len(updates) == limit diff --git a/synapse/storage/databases/main/__init__.py 
b/synapse/storage/databases/main/__init__.py index de55c452aea..afac2d0835f 100644 --- a/synapse/storage/databases/main/__init__.py +++ b/synapse/storage/databases/main/__init__.py @@ -34,6 +34,7 @@ ) from synapse.storage.databases.main.sliding_sync import SlidingSyncStore from synapse.storage.databases.main.stats import UserSortOrder +from synapse.storage.databases.main.sticky_events import StickyEventsWorkerStore from synapse.storage.databases.main.thread_subscriptions import ( ThreadSubscriptionsWorkerStore, ) @@ -144,6 +145,7 @@ class DataStore( TagsStore, AccountDataStore, ThreadSubscriptionsWorkerStore, + StickyEventsWorkerStore, PushRulesWorkerStore, StreamWorkerStore, OpenIdStore, diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index 37742a0b1c2..2dc687481cb 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -22,7 +22,6 @@ import collections import itertools import logging -import time from collections import OrderedDict from typing import ( TYPE_CHECKING, @@ -49,7 +48,6 @@ EventTypes, Membership, RelationTypes, - StickyEvent, ) from synapse.api.errors import PartialStateConflictError from synapse.api.room_versions import RoomVersions @@ -253,6 +251,7 @@ def __init__( self.database_engine = db.engine self._clock = hs.get_clock() self._instance_name = hs.get_instance_name() + self.msc4354_sticky_events = hs.config.experimental.msc4354_enabled self._ephemeral_messages_enabled = hs.config.server.enable_ephemeral_messages self.is_mine_id = hs.is_mine_id @@ -1176,13 +1175,15 @@ def _persist_events_txn( self._update_sliding_sync_tables_with_new_persisted_events_txn( txn, room_id, events_and_contexts ) - # process events which are sticky as well as re-evaluate soft-failed sticky events. - self._handle_sticky_events_txn( - txn, - room_id, - events_and_contexts, - state_delta_for_room, - ) + + if self.msc4354_sticky_events: + # process events which are sticky as well as re-evaluate soft-failed sticky events. + self.store.handle_sticky_events_txn( + txn, + room_id, + events_and_contexts, + state_delta_for_room, + ) def _persist_event_auth_chain_txn( self, @@ -2930,234 +2931,6 @@ def _store_redaction(self, txn: LoggingTransaction, event: EventBase) -> None: }, ) - def _handle_sticky_events_txn( - self, - txn: LoggingTransaction, - room_id: str, - events_and_contexts: List[EventPersistencePair], - state_delta_for_room: Optional[DeltaState], - ) -> None: - """ - Update the sticky events table, used in MSC4354. - - This function assumes that `_store_event_txn()` (to persist the event) and - `_update_current_state_txn(...)` (so the current state has taken the events into account) - have already been run. - - "Handling" sticky events is broken into two phases: - - for each sticky event in events_and_contexts, mark them as sticky in the sticky events table. - - for each still-sticky soft-failed event in the room, re-evaluate soft-failedness. - - Args: - txn - room_id: The room that all of the events belong to - events_and_contexts: The events being persisted. - state_delta_for_room: The changes to the current state, used to detect if we need to - re-evaluate soft-failed sticky events. 
- """ - if len(events_and_contexts) == 0: - return - - # TODO: finish the impl - # fetch soft failed sticky events to recheck now, before we insert new sticky events, else - # we could incorrectly re-evaluate new sticky events - # event_ids_to_check = self._get_soft_failed_sticky_events_to_recheck(txn, room_id, state_delta_for_room) - # logger.info(f"_get_soft_failed_sticky_events_to_recheck => {event_ids_to_check}") - # recheck them and update any that now pass soft-fail checks. - # self._recheck_soft_failed_events(txn, room_id, event_ids_to_check) - - # insert brand new sticky events. - self._insert_sticky_events_txn(txn, events_and_contexts) - - def _insert_sticky_events_txn( - self, - txn: LoggingTransaction, - events_and_contexts: List[EventPersistencePair], - ) -> None: - sticky_events: List[EventBase] = [] - for ev, _ in events_and_contexts: - # MSC: Note: policy servers and other similar antispam techniques still apply to these events. - if ev.internal_metadata.policy_server_spammy: - continue - # We shouldn't be passed rejected events, but if we do, we filter them out too. - if ev.rejected_reason is not None: - continue - # MSC: The presence of sticky.duration_ms with a valid value makes the event “sticky” - sticky_obj = ev.get_dict().get(StickyEvent.FIELD_NAME, None) - if type(sticky_obj) is dict: - sticky_duration_ms = sticky_obj.get("duration_ms", None) - # MSC: Valid values are the integer range 0-3600000 (1 hour). - if ( - type(sticky_duration_ms) is int - and sticky_duration_ms >= 0 - and sticky_duration_ms <= 3600000 - ): - sticky_events.append(ev) - - # TODO: filter out already expired sticky events. - - if len(sticky_events) == 0: - return - logger.info( - "inserting %d sticky events in room %s", - len(sticky_events), - sticky_events[0].room_id, - ) - now_ms = round(time.time() * 1000) - self.db_pool.simple_insert_many_txn( - txn, - "sticky_events", - keys=("room_id", "event_id", "sender", "expires_at", "soft_failed"), - values=[ - ( - ev.room_id, - ev.event_id, - ev.sender, - # MSC: The start time is min(now, origin_server_ts). - # This ensures that malicious origin timestamps cannot specify start times in the future. - # Calculate the end time as start_time + min(sticky.duration_ms, 3600000). - min(ev.origin_server_ts, now_ms) - + min( - ev.get_dict()[StickyEvent.FIELD_NAME]["duration_ms"], 3600000 - ), - ev.internal_metadata.is_soft_failed(), - ) - for ev in sticky_events - ], - ) - - def _get_soft_failed_sticky_events_to_recheck( - self, - txn: LoggingTransaction, - room_id: str, - state_delta_for_room: Optional[DeltaState], - ) -> List[str]: - """Fetch soft-failed sticky events which should be rechecked against the current state. - - Soft-failed events are not rejected, so they pass auth at the state before - the event and at the auth_events in the event. Instead, soft-failed events failed auth at - the _current state of the room_. We only need to recheck soft failure if we have a reason to - believe the event may pass that check now. - - Note that we don't bother rechecking accepted events that may now be soft-failed, because - by that point it's too late as we've already sent the event to clients. - - Returns: - A list of event IDs to recheck - """ - - if state_delta_for_room is None: - # No change to current state => no way soft failure status could be different. - return [] - - # any change to critical auth events may change soft failure status. This means any changes - # to join rules, power levels or member events. 
If the state has changed but it isn't one - # of those events, we don't need to recheck. - critical_auth_types = ( - EventTypes.JoinRules, - EventTypes.PowerLevels, - EventTypes.Member, - ) - critical_auth_types_changed = set() - critical_auth_types_changed.update( - [ - typ - for typ, _ in state_delta_for_room.to_delete - if typ in critical_auth_types - ] - ) - critical_auth_types_changed.update( - [ - typ - for typ, _ in state_delta_for_room.to_insert - if typ in critical_auth_types - ] - ) - if len(critical_auth_types_changed) == 0: - # No change to critical auth events => no way soft failure status could be different. - return [] - - if critical_auth_types_changed == {EventTypes.Member}: - # the final case we want to catch is when unprivileged users join/leave rooms. These users cause - # changes in the critical auth types (the member event) but ultimately have no effect on soft - # failure status for anyone but that user themselves. - # - # Grab the set of senders that have been modified and see if any of them sent a soft-failed - # sticky event. If they did, then we need to re-evaluate. If they didn't, then we don't need to. - new_membership_changes = set( - [ - skey - for typ, skey in state_delta_for_room.to_insert - if typ == EventTypes.Member - ] - + [ - skey - for typ, skey in state_delta_for_room.to_delete - if typ == EventTypes.Member - ] - ) - # pull out senders of sticky events in this room - events_to_recheck: List[Tuple[str]] = self.db_pool.simple_select_many_txn( - txn, - table="sticky_events", - column="sender", - iterable=new_membership_changes, - keyvalues={ - "room_id": room_id, - "soft_failed": True, - }, - retcols=("event_id"), - ) - return [event_id for (event_id,) in events_to_recheck] - - # otherwise one of the following must be true: - # - there was a change in PL or join rules - # - there was a change in the membership of a sender of a soft-failed sticky event. - # In both of these cases we want to re-evaluate soft failure status. - # - # NB: event auth checks are NOT recursive. We don't need to specifically handle the case where - # an admin user's membership changes which causes a PL event to be allowed, as when the PL event - # gets allowed we will re-evaluate anyway. E.g: - # - # PL(send_event=0, sender=Admin) - # ^ ^_____________________ - # | | - # . PL(send_event=50, sender=Mod) sticky event (sender=User) - # - # In this scenario, the sticky event is soft-failed due to the Mod updating the PL event to - # set send_event=50, which User does not have. If we learn of an event which makes Mod's PL - # event invalid (say, Mod was banned by Admin concurrently to Mod setting the PL event), then - # the act of seeing the ban event will cause the old PL event to be in the state delta, meaning - # we will re-evaluate the sticky event due to the PL changing. We don't need to specially handle case.a - events_to_recheck = self.db_pool.simple_select_list_txn( - txn, - table="sticky_events", - keyvalues={ - "room_id": room_id, - "soft_failed": True, - }, - retcols=("event_id"), - ) - return [event_id for (event_id,) in events_to_recheck] - - def _recheck_soft_failed_events( - self, - txn: LoggingTransaction, - room_id: str, - event_ids: List[str], - ) -> None: - """ - Recheck authorised but soft-failed events. The provided event IDs must have already passed - all auth checks (so the event isn't rejected) but soft-failure checks. - - Args: - txn: The SQL transaction - room_id: The room the event IDs are in. - event_ids: The soft-failed events to re-evaluate. 
- """ - # We know the events are otherwise authorised, so we only need to load the current state - # and check if the events pass auth at the current state. - def insert_labels_for_event_txn( self, txn: LoggingTransaction, diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py new file mode 100644 index 00000000000..3f08a8bde92 --- /dev/null +++ b/synapse/storage/databases/main/sticky_events.py @@ -0,0 +1,351 @@ +# +# This file is licensed under the Affero General Public License (AGPL) version 3. +# +# Copyright (C) 2025 New Vector, Ltd +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# See the GNU Affero General Public License for more details: +# . +import logging +import time +from typing import ( + TYPE_CHECKING, + Any, + Iterable, + List, + Optional, + Tuple, +) + +from synapse.api.constants import EventTypes, StickyEvent +from synapse.events import EventBase +from synapse.events.snapshot import EventPersistencePair +from synapse.replication.tcp.streams._base import StickyEventsStream +from synapse.storage.database import ( + DatabasePool, + LoggingDatabaseConnection, + LoggingTransaction, +) +from synapse.storage.databases.main.cache import CacheInvalidationWorkerStore +from synapse.storage.databases.main.events import DeltaState +from synapse.storage.util.id_generators import MultiWriterIdGenerator + +if TYPE_CHECKING: + from synapse.server import HomeServer + +logger = logging.getLogger(__name__) + + +class StickyEventsWorkerStore(CacheInvalidationWorkerStore): + def __init__( + self, + database: DatabasePool, + db_conn: LoggingDatabaseConnection, + hs: "HomeServer", + ): + super().__init__(database, db_conn, hs) + + self._can_write_to_sticky_events = ( + self._instance_name in hs.config.worker.writers.events + ) + + self._sticky_events_id_gen: MultiWriterIdGenerator = MultiWriterIdGenerator( + db_conn=db_conn, + db=database, + notifier=hs.get_replication_notifier(), + stream_name="sticky_events", + server_name=self.server_name, + instance_name=self._instance_name, + tables=[ + ("sticky_events", "instance_name", "stream_id"), + ], + sequence_name="sticky_events_sequence", + writers=hs.config.worker.writers.events, + ) + + def process_replication_rows( + self, + stream_name: str, + instance_name: str, + token: int, + rows: Iterable[Any], + ) -> None: + super().process_replication_rows(stream_name, instance_name, token, rows) + + def process_replication_position( + self, stream_name: str, instance_name: str, token: int + ) -> None: + if stream_name == StickyEventsStream.NAME: + self._sticky_events_id_gen.advance(instance_name, token) + super().process_replication_position(stream_name, instance_name, token) + + def get_max_sticky_events_stream_id(self) -> int: + """Get the current maximum stream_id for thread subscriptions. + + Returns: + The maximum stream_id + """ + return self._sticky_events_id_gen.get_current_token() + + def get_sticky_events_stream_id_generator(self) -> MultiWriterIdGenerator: + return self._sticky_events_id_gen + + async def get_updated_sticky_events( + self, from_id: int, to_id: int, limit: int + ) -> List[Tuple[int, str, str]]: + """Get updates to sticky events between two stream IDs. 
+ + Args: + from_id: The starting stream ID (exclusive) + to_id: The ending stream ID (inclusive) + limit: The maximum number of rows to return + + Returns: + list of (stream_id, room_id, event_id) tuples + """ + return [] # TODO + + def handle_sticky_events_txn( + self, + txn: LoggingTransaction, + room_id: str, + events_and_contexts: List[EventPersistencePair], + state_delta_for_room: Optional[DeltaState], + ) -> None: + """Update the sticky events table, used in MSC4354. Intended to be called within the persist + events transaction. + + This function assumes that `_store_event_txn()` (to persist the event) and + `_update_current_state_txn(...)` (so the current state has taken the events into account) + have already been run. + + "Handling" sticky events is broken into two phases: + - for each sticky event in events_and_contexts, mark them as sticky in the sticky events table. + - for each still-sticky soft-failed event in the room, re-evaluate soft-failedness. + + Args: + txn + room_id: The room that all of the events belong to + events_and_contexts: The events being persisted. + state_delta_for_room: The changes to the current state, used to detect if we need to + re-evaluate soft-failed sticky events. + """ + if len(events_and_contexts) == 0: + return + + # TODO: finish the impl + # fetch soft failed sticky events to recheck now, before we insert new sticky events, else + # we could incorrectly re-evaluate new sticky events + # event_ids_to_check = self._get_soft_failed_sticky_events_to_recheck(txn, room_id, state_delta_for_room) + # logger.info(f"_get_soft_failed_sticky_events_to_recheck => {event_ids_to_check}") + # recheck them and update any that now pass soft-fail checks. + # self._recheck_soft_failed_events(txn, room_id, event_ids_to_check) + + # insert brand new sticky events. + self._insert_sticky_events_txn(txn, events_and_contexts) + + def _insert_sticky_events_txn( + self, + txn: LoggingTransaction, + events_and_contexts: List[EventPersistencePair], + ) -> None: + now_ms = round(time.time() * 1000) + # event, expires_at, stream_id + sticky_events: List[Tuple[EventBase, int, int]] = [] + for ev, _ in events_and_contexts: + # MSC: Note: policy servers and other similar antispam techniques still apply to these events. + if ev.internal_metadata.policy_server_spammy: + continue + # We shouldn't be passed rejected events, but if we do, we filter them out too. + if ev.rejected_reason is not None: + continue + # MSC: The presence of sticky.duration_ms with a valid value makes the event “sticky” + sticky_obj = ev.get_dict().get(StickyEvent.FIELD_NAME, None) + if type(sticky_obj) is not dict: + continue + sticky_duration_ms = sticky_obj.get("duration_ms", None) + # MSC: Valid values are the integer range 0-3600000 (1 hour). + if ( + type(sticky_duration_ms) is int + and sticky_duration_ms >= 0 + and sticky_duration_ms <= 3600000 + ): + # MSC: The start time is min(now, origin_server_ts). + # This ensures that malicious origin timestamps cannot specify start times in the future. + # Calculate the end time as start_time + min(sticky.duration_ms, 3600000). 
+ expires_at = min(ev.origin_server_ts, now_ms) + min( + ev.get_dict()[StickyEvent.FIELD_NAME]["duration_ms"], 3600000 + ) + # filter out already expired sticky events + if expires_at > now_ms: + sticky_events.append( + (ev, expires_at, self._sticky_events_id_gen.get_next_txn(txn)) + ) + if len(sticky_events) == 0: + return + logger.info( + "inserting %d sticky events in room %s", + len(sticky_events), + sticky_events[0][0].room_id, + ) + self.db_pool.simple_insert_many_txn( + txn, + "sticky_events", + keys=( + "instance_name", + "stream_id", + "room_id", + "event_id", + "sender", + "expires_at", + "soft_failed", + ), + values=[ + ( + self._instance_name, + stream_id, + ev.room_id, + ev.event_id, + ev.sender, + expires_at, + ev.internal_metadata.is_soft_failed(), + ) + for (ev, expires_at, stream_id) in sticky_events + ], + ) + + def _get_soft_failed_sticky_events_to_recheck( + self, + txn: LoggingTransaction, + room_id: str, + state_delta_for_room: Optional[DeltaState], + ) -> List[str]: + """Fetch soft-failed sticky events which should be rechecked against the current state. + + Soft-failed events are not rejected, so they pass auth at the state before + the event and at the auth_events in the event. Instead, soft-failed events failed auth at + the _current state of the room_. We only need to recheck soft failure if we have a reason to + believe the event may pass that check now. + + Note that we don't bother rechecking accepted events that may now be soft-failed, because + by that point it's too late as we've already sent the event to clients. + + Returns: + A list of event IDs to recheck + """ + + if state_delta_for_room is None: + # No change to current state => no way soft failure status could be different. + return [] + + # any change to critical auth events may change soft failure status. This means any changes + # to join rules, power levels or member events. If the state has changed but it isn't one + # of those events, we don't need to recheck. + critical_auth_types = ( + EventTypes.JoinRules, + EventTypes.PowerLevels, + EventTypes.Member, + ) + critical_auth_types_changed = set() + critical_auth_types_changed.update( + [ + typ + for typ, _ in state_delta_for_room.to_delete + if typ in critical_auth_types + ] + ) + critical_auth_types_changed.update( + [ + typ + for typ, _ in state_delta_for_room.to_insert + if typ in critical_auth_types + ] + ) + if len(critical_auth_types_changed) == 0: + # No change to critical auth events => no way soft failure status could be different. + return [] + + if critical_auth_types_changed == {EventTypes.Member}: + # the final case we want to catch is when unprivileged users join/leave rooms. These users cause + # changes in the critical auth types (the member event) but ultimately have no effect on soft + # failure status for anyone but that user themselves. + # + # Grab the set of senders that have been modified and see if any of them sent a soft-failed + # sticky event. If they did, then we need to re-evaluate. If they didn't, then we don't need to. 
+ new_membership_changes = set( + [ + skey + for typ, skey in state_delta_for_room.to_insert + if typ == EventTypes.Member + ] + + [ + skey + for typ, skey in state_delta_for_room.to_delete + if typ == EventTypes.Member + ] + ) + # pull out senders of sticky events in this room + events_to_recheck: List[Tuple[str]] = self.db_pool.simple_select_many_txn( + txn, + table="sticky_events", + column="sender", + iterable=new_membership_changes, + keyvalues={ + "room_id": room_id, + "soft_failed": True, + }, + retcols=("event_id"), + ) + return [event_id for (event_id,) in events_to_recheck] + + # otherwise one of the following must be true: + # - there was a change in PL or join rules + # - there was a change in the membership of a sender of a soft-failed sticky event. + # In both of these cases we want to re-evaluate soft failure status. + # + # NB: event auth checks are NOT recursive. We don't need to specifically handle the case where + # an admin user's membership changes which causes a PL event to be allowed, as when the PL event + # gets allowed we will re-evaluate anyway. E.g: + # + # PL(send_event=0, sender=Admin) + # ^ ^_____________________ + # | | + # . PL(send_event=50, sender=Mod) sticky event (sender=User) + # + # In this scenario, the sticky event is soft-failed due to the Mod updating the PL event to + # set send_event=50, which User does not have. If we learn of an event which makes Mod's PL + # event invalid (say, Mod was banned by Admin concurrently to Mod setting the PL event), then + # the act of seeing the ban event will cause the old PL event to be in the state delta, meaning + # we will re-evaluate the sticky event due to the PL changing. We don't need to specially handle case.a + events_to_recheck = self.db_pool.simple_select_list_txn( + txn, + table="sticky_events", + keyvalues={ + "room_id": room_id, + "soft_failed": True, + }, + retcols=("event_id"), + ) + return [event_id for (event_id,) in events_to_recheck] + + def _recheck_soft_failed_events( + self, + txn: LoggingTransaction, + room_id: str, + event_ids: List[str], + ) -> None: + """ + Recheck authorised but soft-failed events. The provided event IDs must have already passed + all auth checks (so the event isn't rejected) but soft-failure checks. + + Args: + txn: The SQL transaction + room_id: The room the event IDs are in. + event_ids: The soft-failed events to re-evaluate. + """ + # We know the events are otherwise authorised, so we only need to load the current state + # and check if the events pass auth at the current state. diff --git a/synapse/storage/schema/main/delta/93/01_sticky_events.sql b/synapse/storage/schema/main/delta/93/01_sticky_events.sql index e3288968de7..5f94860130a 100644 --- a/synapse/storage/schema/main/delta/93/01_sticky_events.sql +++ b/synapse/storage/schema/main/delta/93/01_sticky_events.sql @@ -11,10 +11,9 @@ -- See the GNU Affero General Public License for more details: -- . 
-CREATE SEQUENCE IF NOT EXISTS sticky_events_seq; - CREATE TABLE IF NOT EXISTS sticky_events( - id BIGINT PRIMARY KEY DEFAULT nextval('sticky_events_seq'), + stream_id INTEGER NOT NULL PRIMARY KEY, + instance_name TEXT NOT NULL, room_id TEXT NOT NULL, event_id TEXT NOT NULL, sender TEXT NOT NULL, diff --git a/synapse/storage/schema/main/delta/93/01_sticky_events_seq.sql.postgres b/synapse/storage/schema/main/delta/93/01_sticky_events_seq.sql.postgres new file mode 100644 index 00000000000..e4f4ff57984 --- /dev/null +++ b/synapse/storage/schema/main/delta/93/01_sticky_events_seq.sql.postgres @@ -0,0 +1,19 @@ +-- +-- This file is licensed under the Affero General Public License (AGPL) version 3. +-- +-- Copyright (C) 2025 New Vector, Ltd +-- +-- This program is free software: you can redistribute it and/or modify +-- it under the terms of the GNU Affero General Public License as +-- published by the Free Software Foundation, either version 3 of the +-- License, or (at your option) any later version. +-- +-- See the GNU Affero General Public License for more details: +-- . + +CREATE SEQUENCE sticky_events_sequence + -- Synapse streams start at 2, because the default position is 1 + -- so any item inserted at position 1 is ignored. + -- This is also what existing streams do, except they use `setval(..., 1)` + -- which is semantically the same except less obvious. + START WITH 2; From e01a22b2de8b80d3258509b3017281807943bf23 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 19 Sep 2025 14:30:26 +0100 Subject: [PATCH 04/34] Hook up replication receiver, add sticky events to sync tokens --- synapse/_scripts/synapse_port_db.py | 1 + synapse/app/generic_worker.py | 2 +- synapse/notifier.py | 1 + synapse/replication/tcp/client.py | 12 ++- .../storage/databases/main/sticky_events.py | 73 ++++++++++++++++++- synapse/streams/events.py | 3 + synapse/types/__init__.py | 10 ++- 7 files changed, 98 insertions(+), 4 deletions(-) diff --git a/synapse/_scripts/synapse_port_db.py b/synapse/_scripts/synapse_port_db.py index a81db3cfbfb..7b18b2b88cd 100755 --- a/synapse/_scripts/synapse_port_db.py +++ b/synapse/_scripts/synapse_port_db.py @@ -137,6 +137,7 @@ "has_known_state", "is_encrypted", ], + "sticky_events": ["soft_failed"], "thread_subscriptions": ["subscribed", "automatic"], "users": ["shadow_banned", "approved", "locked", "suspended"], "un_partial_stated_event_stream": ["rejection_status_changed"], diff --git a/synapse/app/generic_worker.py b/synapse/app/generic_worker.py index 742b2af0813..f9b86f99d35 100644 --- a/synapse/app/generic_worker.py +++ b/synapse/app/generic_worker.py @@ -23,7 +23,6 @@ import sys from typing import Dict, List -from synapse.storage.databases.main.sticky_events import StickyEventsWorkerStore from twisted.web.resource import Resource import synapse @@ -102,6 +101,7 @@ from synapse.storage.databases.main.sliding_sync import SlidingSyncStore from synapse.storage.databases.main.state import StateGroupWorkerStore from synapse.storage.databases.main.stats import StatsStore +from synapse.storage.databases.main.sticky_events import StickyEventsWorkerStore from synapse.storage.databases.main.stream import StreamWorkerStore from synapse.storage.databases.main.tags import TagsWorkerStore from synapse.storage.databases.main.task_scheduler import TaskSchedulerWorkerStore diff --git a/synapse/notifier.py b/synapse/notifier.py index e684df4866b..136e766d681 100644 --- a/synapse/notifier.py +++ b/synapse/notifier.py @@ -533,6 +533,7 @@ def 
on_new_event( StreamKeyType.TYPING, StreamKeyType.UN_PARTIAL_STATED_ROOMS, StreamKeyType.THREAD_SUBSCRIPTIONS, + StreamKeyType.STICKY_EVENTS, ], new_token: int, users: Optional[Collection[Union[str, UserID]]] = None, diff --git a/synapse/replication/tcp/client.py b/synapse/replication/tcp/client.py index 7a86b2e65ee..21b0fda58bf 100644 --- a/synapse/replication/tcp/client.py +++ b/synapse/replication/tcp/client.py @@ -44,7 +44,10 @@ UnPartialStatedEventStream, UnPartialStatedRoomStream, ) -from synapse.replication.tcp.streams._base import ThreadSubscriptionsStream +from synapse.replication.tcp.streams._base import ( + StickyEventsStream, + ThreadSubscriptionsStream, +) from synapse.replication.tcp.streams.events import ( EventsStream, EventsStreamEventRow, @@ -262,6 +265,13 @@ async def on_rdata( token, users=[row.user_id for row in rows], ) + elif stream_name == StickyEventsStream.NAME: + print(f"STICKY_EVENTS on_rdata {token} => {rows}") + self.notifier.on_new_event( + StreamKeyType.STICKY_EVENTS, + token, + rooms=[row.room_id for row in rows], + ) await self._presence_handler.process_replication_rows( stream_name, instance_name, token, rows diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 3f08a8bde92..d6cdefa9f49 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -15,10 +15,13 @@ from typing import ( TYPE_CHECKING, Any, + Dict, Iterable, List, Optional, + Set, Tuple, + cast, ) from synapse.api.constants import EventTypes, StickyEvent @@ -29,6 +32,7 @@ DatabasePool, LoggingDatabaseConnection, LoggingTransaction, + make_in_list_sql_clause, ) from synapse.storage.databases.main.cache import CacheInvalidationWorkerStore from synapse.storage.databases.main.events import DeltaState @@ -94,6 +98,54 @@ def get_max_sticky_events_stream_id(self) -> int: def get_sticky_events_stream_id_generator(self) -> MultiWriterIdGenerator: return self._sticky_events_id_gen + async def get_sticky_events_in_rooms( + self, + room_ids: List[str], + from_id: int, + ) -> Tuple[int, Dict[str, Set[str]]]: + """ + Fetch all the sticky events in the given rooms, from the given sticky stream ID. + + Args: + room_ids: The room IDs to return sticky events in. + from_id: The sticky stream ID that sticky events should be returned from. + Returns: + A tuple of (to_id, map[room_id, event_ids]) + """ + sticky_events_rows = await self.db_pool.runInteraction( + "get_sticky_events_in_rooms", + self._get_sticky_events_in_rooms_txn, + room_ids, + from_id, + ) + to_id = from_id + room_to_events: Dict[str, Set[str]] = {} + for stream_id, room_id, event_id in sticky_events_rows: + to_id = max(to_id, stream_id) + events = room_to_events.get(room_id, set()) + events.add(event_id) + room_to_events[room_id] = events + return (to_id, room_to_events) + + def _get_sticky_events_in_rooms_txn( + self, + txn: LoggingTransaction, + room_ids: List[str], + from_id: int, + ) -> List[Tuple[int, str, str]]: + if len(room_ids) == 0: + return [] + clause, room_id_values = make_in_list_sql_clause( + txn.database_engine, "room_id", room_ids + ) + txn.execute( + f""" + SELECT stream_id, room_id, event_id FROM sticky_events WHERE stream_id > ? 
AND {clause} + """, + (from_id, room_id_values), + ) + return cast(List[Tuple[int, str, str]], txn.fetchall()) + async def get_updated_sticky_events( self, from_id: int, to_id: int, limit: int ) -> List[Tuple[int, str, str]]: @@ -107,7 +159,24 @@ async def get_updated_sticky_events( Returns: list of (stream_id, room_id, event_id) tuples """ - return [] # TODO + return await self.db_pool.runInteraction( + "get_updated_sticky_events", + self._get_updated_sticky_events_txn, + from_id, + to_id, + limit, + ) + + def _get_updated_sticky_events_txn( + self, txn: LoggingTransaction, from_id: int, to_id: int, limit: int + ) -> List[Tuple[int, str, str]]: + txn.execute( + """ + SELECT stream_id, room_id, event_id FROM sticky_events WHERE stream_id > ? AND stream_id <= ? LIMIT ? + """, + (from_id, to_id, limit), + ) + return cast(List[Tuple[int, str, str]], txn.fetchall()) def handle_sticky_events_txn( self, @@ -137,6 +206,8 @@ def handle_sticky_events_txn( if len(events_and_contexts) == 0: return + assert self._can_write_to_sticky_events + # TODO: finish the impl # fetch soft failed sticky events to recheck now, before we insert new sticky events, else # we could incorrectly re-evaluate new sticky events diff --git a/synapse/streams/events.py b/synapse/streams/events.py index 1e4bebe46d5..52f14517243 100644 --- a/synapse/streams/events.py +++ b/synapse/streams/events.py @@ -84,6 +84,7 @@ def get_current_token(self) -> StreamToken: self._instance_name ) thread_subscriptions_key = self.store.get_max_thread_subscriptions_stream_id() + sticky_events_key = self.store.get_max_sticky_events_stream_id() token = StreamToken( room_key=self.sources.room.get_current_key(), @@ -98,6 +99,7 @@ def get_current_token(self) -> StreamToken: groups_key=0, un_partial_stated_rooms_key=un_partial_stated_rooms_key, thread_subscriptions_key=thread_subscriptions_key, + sticky_events_key=sticky_events_key, ) return token @@ -125,6 +127,7 @@ async def bound_future_token(self, token: StreamToken) -> StreamToken: StreamKeyType.DEVICE_LIST: self.store.get_device_stream_id_generator(), StreamKeyType.UN_PARTIAL_STATED_ROOMS: self.store.get_un_partial_stated_rooms_id_generator(), StreamKeyType.THREAD_SUBSCRIPTIONS: self.store.get_thread_subscriptions_stream_id_generator(), + StreamKeyType.STICKY_EVENTS: self.store.get_sticky_events_stream_id_generator(), } for _, key in StreamKeyType.__members__.items(): diff --git a/synapse/types/__init__.py b/synapse/types/__init__.py index 2d5b07ab8fa..2ea89865114 100644 --- a/synapse/types/__init__.py +++ b/synapse/types/__init__.py @@ -997,6 +997,7 @@ class StreamKeyType(Enum): DEVICE_LIST = "device_list_key" UN_PARTIAL_STATED_ROOMS = "un_partial_stated_rooms_key" THREAD_SUBSCRIPTIONS = "thread_subscriptions_key" + STICKY_EVENTS = "sticky_events_key" @attr.s(slots=True, frozen=True, auto_attribs=True) @@ -1018,6 +1019,7 @@ class StreamToken: 9. `groups_key`: `1` (note that this key is now unused) 10. `un_partial_stated_rooms_key`: `379` 11. `thread_subscriptions_key`: 4242 + 12. 
`sticky_events_key`: 4141 You can see how many of these keys correspond to the various fields in a "/sync" response: @@ -1077,6 +1079,7 @@ class StreamToken: groups_key: int un_partial_stated_rooms_key: int thread_subscriptions_key: int + sticky_events_key: int _SEPARATOR = "_" START: ClassVar["StreamToken"] @@ -1105,6 +1108,7 @@ async def from_string(cls, store: "DataStore", string: str) -> "StreamToken": groups_key, un_partial_stated_rooms_key, thread_subscriptions_key, + sticky_events_key, ) = keys return cls( @@ -1121,6 +1125,7 @@ async def from_string(cls, store: "DataStore", string: str) -> "StreamToken": groups_key=int(groups_key), un_partial_stated_rooms_key=int(un_partial_stated_rooms_key), thread_subscriptions_key=int(thread_subscriptions_key), + sticky_events_key=int(sticky_events_key), ) except CancelledError: raise @@ -1144,6 +1149,7 @@ async def to_string(self, store: "DataStore") -> str: str(self.groups_key), str(self.un_partial_stated_rooms_key), str(self.thread_subscriptions_key), + str(self.sticky_events_key), ] ) @@ -1209,6 +1215,7 @@ def get_field( StreamKeyType.TYPING, StreamKeyType.UN_PARTIAL_STATED_ROOMS, StreamKeyType.THREAD_SUBSCRIPTIONS, + StreamKeyType.STICKY_EVENTS, ], ) -> int: ... @@ -1265,7 +1272,7 @@ def __str__(self) -> str: f"account_data: {self.account_data_key}, push_rules: {self.push_rules_key}, " f"to_device: {self.to_device_key}, device_list: {self.device_list_key}, " f"groups: {self.groups_key}, un_partial_stated_rooms: {self.un_partial_stated_rooms_key}," - f"thread_subscriptions: {self.thread_subscriptions_key})" + f"thread_subscriptions: {self.thread_subscriptions_key}, sticky_events: {self.sticky_events_key})" ) @@ -1281,6 +1288,7 @@ def __str__(self) -> str: groups_key=0, un_partial_stated_rooms_key=0, thread_subscriptions_key=0, + sticky_events_key=0, ) From 3e7a5a6bd650c496807795039d8414316d0b5acd Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 19 Sep 2025 16:28:04 +0100 Subject: [PATCH 05/34] Insert sticky events into /sync responses --- synapse/handlers/sync.py | 73 +++++++++++++++++++ synapse/rest/client/sync.py | 5 ++ .../storage/databases/main/sticky_events.py | 7 +- 3 files changed, 82 insertions(+), 3 deletions(-) diff --git a/synapse/handlers/sync.py b/synapse/handlers/sync.py index 4a68fdcc76e..f9c5e3b3c08 100644 --- a/synapse/handlers/sync.py +++ b/synapse/handlers/sync.py @@ -153,6 +153,7 @@ class JoinedSyncResult: state: StateMap[EventBase] ephemeral: List[JsonDict] account_data: List[JsonDict] + sticky: List[EventBase] unread_notifications: JsonDict unread_thread_notifications: JsonDict summary: Optional[JsonDict] @@ -608,6 +609,69 @@ async def ephemeral_by_room( return now_token, ephemeral_by_room + async def sticky_events_by_room( + self, + sync_result_builder: "SyncResultBuilder", + now_token: StreamToken, + since_token: Optional[StreamToken] = None, + ) -> Tuple[StreamToken, Dict[str, Set[str]]]: + """Get the sticky events for each room the user is in + Args: + sync_result_builder + now_token: Where the server is currently up to. + since_token: Where the server was when the client + last synced. + Returns: + A tuple of the now StreamToken, updated to reflect the which sticky + events are included, and a dict mapping from room_id to a list of + sticky event IDs for that room. 
+ """ + with Measure( + self.clock, name="sticky_events_by_room", server_name=self.server_name + ): + from_id = since_token.sticky_events_key if since_token else 0 + + room_ids = sync_result_builder.joined_room_ids + + to_id, sticky_by_room = await self.store.get_sticky_events_in_rooms( + room_ids, from_id + ) + now_token = now_token.copy_and_replace(StreamKeyType.STICKY_EVENTS, to_id) + + return now_token, sticky_by_room + + async def _generate_sticky_events( + self, + sync_result_builder: "SyncResultBuilder", + sticky_by_room: Dict[str, Set[str]], + ) -> None: + """Generate sticky events to put into the sync response. + + The builder should already be populated with timeline events for joined rooms, so we can + duplicate suppress sticky events that are already going to be returned in the timeline section + of the sync response. + + Args: + sync_result_builder + sticky_by_room: Map of room ID to sticky event IDs. + """ + for joined_room in sync_result_builder.joined: + sticky_event_ids = sticky_by_room.get(joined_room.room_id, set()) + if len(sticky_event_ids) == 0: + continue + # remove sticky events that are in the timeline + timeline = {ev.event_id for ev in joined_room.timeline.events} + sticky_event_ids = sticky_event_ids.difference(timeline) + if len(sticky_event_ids) == 0: + continue + event_map = await self.store.get_events(sticky_event_ids) + joined_room.sticky = await filter_events_for_client( + self._storage_controllers, + sync_result_builder.sync_config.user.to_string(), + list(event_map.values()), + always_include_ids=frozenset(sticky_event_ids), + ) + async def _load_filtered_recents( self, room_id: str, @@ -2240,6 +2304,14 @@ async def handle_room_entries(room_entry: "RoomSyncResultBuilder") -> None: sync_result_builder.invited.extend(invited) sync_result_builder.knocked.extend(knocked) + if self.hs_config.experimental.msc4354_enabled: + now_token, sticky_by_room = await self.sticky_events_by_room( + sync_result_builder, now_token, since_token + ) + if sticky_by_room: + await self._generate_sticky_events(sync_result_builder, sticky_by_room) + sync_result_builder.now_token = now_token + return set(newly_joined_rooms), set(newly_left_rooms) async def _have_rooms_changed( @@ -2795,6 +2867,7 @@ async def _generate_room_entry( unread_thread_notifications={}, summary=summary, unread_count=0, + sticky=[], ) if room_sync or always_include: diff --git a/synapse/rest/client/sync.py b/synapse/rest/client/sync.py index c424ca53254..d1ef1f61931 100644 --- a/synapse/rest/client/sync.py +++ b/synapse/rest/client/sync.py @@ -619,6 +619,11 @@ async def encode_room( ephemeral_events = room.ephemeral result["ephemeral"] = {"events": ephemeral_events} result["unread_notifications"] = room.unread_notifications + if room.sticky: + serialized_sticky = await self._event_serializer.serialize_events( + room.sticky, time_now, config=serialize_options + ) + result["sticky"] = {"events": serialized_sticky} if room.unread_thread_notifications: result["unread_thread_notifications"] = room.unread_thread_notifications if self._msc3773_enabled: diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index d6cdefa9f49..7f8082421c8 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -15,6 +15,7 @@ from typing import ( TYPE_CHECKING, Any, + Collection, Dict, Iterable, List, @@ -100,7 +101,7 @@ def get_sticky_events_stream_id_generator(self) -> MultiWriterIdGenerator: async def 
get_sticky_events_in_rooms( self, - room_ids: List[str], + room_ids: Collection[str], from_id: int, ) -> Tuple[int, Dict[str, Set[str]]]: """ @@ -130,7 +131,7 @@ async def get_sticky_events_in_rooms( def _get_sticky_events_in_rooms_txn( self, txn: LoggingTransaction, - room_ids: List[str], + room_ids: Collection[str], from_id: int, ) -> List[Tuple[int, str, str]]: if len(room_ids) == 0: @@ -140,7 +141,7 @@ def _get_sticky_events_in_rooms_txn( ) txn.execute( f""" - SELECT stream_id, room_id, event_id FROM sticky_events WHERE stream_id > ? AND {clause} + SELECT stream_id, room_id, event_id FROM sticky_events WHERE soft_failed=FALSE AND stream_id > ? AND {clause} """, (from_id, room_id_values), ) From 7af74298b36e7c4153c1edb7d36e736042e513c8 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Mon, 22 Sep 2025 10:16:00 +0100 Subject: [PATCH 06/34] Don't include expired sticky events in /sync responses --- synapse/handlers/sync.py | 6 +++++- synapse/storage/databases/main/sticky_events.py | 13 ++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/synapse/handlers/sync.py b/synapse/handlers/sync.py index f9c5e3b3c08..6f8010fe824 100644 --- a/synapse/handlers/sync.py +++ b/synapse/handlers/sync.py @@ -20,6 +20,7 @@ # import itertools import logging +import time from typing import ( TYPE_CHECKING, AbstractSet, @@ -626,6 +627,7 @@ async def sticky_events_by_room( events are included, and a dict mapping from room_id to a list of sticky event IDs for that room. """ + now = round(time.time() * 1000) with Measure( self.clock, name="sticky_events_by_room", server_name=self.server_name ): @@ -634,7 +636,9 @@ async def sticky_events_by_room( room_ids = sync_result_builder.joined_room_ids to_id, sticky_by_room = await self.store.get_sticky_events_in_rooms( - room_ids, from_id + room_ids, + from_id, + now, ) now_token = now_token.copy_and_replace(StreamKeyType.STICKY_EVENTS, to_id) diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 7f8082421c8..925a640f563 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -103,6 +103,7 @@ async def get_sticky_events_in_rooms( self, room_ids: Collection[str], from_id: int, + now: int, ) -> Tuple[int, Dict[str, Set[str]]]: """ Fetch all the sticky events in the given rooms, from the given sticky stream ID. @@ -110,6 +111,7 @@ async def get_sticky_events_in_rooms( Args: room_ids: The room IDs to return sticky events in. from_id: The sticky stream ID that sticky events should be returned from. + now: The current time in unix millis, used for skipping expired events. Returns: A tuple of (to_id, map[room_id, event_ids]) """ @@ -118,6 +120,7 @@ async def get_sticky_events_in_rooms( self._get_sticky_events_in_rooms_txn, room_ids, from_id, + now, ) to_id = from_id room_to_events: Dict[str, Set[str]] = {} @@ -133,6 +136,7 @@ def _get_sticky_events_in_rooms_txn( txn: LoggingTransaction, room_ids: Collection[str], from_id: int, + now: int, ) -> List[Tuple[int, str, str]]: if len(room_ids) == 0: return [] @@ -141,9 +145,9 @@ def _get_sticky_events_in_rooms_txn( ) txn.execute( f""" - SELECT stream_id, room_id, event_id FROM sticky_events WHERE soft_failed=FALSE AND stream_id > ? AND {clause} + SELECT stream_id, room_id, event_id FROM sticky_events WHERE soft_failed=FALSE AND expires_at > ? AND stream_id > ? 
AND {clause} """, - (from_id, room_id_values), + (now, from_id, room_id_values), ) return cast(List[Tuple[int, str, str]], txn.fetchall()) @@ -225,7 +229,7 @@ def _insert_sticky_events_txn( txn: LoggingTransaction, events_and_contexts: List[EventPersistencePair], ) -> None: - now_ms = round(time.time() * 1000) + now_ms = self._now() # event, expires_at, stream_id sticky_events: List[Tuple[EventBase, int, int]] = [] for ev, _ in events_and_contexts: @@ -421,3 +425,6 @@ def _recheck_soft_failed_events( """ # We know the events are otherwise authorised, so we only need to load the current state # and check if the events pass auth at the current state. + + def _now(self) -> int: + return round(time.time() * 1000) From 0cfdd0d6b53b302eeb9d7a07112ccfa5655001e3 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Mon, 22 Sep 2025 14:27:54 +0100 Subject: [PATCH 07/34] Delete from sticky_events periodically --- .../storage/databases/main/sticky_events.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 925a640f563..f8ca1474314 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -25,9 +25,12 @@ cast, ) +from twisted.internet.defer import Deferred + from synapse.api.constants import EventTypes, StickyEvent from synapse.events import EventBase from synapse.events.snapshot import EventPersistencePair +from synapse.metrics.background_process_metrics import run_as_background_process from synapse.replication.tcp.streams._base import StickyEventsStream from synapse.storage.database import ( DatabasePool, @@ -44,6 +47,12 @@ logger = logging.getLogger(__name__) +# Remove entries from the sticky_events table at this frequency. +# Note: this does NOT mean we don't honour shorter expiration timeouts. +# Consumers call 'get_sticky_events_in_rooms' which has `WHERE expires_at > ?` +# to filter out expired sticky events that have yet to be deleted. +DELETE_EXPIRED_STICKY_EVENTS_MS = 60 * 1000 * 60 # 1 hour + class StickyEventsWorkerStore(CacheInvalidationWorkerStore): def __init__( @@ -58,6 +67,12 @@ def __init__( self._instance_name in hs.config.worker.writers.events ) + # Technically this means we will cleanup N times, once per event persister, maybe put on master? + if self._can_write_to_sticky_events: + self._clock.looping_call( + self._run_background_cleanup, DELETE_EXPIRED_STICKY_EVENTS_MS + ) + self._sticky_events_id_gen: MultiWriterIdGenerator = MultiWriterIdGenerator( db_conn=db_conn, db=database, @@ -426,5 +441,30 @@ def _recheck_soft_failed_events( # We know the events are otherwise authorised, so we only need to load the current state # and check if the events pass auth at the current state. + async def _delete_expired_sticky_events(self) -> None: + logger.info("delete_expired_sticky_events") + await self.db_pool.runInteraction( + "_delete_expired_sticky_events", + self._delete_expired_sticky_events_txn, + self._now(), + ) + + def _delete_expired_sticky_events_txn( + self, txn: LoggingTransaction, now: int + ) -> None: + txn.execute( + """ + DELETE FROM sticky_events WHERE expires_at < ? 
+ """, + (now,), + ) + def _now(self) -> int: return round(time.time() * 1000) + + def _run_background_cleanup(self) -> Deferred: + return run_as_background_process( + "delete_expired_sticky_events", + self.server_name, + self._delete_expired_sticky_events, + ) From 7c8daf4ed9282f61deea707e201b7b216f187fda Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Tue, 23 Sep 2025 11:02:20 +0100 Subject: [PATCH 08/34] Get sticky events working with Simplified Sliding Sync --- synapse/handlers/sliding_sync/extensions.py | 60 +++++++++++++++++++ synapse/handlers/sync.py | 4 +- synapse/rest/client/sync.py | 30 +++++++++- .../storage/databases/main/sticky_events.py | 17 ++++-- .../93/01_sticky_events_seq.sql.postgres | 11 ++-- synapse/types/handlers/sliding_sync.py | 16 +++++ synapse/types/rest/client/__init__.py | 12 ++++ synapse/util/async_helpers.py | 25 ++++++++ 8 files changed, 160 insertions(+), 15 deletions(-) diff --git a/synapse/handlers/sliding_sync/extensions.py b/synapse/handlers/sliding_sync/extensions.py index 25ee954b7fd..1bb7ff0d876 100644 --- a/synapse/handlers/sliding_sync/extensions.py +++ b/synapse/handlers/sliding_sync/extensions.py @@ -54,6 +54,7 @@ concurrently_execute, gather_optional_coroutines, ) +from synapse.visibility import filter_events_for_client _ThreadSubscription: TypeAlias = ( SlidingSyncResult.Extensions.ThreadSubscriptionsExtension.ThreadSubscription @@ -76,7 +77,10 @@ def __init__(self, hs: "HomeServer"): self.event_sources = hs.get_event_sources() self.device_handler = hs.get_device_handler() self.push_rules_handler = hs.get_push_rules_handler() + self.clock = hs.get_clock() + self._storage_controllers = hs.get_storage_controllers() self._enable_thread_subscriptions = hs.config.experimental.msc4306_enabled + self._enable_sticky_events = hs.config.experimental.msc4354_enabled @trace async def get_extensions_response( @@ -177,6 +181,19 @@ async def get_extensions_response( from_token=from_token, ) + sticky_events_coro = None + if ( + sync_config.extensions.sticky_events is not None + and self._enable_sticky_events + ): + sticky_events_coro = self.get_sticky_events_extension_response( + sync_config=sync_config, + sticky_events_request=sync_config.extensions.sticky_events, + actual_room_ids=actual_room_ids, + to_token=to_token, + from_token=from_token, + ) + ( to_device_response, e2ee_response, @@ -184,6 +201,7 @@ async def get_extensions_response( receipts_response, typing_response, thread_subs_response, + sticky_events_response, ) = await gather_optional_coroutines( to_device_coro, e2ee_coro, @@ -191,6 +209,7 @@ async def get_extensions_response( receipts_coro, typing_coro, thread_subs_coro, + sticky_events_coro, ) return SlidingSyncResult.Extensions( @@ -200,6 +219,7 @@ async def get_extensions_response( receipts=receipts_response, typing=typing_response, thread_subscriptions=thread_subs_response, + sticky_events=sticky_events_response, ) def find_relevant_room_ids_for_extension( @@ -970,3 +990,43 @@ async def get_thread_subscriptions_extension_response( unsubscribed=unsubscribed_threads, prev_batch=prev_batch, ) + + async def get_sticky_events_extension_response( + self, + sync_config: SlidingSyncConfig, + sticky_events_request: SlidingSyncConfig.Extensions.StickyEventsExtension, + actual_room_ids: Set[str], + to_token: StreamToken, + from_token: Optional[SlidingSyncStreamToken], + ) -> Optional[SlidingSyncResult.Extensions.StickyEventsExtension]: + if not sticky_events_request.enabled: + return None + now = 
self.clock.time_msec() + from_id = from_token.stream_token.sticky_events_key if from_token else 0 + _, room_to_event_ids = await self.store.get_sticky_events_in_rooms( + actual_room_ids, + from_id, + to_token.sticky_events_key, + now, + ) + all_sticky_event_ids = { + ev_id for evs in room_to_event_ids.values() for ev_id in evs + } + event_map = await self.store.get_events(all_sticky_event_ids) + filtered_events = await filter_events_for_client( + self._storage_controllers, + sync_config.user.to_string(), + list(event_map.values()), + always_include_ids=frozenset(all_sticky_event_ids), + ) + event_map = {ev.event_id: ev for ev in filtered_events} + return SlidingSyncResult.Extensions.StickyEventsExtension( + room_id_to_sticky_events={ + room_id: { + event_map[event_id] + for event_id in sticky_event_ids + if event_id in event_map + } + for room_id, sticky_event_ids in room_to_event_ids.items() + } + ) diff --git a/synapse/handlers/sync.py b/synapse/handlers/sync.py index 6f8010fe824..c26be23e19c 100644 --- a/synapse/handlers/sync.py +++ b/synapse/handlers/sync.py @@ -620,8 +620,7 @@ async def sticky_events_by_room( Args: sync_result_builder now_token: Where the server is currently up to. - since_token: Where the server was when the client - last synced. + since_token: Where the server was when the client last synced. Returns: A tuple of the now StreamToken, updated to reflect the which sticky events are included, and a dict mapping from room_id to a list of @@ -638,6 +637,7 @@ async def sticky_events_by_room( to_id, sticky_by_room = await self.store.get_sticky_events_in_rooms( room_ids, from_id, + now_token.sticky_events_key, now, ) now_token = now_token.copy_and_replace(StreamKeyType.STICKY_EVENTS, to_id) diff --git a/synapse/rest/client/sync.py b/synapse/rest/client/sync.py index d1ef1f61931..7bafcf475e9 100644 --- a/synapse/rest/client/sync.py +++ b/synapse/rest/client/sync.py @@ -623,7 +623,7 @@ async def encode_room( serialized_sticky = await self._event_serializer.serialize_events( room.sticky, time_now, config=serialize_options ) - result["sticky"] = {"events": serialized_sticky} + result["msc4354_sticky"] = {"events": serialized_sticky} if room.unread_thread_notifications: result["unread_thread_notifications"] = room.unread_thread_notifications if self._msc3773_enabled: @@ -653,6 +653,7 @@ class SlidingSyncRestServlet(RestServlet): - receipts (MSC3960) - account data (MSC3959) - thread subscriptions (MSC4308) + - sticky events (MSC4354) Request query parameters: timeout: How long to wait for new events in milliseconds. @@ -1096,8 +1097,35 @@ async def encode_extensions( _serialise_thread_subscriptions(extensions.thread_subscriptions) ) + if extensions.sticky_events: + serialized_extensions[ + "org.matrix.msc4354.sticky_events" + ] = await self._serialise_sticky_events(requester, extensions.sticky_events) + return serialized_extensions + async def _serialise_sticky_events( + self, + requester: Requester, + sticky_events: SlidingSyncResult.Extensions.StickyEventsExtension, + ) -> JsonDict: + time_now = self.clock.time_msec() + # Same as SSS timelines. TODO: support more options like /sync does. 
+ serialize_options = SerializeEventConfig( + event_format=format_event_for_client_v2_without_room_id, + requester=requester, + ) + return { + "rooms": { + room_id: { + "events": await self.event_serializer.serialize_events( + sticky_events, time_now, config=serialize_options + ) + } + for room_id, sticky_events in sticky_events.room_id_to_sticky_events.items() + }, + } + def _serialise_thread_subscriptions( thread_subscriptions: SlidingSyncResult.Extensions.ThreadSubscriptionsExtension, diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index f8ca1474314..130bd388df2 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -118,6 +118,7 @@ async def get_sticky_events_in_rooms( self, room_ids: Collection[str], from_id: int, + to_id: int, now: int, ) -> Tuple[int, Dict[str, Set[str]]]: """ @@ -125,7 +126,8 @@ async def get_sticky_events_in_rooms( Args: room_ids: The room IDs to return sticky events in. - from_id: The sticky stream ID that sticky events should be returned from. + from_id: The sticky stream ID that sticky events should be returned from (exclusive). + to_id: The sticky stream ID that sticky events should end at (inclusive). now: The current time in unix millis, used for skipping expired events. Returns: A tuple of (to_id, map[room_id, event_ids]) @@ -135,22 +137,24 @@ async def get_sticky_events_in_rooms( self._get_sticky_events_in_rooms_txn, room_ids, from_id, + to_id, now, ) - to_id = from_id + new_to_id = from_id room_to_events: Dict[str, Set[str]] = {} for stream_id, room_id, event_id in sticky_events_rows: - to_id = max(to_id, stream_id) + new_to_id = max(new_to_id, stream_id) events = room_to_events.get(room_id, set()) events.add(event_id) room_to_events[room_id] = events - return (to_id, room_to_events) + return (new_to_id, room_to_events) def _get_sticky_events_in_rooms_txn( self, txn: LoggingTransaction, room_ids: Collection[str], from_id: int, + to_id: int, now: int, ) -> List[Tuple[int, str, str]]: if len(room_ids) == 0: @@ -160,9 +164,10 @@ def _get_sticky_events_in_rooms_txn( ) txn.execute( f""" - SELECT stream_id, room_id, event_id FROM sticky_events WHERE soft_failed=FALSE AND expires_at > ? AND stream_id > ? AND {clause} + SELECT stream_id, room_id, event_id FROM sticky_events + WHERE soft_failed=FALSE AND expires_at > ? AND stream_id > ? AND stream_id <= ? AND {clause} """, - (now, from_id, room_id_values), + (now, from_id, to_id, room_id_values), ) return cast(List[Tuple[int, str, str]], txn.fetchall()) diff --git a/synapse/storage/schema/main/delta/93/01_sticky_events_seq.sql.postgres b/synapse/storage/schema/main/delta/93/01_sticky_events_seq.sql.postgres index e4f4ff57984..5a28a309d91 100644 --- a/synapse/storage/schema/main/delta/93/01_sticky_events_seq.sql.postgres +++ b/synapse/storage/schema/main/delta/93/01_sticky_events_seq.sql.postgres @@ -11,9 +11,8 @@ -- See the GNU Affero General Public License for more details: -- . -CREATE SEQUENCE sticky_events_sequence - -- Synapse streams start at 2, because the default position is 1 - -- so any item inserted at position 1 is ignored. - -- This is also what existing streams do, except they use `setval(..., 1)` - -- which is semantically the same except less obvious. - START WITH 2; +CREATE SEQUENCE sticky_events_sequence; +-- Synapse streams start at 2, because the default position is 1 +-- so any item inserted at position 1 is ignored. 
+-- We have to use nextval not START WITH 2, see https://github.com/element-hq/synapse/issues/18712 +SELECT nextval('sticky_events_sequence'); diff --git a/synapse/types/handlers/sliding_sync.py b/synapse/types/handlers/sliding_sync.py index b7bc565464f..4f7d1b895c0 100644 --- a/synapse/types/handlers/sliding_sync.py +++ b/synapse/types/handlers/sliding_sync.py @@ -21,6 +21,7 @@ AbstractSet, Any, Callable, + Collection, Dict, Final, Generic, @@ -396,12 +397,26 @@ def __bool__(self) -> bool: or bool(self.prev_batch) ) + @attr.s(slots=True, frozen=True, auto_attribs=True) + class StickyEventsExtension: + """The Sticky Events extension (MSC4354) + + Attributes: + room_id_to_sticky_events: map (room_id -> [unexpired_sticky_events]) + """ + + room_id_to_sticky_events: Mapping[str, Collection[EventBase]] + + def __bool__(self) -> bool: + return bool(self.room_id_to_sticky_events) + to_device: Optional[ToDeviceExtension] = None e2ee: Optional[E2eeExtension] = None account_data: Optional[AccountDataExtension] = None receipts: Optional[ReceiptsExtension] = None typing: Optional[TypingExtension] = None thread_subscriptions: Optional[ThreadSubscriptionsExtension] = None + sticky_events: Optional[StickyEventsExtension] = None def __bool__(self) -> bool: return bool( @@ -411,6 +426,7 @@ def __bool__(self) -> bool: or self.receipts or self.typing or self.thread_subscriptions + or self.sticky_events ) next_pos: SlidingSyncStreamToken diff --git a/synapse/types/rest/client/__init__.py b/synapse/types/rest/client/__init__.py index 11d7e59b43a..39d78e66a01 100644 --- a/synapse/types/rest/client/__init__.py +++ b/synapse/types/rest/client/__init__.py @@ -376,6 +376,15 @@ class ThreadSubscriptionsExtension(RequestBodyModel): enabled: Optional[StrictBool] = False limit: StrictInt = 100 + class StickyEventsExtension(RequestBodyModel): + """The Sticky Events extension (MSC4354) + + Attributes: + enabled + """ + + enabled: Optional[StrictBool] = False + to_device: Optional[ToDeviceExtension] = None e2ee: Optional[E2eeExtension] = None account_data: Optional[AccountDataExtension] = None @@ -384,6 +393,9 @@ class ThreadSubscriptionsExtension(RequestBodyModel): thread_subscriptions: Optional[ThreadSubscriptionsExtension] = Field( alias="io.element.msc4308.thread_subscriptions" ) + sticky_events: Optional[StickyEventsExtension] = Field( + alias="org.matrix.msc4354.sticky_events" + ) conn_id: Optional[StrictStr] diff --git a/synapse/util/async_helpers.py b/synapse/util/async_helpers.py index c21b7887f9e..7b766c54aa8 100644 --- a/synapse/util/async_helpers.py +++ b/synapse/util/async_helpers.py @@ -348,6 +348,7 @@ async def yieldable_gather_results_delaying_cancellation( T4 = TypeVar("T4") T5 = TypeVar("T5") T6 = TypeVar("T6") +T7 = TypeVar("T7") @overload @@ -479,6 +480,30 @@ async def gather_optional_coroutines( ]: ... +@overload +async def gather_optional_coroutines( + *coroutines: Unpack[ + Tuple[ + Optional[Coroutine[Any, Any, T1]], + Optional[Coroutine[Any, Any, T2]], + Optional[Coroutine[Any, Any, T3]], + Optional[Coroutine[Any, Any, T4]], + Optional[Coroutine[Any, Any, T5]], + Optional[Coroutine[Any, Any, T6]], + Optional[Coroutine[Any, Any, T7]], + ] + ], +) -> Tuple[ + Optional[T1], + Optional[T2], + Optional[T3], + Optional[T4], + Optional[T5], + Optional[T6], + Optional[T7], +]: ...
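For orientation, a minimal sketch (not part of this patch) of how a Simplified Sliding Sync client could opt in to the extension wired up above. The extension name is taken from the `org.matrix.msc4354.sticky_events` alias added to the request model; the surrounding request fields and the response nesting are assumptions inferred from `_serialise_sticky_events` above rather than anything this series specifies.

# Hypothetical sliding sync request body enabling the sticky events extension.
sliding_sync_body = {
    "lists": {},
    "room_subscriptions": {},
    "extensions": {
        "org.matrix.msc4354.sticky_events": {"enabled": True},
    },
}

# Based on the serializer above, a response would then carry the still-active sticky
# events per room under:
#   extensions["org.matrix.msc4354.sticky_events"]["rooms"][<room_id>]["events"]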
+ + async def gather_optional_coroutines( *coroutines: Unpack[Tuple[Optional[Coroutine[Any, Any, T1]], ...]], ) -> Tuple[Optional[T1], ...]: From ac0f8c20e8e065663222f572acb6dbb06231e255 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Tue, 23 Sep 2025 13:54:32 +0100 Subject: [PATCH 09/34] Support MSC4140 Delayed Events with sticky events --- synapse/handlers/delayed_events.py | 12 +++++-- synapse/rest/client/room.py | 32 +++++++++++-------- .../storage/databases/main/delayed_events.py | 7 ++++ .../schema/main/delta/93/01_sticky_events.sql | 3 ++ 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/synapse/handlers/delayed_events.py b/synapse/handlers/delayed_events.py index a6749801a50..a244128ea00 100644 --- a/synapse/handlers/delayed_events.py +++ b/synapse/handlers/delayed_events.py @@ -17,7 +17,7 @@ from twisted.internet.interfaces import IDelayedCall -from synapse.api.constants import EventTypes +from synapse.api.constants import EventTypes, StickyEvent from synapse.api.errors import ShadowBanError from synapse.api.ratelimiting import Ratelimiter from synapse.config.workers import MAIN_PROCESS_INSTANCE_NAME @@ -244,6 +244,7 @@ async def add( origin_server_ts: Optional[int], content: JsonDict, delay: int, + sticky_duration_ms: Optional[int], ) -> str: """ Creates a new delayed event and schedules its delivery. @@ -257,7 +258,7 @@ async def add( If None, the timestamp will be the actual time when the event is sent. content: The content of the event to be sent. delay: How long (in milliseconds) to wait before automatically sending the event. - + sticky_duration_ms: The sticky duration if any, see MSC4354. Returns: The ID of the added delayed event. Raises: @@ -293,6 +294,7 @@ async def add( origin_server_ts=origin_server_ts, content=content, delay=delay, + sticky_duration_ms=sticky_duration_ms, ) if self._repl_client is not None: @@ -400,6 +402,7 @@ async def send(self, requester: Requester, delay_id: str) -> None: origin_server_ts=event.origin_server_ts, content=event.content, device_id=event.device_id, + sticky_duration_ms=event.sticky_duration_ms, ) ) @@ -510,7 +513,10 @@ async def _send_event( if event.state_key is not None: event_dict["state_key"] = event.state_key - + if event.sticky_duration_ms is not None: + event_dict[StickyEvent.FIELD_NAME] = { + "duration_ms": event.sticky_duration_ms, + } ( sent_event, _, diff --git a/synapse/rest/client/room.py b/synapse/rest/client/room.py index 3e0f1b32188..410f323b4aa 100644 --- a/synapse/rest/client/room.py +++ b/synapse/rest/client/room.py @@ -328,6 +328,10 @@ async def on_PUT( if requester.app_service: origin_server_ts = parse_integer(request, "ts") + sticky_duration_ms: Optional[int] = None + if self.msc4354_enabled: + sticky_duration_ms = parse_integer(request, StickyEvent.QUERY_PARAM_NAME) + delay = _parse_request_delay(request, self._max_event_delay_ms) if delay is not None: delay_id = await self.delayed_events_handler.add( @@ -338,6 +342,7 @@ async def on_PUT( origin_server_ts=origin_server_ts, content=content, delay=delay, + sticky_duration_ms=sticky_duration_ms, ) set_tag("delay_id", delay_id) @@ -365,14 +370,10 @@ async def on_PUT( "room_id": room_id, "sender": requester.user.to_string(), } - if self.msc4354_enabled: - sticky_duration_ms = parse_integer( - request, StickyEvent.QUERY_PARAM_NAME - ) - if sticky_duration_ms is not None: - event_dict[StickyEvent.FIELD_NAME] = { - "duration_ms": sticky_duration_ms, - } + if sticky_duration_ms is not None: + 
event_dict[StickyEvent.FIELD_NAME] = { + "duration_ms": sticky_duration_ms, + } if state_key is not None: event_dict["state_key"] = state_key @@ -426,6 +427,10 @@ async def _do( if requester.app_service: origin_server_ts = parse_integer(request, "ts") + sticky_duration_ms: Optional[int] = None + if self.msc4354_enabled: + sticky_duration_ms = parse_integer(request, StickyEvent.QUERY_PARAM_NAME) + delay = _parse_request_delay(request, self._max_event_delay_ms) if delay is not None: delay_id = await self.delayed_events_handler.add( @@ -436,6 +441,7 @@ async def _do( origin_server_ts=origin_server_ts, content=content, delay=delay, + sticky_duration_ms=sticky_duration_ms, ) set_tag("delay_id", delay_id) @@ -452,12 +458,10 @@ async def _do( if origin_server_ts is not None: event_dict["origin_server_ts"] = origin_server_ts - if self.msc4354_enabled: - sticky_duration_ms = parse_integer(request, StickyEvent.QUERY_PARAM_NAME) - if sticky_duration_ms is not None: - event_dict[StickyEvent.FIELD_NAME] = { - "duration_ms": sticky_duration_ms, - } + if sticky_duration_ms is not None: + event_dict[StickyEvent.FIELD_NAME] = { + "duration_ms": sticky_duration_ms, + } try: ( diff --git a/synapse/storage/databases/main/delayed_events.py b/synapse/storage/databases/main/delayed_events.py index c88682d55c4..c585fa3efc0 100644 --- a/synapse/storage/databases/main/delayed_events.py +++ b/synapse/storage/databases/main/delayed_events.py @@ -45,6 +45,7 @@ class EventDetails: origin_server_ts: Optional[Timestamp] content: JsonDict device_id: Optional[DeviceID] + sticky_duration_ms: Optional[int] @attr.s(slots=True, frozen=True, auto_attribs=True) @@ -92,6 +93,7 @@ async def add_delayed_event( origin_server_ts: Optional[int], content: JsonDict, delay: int, + sticky_duration_ms: Optional[int], ) -> Tuple[DelayID, Timestamp]: """ Inserts a new delayed event in the DB. 
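As a rough usage sketch of how the two query parameters combine once this patch lands: the delay is parsed first and the sticky duration travels with the stored delayed event until it is actually sent. Assumptions here: the MSC4140 delay query parameter is `org.matrix.msc4140.delay`, the sticky parameter is the `msc4354_stick_duration_ms` name used elsewhere in this series, and `requests` merely stands in for any HTTP client.

import requests

def schedule_delayed_sticky_message(
    base_url: str, access_token: str, room_id: str, txn_id: str
) -> str:
    # Schedule a message to be sent in 30s; once sent, it stays sticky for 60s.
    resp = requests.put(
        f"{base_url}/_matrix/client/v3/rooms/{room_id}/send/m.room.message/{txn_id}",
        params={
            "org.matrix.msc4140.delay": 30_000,
            "msc4354_stick_duration_ms": 60_000,
        },
        headers={"Authorization": f"Bearer {access_token}"},
        json={"msgtype": "m.text", "body": "sticky once delivered"},
    )
    resp.raise_for_status()
    # Delayed sends return a delay_id rather than an event_id (MSC4140).
    return resp.json()["delay_id"]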
@@ -118,6 +120,7 @@ def add_delayed_event_txn(txn: LoggingTransaction) -> Timestamp: "state_key": state_key, "origin_server_ts": origin_server_ts, "content": json_encoder.encode(content), + "sticky_duration_ms": sticky_duration_ms, }, ) @@ -249,6 +252,7 @@ def process_timeout_delayed_events_txn( "send_ts", "content", "device_id", + "sticky_duration_ms", ) ) sql_update = "UPDATE delayed_events SET is_processed = TRUE" @@ -289,6 +293,7 @@ def process_timeout_delayed_events_txn( Timestamp(row[5] if row[5] is not None else row[6]), db_to_json(row[7]), DeviceID(row[8]) if row[8] is not None else None, + int(row[9]) if row[9] is not None else None, DelayID(row[0]), UserLocalpart(row[1]), ) @@ -339,6 +344,7 @@ def process_target_delayed_event_txn( "origin_server_ts", "content", "device_id", + "sticky_duration_ms", ) ) sql_update = "UPDATE delayed_events SET is_processed = TRUE" @@ -366,6 +372,7 @@ def process_target_delayed_event_txn( Timestamp(row[3]) if row[3] is not None else None, db_to_json(row[4]), DeviceID(row[5]) if row[5] is not None else None, + int(row[6]) if row[6] is not None else None, ) return event, self._get_next_delayed_event_send_ts_txn(txn) diff --git a/synapse/storage/schema/main/delta/93/01_sticky_events.sql b/synapse/storage/schema/main/delta/93/01_sticky_events.sql index 5f94860130a..0c9319a7d2f 100644 --- a/synapse/storage/schema/main/delta/93/01_sticky_events.sql +++ b/synapse/storage/schema/main/delta/93/01_sticky_events.sql @@ -23,3 +23,6 @@ CREATE TABLE IF NOT EXISTS sticky_events( -- for pulling out soft failed events by room CREATE INDEX IF NOT EXISTS sticky_events_room_idx ON sticky_events(room_id, soft_failed); + +-- A optional int for combining sticky events with delayed events. Used at send time. +ALTER TABLE delayed_events ADD COLUMN sticky_duration_ms BIGINT; \ No newline at end of file From 1e812e4df099630bbe6ac0e448a7fec87ad7e9a2 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Tue, 23 Sep 2025 14:48:28 +0100 Subject: [PATCH 10/34] Fix sqlite --- synapse/storage/databases/main/sticky_events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 130bd388df2..1033e704d69 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -167,7 +167,7 @@ def _get_sticky_events_in_rooms_txn( SELECT stream_id, room_id, event_id FROM sticky_events WHERE soft_failed=FALSE AND expires_at > ? AND stream_id > ? AND stream_id <= ? 
AND {clause} """, - (now, from_id, to_id, room_id_values), + (now, from_id, to_id, *room_id_values), ) return cast(List[Tuple[int, str, str]], txn.fetchall()) From 2728b21f3dd8aa8bd28d1fc4f60a15c1382fa8b8 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Wed, 24 Sep 2025 10:11:18 +0100 Subject: [PATCH 11/34] Re-evaluate soft-failure on sticky events --- synapse/storage/databases/main/events.py | 24 ++- .../storage/databases/main/sticky_events.py | 162 ++++++++++++++---- 2 files changed, 147 insertions(+), 39 deletions(-) diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index 2dc687481cb..824a77e4569 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -371,6 +371,21 @@ async def _persist_events_and_state_updates( len(events_and_contexts) ) + # TODO: are we guaranteed to call the below code if we were to die now? + # On startup we will already think we have persisted the events? + + # This was originally in _persist_events_txn but it relies on non-txn functions like + # get_events_as_list and get_partial_filtered_current_state_ids to handle soft-failure + # re-evaluation, so it can't do that without leaking out the txn currently, hence it + # now just lives outside. + if self.msc4354_sticky_events: + # process events which are sticky as well as re-evaluate soft-failed sticky events. + await self.store.handle_sticky_events( + room_id, + events_and_contexts, + state_delta_for_room, + ) + if not use_negative_stream_ordering: # we don't want to set the event_persisted_position to a negative # stream_ordering. @@ -1176,15 +1191,6 @@ def _persist_events_txn( txn, room_id, events_and_contexts ) - if self.msc4354_sticky_events: - # process events which are sticky as well as re-evaluate soft-failed sticky events. 
- self.store.handle_sticky_events_txn( - txn, - room_id, - events_and_contexts, - state_delta_for_room, - ) - def _persist_event_auth_chain_txn( self, txn: LoggingTransaction, diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 1033e704d69..5fe862e2ad2 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -27,7 +27,9 @@ from twisted.internet.defer import Deferred +from synapse import event_auth from synapse.api.constants import EventTypes, StickyEvent +from synapse.api.errors import AuthError from synapse.events import EventBase from synapse.events.snapshot import EventPersistencePair from synapse.metrics.background_process_metrics import run_as_background_process @@ -40,7 +42,10 @@ ) from synapse.storage.databases.main.cache import CacheInvalidationWorkerStore from synapse.storage.databases.main.events import DeltaState +from synapse.storage.databases.main.state import StateGroupWorkerStore from synapse.storage.util.id_generators import MultiWriterIdGenerator +from synapse.types.state import StateFilter +from synapse.util.stringutils import shortstr if TYPE_CHECKING: from synapse.server import HomeServer @@ -54,7 +59,7 @@ DELETE_EXPIRED_STICKY_EVENTS_MS = 60 * 1000 * 60 # 1 hour -class StickyEventsWorkerStore(CacheInvalidationWorkerStore): +class StickyEventsWorkerStore(StateGroupWorkerStore, CacheInvalidationWorkerStore): def __init__( self, database: DatabasePool, @@ -203,14 +208,13 @@ def _get_updated_sticky_events_txn( ) return cast(List[Tuple[int, str, str]], txn.fetchall()) - def handle_sticky_events_txn( + async def handle_sticky_events( self, - txn: LoggingTransaction, room_id: str, events_and_contexts: List[EventPersistencePair], state_delta_for_room: Optional[DeltaState], ) -> None: - """Update the sticky events table, used in MSC4354. Intended to be called within the persist + """Update the sticky events table, used in MSC4354. Intended to be called after the persist events transaction. This function assumes that `_store_event_txn()` (to persist the event) and @@ -222,7 +226,6 @@ def handle_sticky_events_txn( - for each still-sticky soft-failed event in the room, re-evaluate soft-failedness. Args: - txn room_id: The room that all of the events belong to events_and_contexts: The events being persisted. state_delta_for_room: The changes to the current state, used to detect if we need to @@ -233,16 +236,28 @@ def handle_sticky_events_txn( assert self._can_write_to_sticky_events - # TODO: finish the impl # fetch soft failed sticky events to recheck now, before we insert new sticky events, else - # we could incorrectly re-evaluate new sticky events - # event_ids_to_check = self._get_soft_failed_sticky_events_to_recheck(txn, room_id, state_delta_for_room) - # logger.info(f"_get_soft_failed_sticky_events_to_recheck => {event_ids_to_check}") - # recheck them and update any that now pass soft-fail checks. - # self._recheck_soft_failed_events(txn, room_id, event_ids_to_check) + # we could incorrectly re-evaluate new sticky events in events_and_contexts + event_ids_to_check = await self._get_soft_failed_sticky_events_to_recheck( + room_id, state_delta_for_room + ) + if event_ids_to_check: + logger.info( + "_get_soft_failed_sticky_events_to_recheck => %s", event_ids_to_check + ) + # recheck them and update any that now pass soft-fail checks. + await self._recheck_soft_failed_events(room_id, event_ids_to_check) # insert brand new sticky events. 
- self._insert_sticky_events_txn(txn, events_and_contexts) + await self._insert_sticky_events(events_and_contexts) + + async def _insert_sticky_events( + self, + events_and_contexts: List[EventPersistencePair], + ) -> None: + await self.db_pool.runInteraction( + "_insert_sticky_events", self._insert_sticky_events_txn, events_and_contexts + ) def _insert_sticky_events_txn( self, @@ -314,9 +329,8 @@ def _insert_sticky_events_txn( ], ) - def _get_soft_failed_sticky_events_to_recheck( + async def _get_soft_failed_sticky_events_to_recheck( self, - txn: LoggingTransaction, room_id: str, state_delta_for_room: Optional[DeltaState], ) -> List[str]: @@ -324,7 +338,7 @@ def _get_soft_failed_sticky_events_to_recheck( Soft-failed events are not rejected, so they pass auth at the state before the event and at the auth_events in the event. Instead, soft-failed events failed auth at - the _current state of the room_. We only need to recheck soft failure if we have a reason to + the *current* state of the room. We only need to recheck soft failure if we have a reason to believe the event may pass that check now. Note that we don't bother rechecking accepted events that may now be soft-failed, because @@ -384,9 +398,11 @@ def _get_soft_failed_sticky_events_to_recheck( if typ == EventTypes.Member ] ) + # pull out senders of sticky events in this room - events_to_recheck: List[Tuple[str]] = self.db_pool.simple_select_many_txn( - txn, + events_to_recheck: List[ + Tuple[str] + ] = await self.db_pool.simple_select_many_batch( table="sticky_events", column="sender", iterable=new_membership_changes, @@ -394,7 +410,8 @@ def _get_soft_failed_sticky_events_to_recheck( "room_id": room_id, "soft_failed": True, }, - retcols=("event_id"), + retcols=("event_id",), + desc="_get_soft_failed_sticky_events_to_recheck_members", ) return [event_id for (event_id,) in events_to_recheck] @@ -407,32 +424,32 @@ def _get_soft_failed_sticky_events_to_recheck( # an admin user's membership changes which causes a PL event to be allowed, as when the PL event # gets allowed we will re-evaluate anyway. E.g: # - # PL(send_event=0, sender=Admin) + # PL(send_event=0, sender=Admin) #1 # ^ ^_____________________ # | | - # . PL(send_event=50, sender=Mod) sticky event (sender=User) + # . PL(send_event=50, sender=Mod) #2 sticky event (sender=User) #3 # # In this scenario, the sticky event is soft-failed due to the Mod updating the PL event to # set send_event=50, which User does not have. If we learn of an event which makes Mod's PL # event invalid (say, Mod was banned by Admin concurrently to Mod setting the PL event), then # the act of seeing the ban event will cause the old PL event to be in the state delta, meaning - # we will re-evaluate the sticky event due to the PL changing. We don't need to specially handle case.a - events_to_recheck = self.db_pool.simple_select_list_txn( - txn, + # we will re-evaluate the sticky event due to the PL changing. We don't need to specially handle + # this case. + events_to_recheck = await self.db_pool.simple_select_list( table="sticky_events", keyvalues={ "room_id": room_id, "soft_failed": True, }, - retcols=("event_id"), + retcols=("event_id",), + desc="_get_soft_failed_sticky_events_to_recheck", ) return [event_id for (event_id,) in events_to_recheck] - def _recheck_soft_failed_events( + async def _recheck_soft_failed_events( self, - txn: LoggingTransaction, room_id: str, - event_ids: List[str], + soft_failed_event_ids: List[str], ) -> None: """ Recheck authorised but soft-failed events. 
The provided event IDs must have already passed @@ -441,10 +458,95 @@ def _recheck_soft_failed_events( Args: txn: The SQL transaction room_id: The room the event IDs are in. - event_ids: The soft-failed events to re-evaluate. + soft_failed_event_ids: The soft-failed events to re-evaluate. """ - # We know the events are otherwise authorised, so we only need to load the current state - # and check if the events pass auth at the current state. + # Load all the soft-failed events to recheck, and pull out the precise state tuples we need + soft_failed_event_map = await self.get_events( + soft_failed_event_ids, allow_rejected=False + ) + needed_tuples: Set[Tuple[str, str]] = set() + for ev in soft_failed_event_map.values(): + needed_tuples.update(event_auth.auth_types_for_event(ev.room_version, ev)) + + # We know the events are otherwise authorised, so we only need to load the needed tuples from + # the current state to check if the events pass auth. + current_state_map = await self.get_partial_filtered_current_state_ids( + room_id, StateFilter.from_types(needed_tuples) + ) + current_state_ids_list = [e for _, e in current_state_map.items()] + current_auth_events = await self.get_events_as_list(current_state_ids_list) + passing_event_ids: Set[str] = set() + for soft_failed_event in soft_failed_event_map.values(): + try: + # We don't need to check_state_independent_auth_rules as that doesn't depend on room state, + # so if it passed once it'll pass again. + event_auth.check_state_dependent_auth_rules( + soft_failed_event, current_auth_events + ) + passing_event_ids.add(soft_failed_event.event_id) + except AuthError: + pass + + if not passing_event_ids: + return + + logger.info( + "%s soft-failed events now pass current state checks in room %s : %s", + len(passing_event_ids), + room_id, + shortstr(passing_event_ids), + ) + # Update the DB with the new soft-failure status + await self.db_pool.runInteraction( + "_recheck_soft_failed_events", + self._update_soft_failure_status_txn, + passing_event_ids, + ) + + def _update_soft_failure_status_txn( + self, txn: LoggingTransaction, passing_event_ids: Set[str] + ) -> None: + # Update the sticky events table so we notify downstream of the change in soft-failure status + new_stream_ids: List[Tuple[str, int]] = [ + (event_id, self._sticky_events_id_gen.get_next_txn(txn)) + for event_id in passing_event_ids + ] + values_placeholders = ", ".join(["(?, ?)"] * len(new_stream_ids)) + params = [p for pair in new_stream_ids for p in pair] + txn.execute( + f""" + UPDATE sticky_events AS se + SET + soft_failed = FALSE, + stream_id = v.stream_id + FROM (VALUES + {values_placeholders} + ) AS v(event_id, stream_id) + WHERE se.event_id = v.event_id; + """, + params, + ) + # Also update the internal metadata on the event itself, so when we filter_events_for_client + # we don't filter them out. It's a bit sad internal_metadata is TEXT and not JSONB... 
+ clause, args = make_in_list_sql_clause( + txn.database_engine, + "event_id", + passing_event_ids, + ) + txn.execute( + """ + UPDATE event_json + SET internal_metadata = ( + jsonb_set(internal_metadata::jsonb, '{soft_failed}', 'false'::jsonb) + )::text + WHERE %s + """ + % clause, + args, + ) + # finally, invalidate caches + for event_id in passing_event_ids: + self.invalidate_get_event_cache_after_txn(txn, event_id) async def _delete_expired_sticky_events(self) -> None: logger.info("delete_expired_sticky_events") From 666e94b75ac1e24df73219e1d2a94c0645185a26 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Wed, 24 Sep 2025 11:25:35 +0100 Subject: [PATCH 12/34] Don't re-evaluate spam --- synapse/federation/federation_base.py | 2 +- synapse/storage/databases/main/sticky_events.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/synapse/federation/federation_base.py b/synapse/federation/federation_base.py index c71def8c765..99b111cf290 100644 --- a/synapse/federation/federation_base.py +++ b/synapse/federation/federation_base.py @@ -196,7 +196,7 @@ async def _check_sigs_and_hash( redacted_event = prune_event(pdu) redacted_event.internal_metadata.soft_failed = True # Mark this as spam so we don't re-evaluate soft-failure status. - pdu.internal_metadata.policy_server_spammy = True + redacted_event.internal_metadata.policy_server_spammy = True return redacted_event return pdu diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 5fe862e2ad2..80474e229b7 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -477,6 +477,9 @@ async def _recheck_soft_failed_events( current_auth_events = await self.get_events_as_list(current_state_ids_list) passing_event_ids: Set[str] = set() for soft_failed_event in soft_failed_event_map.values(): + if soft_failed_event.internal_metadata.policy_server_spammy: + # don't re-evaluate spam. + continue try: # We don't need to check_state_independent_auth_rules as that doesn't depend on room state, # so if it passed once it'll pass again. From 771692addda05253a423535a704f30afed0fb62f Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Wed, 24 Sep 2025 11:45:06 +0100 Subject: [PATCH 13/34] Rejig when we persist sticky events Persist inside persist_events to guarantee it is done. After that txn, recheck soft failure. --- synapse/storage/databases/main/events.py | 3 ++ .../storage/databases/main/sticky_events.py | 37 ++++++++----------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index 824a77e4569..ab9fbc4a1c2 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -1186,6 +1186,9 @@ def _persist_events_txn( sliding_sync_table_changes, ) + if self.msc4354_sticky_events: + self.store.insert_sticky_events_txn(txn, events_and_contexts) + # We only update the sliding sync tables for non-backfilled events. 
self._update_sliding_sync_tables_with_new_persisted_events_txn( txn, room_id, events_and_contexts diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 80474e229b7..e9a45bcf4ed 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -57,6 +57,7 @@ # Consumers call 'get_sticky_events_in_rooms' which has `WHERE expires_at > ?` # to filter out expired sticky events that have yet to be deleted. DELETE_EXPIRED_STICKY_EVENTS_MS = 60 * 1000 * 60 # 1 hour +MAX_STICKY_DURATION_MS = 3600000 # 1 hour class StickyEventsWorkerStore(StateGroupWorkerStore, CacheInvalidationWorkerStore): @@ -231,16 +232,18 @@ async def handle_sticky_events( state_delta_for_room: The changes to the current state, used to detect if we need to re-evaluate soft-failed sticky events. """ - if len(events_and_contexts) == 0: - return - assert self._can_write_to_sticky_events - # fetch soft failed sticky events to recheck now, before we insert new sticky events, else - # we could incorrectly re-evaluate new sticky events in events_and_contexts + # fetch soft failed sticky events to recheck event_ids_to_check = await self._get_soft_failed_sticky_events_to_recheck( room_id, state_delta_for_room ) + # filter out soft-failed events in events_and_contexts as we just inserted them, so the + # soft failure status won't have changed for them. + persisting_event_ids = {ev.event_id for ev, _ in events_and_contexts} + event_ids_to_check = [ + item for item in event_ids_to_check if item not in persisting_event_ids + ] if event_ids_to_check: logger.info( "_get_soft_failed_sticky_events_to_recheck => %s", event_ids_to_check @@ -248,18 +251,7 @@ async def handle_sticky_events( # recheck them and update any that now pass soft-fail checks. await self._recheck_soft_failed_events(room_id, event_ids_to_check) - # insert brand new sticky events. - await self._insert_sticky_events(events_and_contexts) - - async def _insert_sticky_events( - self, - events_and_contexts: List[EventPersistencePair], - ) -> None: - await self.db_pool.runInteraction( - "_insert_sticky_events", self._insert_sticky_events_txn, events_and_contexts - ) - - def _insert_sticky_events_txn( + def insert_sticky_events_txn( self, txn: LoggingTransaction, events_and_contexts: List[EventPersistencePair], @@ -279,17 +271,18 @@ def _insert_sticky_events_txn( if type(sticky_obj) is not dict: continue sticky_duration_ms = sticky_obj.get("duration_ms", None) - # MSC: Valid values are the integer range 0-3600000 (1 hour). + # MSC: Valid values are the integer range 0-MAX_STICKY_DURATION_MS if ( type(sticky_duration_ms) is int and sticky_duration_ms >= 0 - and sticky_duration_ms <= 3600000 + and sticky_duration_ms <= MAX_STICKY_DURATION_MS ): # MSC: The start time is min(now, origin_server_ts). # This ensures that malicious origin timestamps cannot specify start times in the future. - # Calculate the end time as start_time + min(sticky.duration_ms, 3600000). + # Calculate the end time as start_time + min(sticky.duration_ms, MAX_STICKY_DURATION_MS). 
expires_at = min(ev.origin_server_ts, now_ms) + min( - ev.get_dict()[StickyEvent.FIELD_NAME]["duration_ms"], 3600000 + ev.get_dict()[StickyEvent.FIELD_NAME]["duration_ms"], + MAX_STICKY_DURATION_MS, ) # filter out already expired sticky events if expires_at > now_ms: @@ -449,7 +442,7 @@ async def _get_soft_failed_sticky_events_to_recheck( async def _recheck_soft_failed_events( self, room_id: str, - soft_failed_event_ids: List[str], + soft_failed_event_ids: Collection[str], ) -> None: """ Recheck authorised but soft-failed events. The provided event IDs must have already passed From ad6a2b9e0cc006d68d3abf7d1b5d014681ccdbe3 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Wed, 24 Sep 2025 12:45:22 +0100 Subject: [PATCH 14/34] Update docs to not lie --- synapse/storage/databases/main/events.py | 4 ++-- synapse/storage/databases/main/sticky_events.py | 15 +++------------ 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index ab9fbc4a1c2..439bdafdaec 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -379,8 +379,8 @@ async def _persist_events_and_state_updates( # re-evaluation, so it can't do that without leaking out the txn currently, hence it # now just lives outside. if self.msc4354_sticky_events: - # process events which are sticky as well as re-evaluate soft-failed sticky events. - await self.store.handle_sticky_events( + # re-evaluate soft-failed sticky events. + await self.store.reevaluate_soft_failed_sticky_events( room_id, events_and_contexts, state_delta_for_room, diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index e9a45bcf4ed..2f04623a865 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -209,26 +209,17 @@ def _get_updated_sticky_events_txn( ) return cast(List[Tuple[int, str, str]], txn.fetchall()) - async def handle_sticky_events( + async def reevaluate_soft_failed_sticky_events( self, room_id: str, events_and_contexts: List[EventPersistencePair], state_delta_for_room: Optional[DeltaState], ) -> None: - """Update the sticky events table, used in MSC4354. Intended to be called after the persist - events transaction. - - This function assumes that `_store_event_txn()` (to persist the event) and - `_update_current_state_txn(...)` (so the current state has taken the events into account) - have already been run. - - "Handling" sticky events is broken into two phases: - - for each sticky event in events_and_contexts, mark them as sticky in the sticky events table. - - for each still-sticky soft-failed event in the room, re-evaluate soft-failedness. + """Re-evaluate soft failed events in the room provided. Args: room_id: The room that all of the events belong to - events_and_contexts: The events being persisted. + events_and_contexts: The events just persisted. These are not eligible for re-evaluation. state_delta_for_room: The changes to the current state, used to detect if we need to re-evaluate soft-failed sticky events. 
""" From 33d80be69f8f6161eee93925c29d5e73b2b63206 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 26 Sep 2025 09:29:52 +0100 Subject: [PATCH 15/34] Send sticky events when catching up over federation --- .../sender/per_destination_queue.py | 28 ++++++++++ .../storage/databases/main/sticky_events.py | 37 ++++++++++++ .../storage/databases/main/transactions.py | 2 +- tests/federation/test_federation_catch_up.py | 56 +++++++++++++++++++ tests/rest/client/utils.py | 40 +++++++++++++ 5 files changed, 162 insertions(+), 1 deletion(-) diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index 4c844d403a2..524ad8cb037 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -101,6 +101,7 @@ def __init__( self._instance_name = hs.get_instance_name() self._federation_shard_config = hs.config.worker.federation_shard_config self._state = hs.get_state_handler() + self.msc4354_enabled = hs.config.experimental.msc4354_enabled self._should_send_on_this_instance = True if not self._federation_shard_config.should_handle( @@ -558,6 +559,33 @@ async def _catch_up_transmission_loop(self) -> None: # send. extrem_events = await self._store.get_events_as_list(extrems) + if self.msc4354_enabled: + # we also want to send sticky events that are still active in this room + sticky_event_ids = ( + await self._store.get_sticky_event_ids_sent_by_self( + pdu.room_id, + last_successful_stream_ordering, + ) + ) + # skip any that are actually the forward extremities we want to send anyway + sticky_events = await self._store.get_events_as_list( + [ + event_id + for event_id in sticky_event_ids + if event_id not in extrems + ] + ) + if sticky_events: + # *prepend* these to the extrem list, so they are processed first. + # This ensures they will show up before the forward extrem in stream order + extrem_events = sticky_events + extrem_events + logger.info( + "Sending %d missed sticky events to %s: %r", + len(sticky_events), + self._destination, + pdu.room_id, + ) + new_pdus = [] for p in extrem_events: # We pulled this from the DB, so it'll be non-null diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 2f04623a865..5b4c18ab385 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -209,6 +209,43 @@ def _get_updated_sticky_events_txn( ) return cast(List[Tuple[int, str, str]], txn.fetchall()) + async def get_sticky_event_ids_sent_by_self( + self, room_id: str, from_stream_pos: int + ) -> List[str]: + """Get sticky event IDs which have been sent by users on this homeserver. + + Used when sending sticky events eagerly to newly joined servers, or when catching up over federation. + + Args: + room_id: The room to fetch sticky events in. + from_stream_pos: The stream position to return events from. May be 0 for newly joined servers. + Returns: + A list of event IDs, which may be empty. 
+ """ + return await self.db_pool.runInteraction( + "get_sticky_event_ids_sent_by_self", + self._get_sticky_event_ids_sent_by_self_txn, + room_id, + from_stream_pos, + ) + + def _get_sticky_event_ids_sent_by_self_txn( + self, txn: LoggingTransaction, room_id: str, from_stream_pos: int + ) -> List[str]: + now_ms = self._now() + txn.execute( + """ + SELECT sticky_events.event_id, sticky_events.sender, events.stream_ordering FROM sticky_events + INNER JOIN events ON events.event_id = sticky_events.event_id + WHERE soft_failed=FALSE AND expires_at > ? AND sticky_events.room_id = ? + """, + (now_ms, room_id), + ) + rows = cast(List[Tuple[str, str, int]], txn.fetchall()) + return [ + row[0] for row in rows if row[2] > from_stream_pos and self.hs.is_mine_id(row[1]) + ] + async def reevaluate_soft_failed_sticky_events( self, room_id: str, diff --git a/synapse/storage/databases/main/transactions.py b/synapse/storage/databases/main/transactions.py index bfc324b80d2..a439c41aab3 100644 --- a/synapse/storage/databases/main/transactions.py +++ b/synapse/storage/databases/main/transactions.py @@ -380,7 +380,7 @@ async def get_catch_up_room_event_ids( ) -> List[str]: """ Returns at most 50 event IDs and their corresponding stream_orderings - that correspond to the oldest events that have not yet been sent to + that correspond to the newest events that have not yet been sent to the destination. Args: diff --git a/tests/federation/test_federation_catch_up.py b/tests/federation/test_federation_catch_up.py index f99911b1025..a2bc7e59b52 100644 --- a/tests/federation/test_federation_catch_up.py +++ b/tests/federation/test_federation_catch_up.py @@ -1,3 +1,4 @@ +import time from typing import Callable, Collection, List, Optional, Tuple from unittest import mock from unittest.mock import AsyncMock, Mock @@ -19,6 +20,7 @@ from synapse.util import Clock from synapse.util.retryutils import NotRetryingDestination +from tests import unittest from tests.test_utils import event_injection from tests.unittest import FederatingHomeserverTestCase @@ -452,6 +454,60 @@ def wake_destination_track(destination: str) -> None: # has been successfully sent. self.assertCountEqual(woken, set(server_names[:-1])) + @unittest.override_config({"experimental_features": {"msc4354_enabled": True}}) + def test_sends_sticky_events(self) -> None: + """Test that we send sticky events in addition to the latest event in the room when catching up.""" + # make the clock used when generating origin_server_ts the same as the clock used to check expiry + self.reactor.advance(time.time()) + per_dest_queue, sent_pdus = self.make_fake_destination_queue() + + # Make a room with a local user, and two servers. One will go offline + # and one will send some events. + self.register_user("u1", "you the one") + u1_token = self.login("u1", "you the one") + room_1 = self.helper.create_room_as("u1", tok=u1_token) + + self.get_success( + event_injection.inject_member_event(self.hs, room_1, "@user:host2", "join") + ) + event_1 = self.get_success( + event_injection.inject_member_event(self.hs, room_1, "@user:host3", "join") + ) + + # now we send a sticky event that we expect to be bundled with the fwd extrem event + sticky_event_id = self.helper.send_sticky_event( + room_1, "m.room.sticky", 60000, tok=u1_token + )["event_id"] + # ..and other uninteresting events + self.helper.send(room_1, "you hear me!!", tok=u1_token) + + # Now simulate us receiving an event from the still online remote. 
+ fwd_extrem_event = self.get_success( + event_injection.inject_event( + self.hs, + type=EventTypes.Message, + sender="@user:host3", + room_id=room_1, + content={"msgtype": "m.text", "body": "Hello"}, + ) + ) + + assert event_1.internal_metadata.stream_ordering is not None + self.get_success( + self.hs.get_datastores().main.set_destination_last_successful_stream_ordering( + "host2", event_1.internal_metadata.stream_ordering + ) + ) + + self.get_success(per_dest_queue._catch_up_transmission_loop()) + + # We expect the sticky event and the fwd extrem to be sent + self.assertEqual(len(sent_pdus), 2) + # We expect the sticky event to appear before the fwd extrem + self.assertEqual(sent_pdus[0].event_id, sticky_event_id) + self.assertEqual(sent_pdus[1].event_id, fwd_extrem_event.event_id) + self.assertFalse(per_dest_queue._catching_up) + def test_not_latest_event(self) -> None: """Test that we send the latest event in the room even if its not ours.""" diff --git a/tests/rest/client/utils.py b/tests/rest/client/utils.py index bb214759d98..c34b1c4973c 100644 --- a/tests/rest/client/utils.py +++ b/tests/rest/client/utils.py @@ -456,6 +456,46 @@ def send_event( return channel.json_body + def send_sticky_event( + self, + room_id: str, + type: str, + duration_ms: int, + content: Optional[dict] = None, + txn_id: Optional[str] = None, + tok: Optional[str] = None, + expect_code: int = HTTPStatus.OK, + custom_headers: Optional[Iterable[Tuple[AnyStr, AnyStr]]] = None, + ) -> JsonDict: + if txn_id is None: + txn_id = "m%s" % (str(time.time())) + + path = "/_matrix/client/r0/rooms/%s/send/%s/%s?msc4354_stick_duration_ms=%d" % ( + room_id, + type, + txn_id, + duration_ms, + ) + if tok: + path = path + "&access_token=%s" % tok + + channel = make_request( + self.reactor, + self.site, + "PUT", + path, + content or {}, + custom_headers=custom_headers, + ) + + assert channel.code == expect_code, "Expected: %d, got: %d, resp: %r" % ( + expect_code, + channel.code, + channel.result["body"], + ) + + return channel.json_body + def get_event( self, room_id: str, From 148caefcba49d34ee8a4379eed4ccd2cbe419724 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 26 Sep 2025 09:51:59 +0100 Subject: [PATCH 16/34] Fix trial tests --- synapse/storage/databases/main/sticky_events.py | 4 +++- tests/rest/admin/test_room.py | 4 ++-- tests/rest/client/test_rooms.py | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 5b4c18ab385..db7b8c641ea 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -243,7 +243,9 @@ def _get_sticky_event_ids_sent_by_self_txn( ) rows = cast(List[Tuple[str, str, int]], txn.fetchall()) return [ - row[0] for row in rows if row[2] > from_stream_pos and self.hs.is_mine_id(row[1]) + row[0] + for row in rows + if row[2] > from_stream_pos and self.hs.is_mine_id(row[1]) ] async def reevaluate_soft_failed_sticky_events( diff --git a/tests/rest/admin/test_room.py b/tests/rest/admin/test_room.py index ee5d0419ab9..f532055dd1e 100644 --- a/tests/rest/admin/test_room.py +++ b/tests/rest/admin/test_room.py @@ -2244,7 +2244,7 @@ def test_timestamp_to_event(self) -> None: def test_topo_token_is_accepted(self) -> None: """Test Topo Token is accepted.""" - token = "t1-0_0_0_0_0_0_0_0_0_0_0" + token = "t1-0_0_0_0_0_0_0_0_0_0_0_0" channel = self.make_request( "GET", 
"/_synapse/admin/v1/rooms/%s/messages?from=%s" % (self.room_id, token), @@ -2258,7 +2258,7 @@ def test_topo_token_is_accepted(self) -> None: def test_stream_token_is_accepted_for_fwd_pagianation(self) -> None: """Test that stream token is accepted for forward pagination.""" - token = "s0_0_0_0_0_0_0_0_0_0_0" + token = "s0_0_0_0_0_0_0_0_0_0_0_0" channel = self.make_request( "GET", "/_synapse/admin/v1/rooms/%s/messages?from=%s" % (self.room_id, token), diff --git a/tests/rest/client/test_rooms.py b/tests/rest/client/test_rooms.py index d3b5e26132d..bb06c738f8f 100644 --- a/tests/rest/client/test_rooms.py +++ b/tests/rest/client/test_rooms.py @@ -2245,7 +2245,7 @@ def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.room_id = self.helper.create_room_as(self.user_id) def test_topo_token_is_accepted(self) -> None: - token = "t1-0_0_0_0_0_0_0_0_0_0_0" + token = "t1-0_0_0_0_0_0_0_0_0_0_0_0" channel = self.make_request( "GET", "/rooms/%s/messages?access_token=x&from=%s" % (self.room_id, token) ) @@ -2256,7 +2256,7 @@ def test_topo_token_is_accepted(self) -> None: self.assertTrue("end" in channel.json_body) def test_stream_token_is_accepted_for_fwd_pagianation(self) -> None: - token = "s0_0_0_0_0_0_0_0_0_0_0" + token = "s0_0_0_0_0_0_0_0_0_0_0_0" channel = self.make_request( "GET", "/rooms/%s/messages?access_token=x&from=%s" % (self.room_id, token) ) From de3e9b49ecc09e9f31ea80cc3cd4292762838d33 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Mon, 29 Sep 2025 16:26:32 +0100 Subject: [PATCH 17/34] Add msc4354_sticky_duration_ttl_ms support --- synapse/api/constants.py | 3 +++ synapse/events/__init__.py | 21 ++++++++++++++++++- .../storage/databases/main/sticky_events.py | 17 ++++----------- synapse/visibility.py | 9 ++++++++ 4 files changed, 36 insertions(+), 14 deletions(-) diff --git a/synapse/api/constants.py b/synapse/api/constants.py index 5f2b1a4b5ac..105a3fe7c79 100644 --- a/synapse/api/constants.py +++ b/synapse/api/constants.py @@ -279,6 +279,8 @@ class EventUnsignedContentFields: # Requesting user's membership, per MSC4115 MEMBERSHIP: Final = "membership" + STICKY_TTL: Final = "msc4354_sticky_duration_ttl_ms" + class MTextFields: """Fields found inside m.text content blocks.""" @@ -369,3 +371,4 @@ class StickyEventField(TypedDict): class StickyEvent: QUERY_PARAM_NAME: Final = "msc4354_stick_duration_ms" FIELD_NAME: Final = "msc4354_sticky" + MAX_DURATION_MS: Final = 3600000 # 1 hour diff --git a/synapse/events/__init__.py b/synapse/events/__init__.py index db387542806..82f8940dc91 100644 --- a/synapse/events/__init__.py +++ b/synapse/events/__init__.py @@ -41,7 +41,12 @@ import attr from unpaddedbase64 import encode_base64 -from synapse.api.constants import EventContentFields, EventTypes, RelationTypes +from synapse.api.constants import ( + EventContentFields, + EventTypes, + RelationTypes, + StickyEvent, +) from synapse.api.room_versions import EventFormatVersions, RoomVersion, RoomVersions from synapse.synapse_rust.events import EventInternalMetadata from synapse.types import ( @@ -323,6 +328,20 @@ def freeze(self) -> None: # this will be a no-op if the event dict is already frozen. 
self._dict = freeze(self._dict) + def sticky_duration(self) -> Optional[int]: + sticky_obj = self.get_dict().get(StickyEvent.FIELD_NAME, None) + if type(sticky_obj) is not dict: + return None + sticky_duration_ms = sticky_obj.get("duration_ms", None) + # MSC: Valid values are the integer range 0-MAX_DURATION_MS + if ( + type(sticky_duration_ms) is int + and sticky_duration_ms >= 0 + and sticky_duration_ms <= StickyEvent.MAX_DURATION_MS + ): + return sticky_duration_ms + return None + def __str__(self) -> str: return self.__repr__() diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index db7b8c641ea..5672a9b79f1 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -57,7 +57,6 @@ # Consumers call 'get_sticky_events_in_rooms' which has `WHERE expires_at > ?` # to filter out expired sticky events that have yet to be deleted. DELETE_EXPIRED_STICKY_EVENTS_MS = 60 * 1000 * 60 # 1 hour -MAX_STICKY_DURATION_MS = 3600000 # 1 hour class StickyEventsWorkerStore(StateGroupWorkerStore, CacheInvalidationWorkerStore): @@ -297,22 +296,14 @@ def insert_sticky_events_txn( if ev.rejected_reason is not None: continue # MSC: The presence of sticky.duration_ms with a valid value makes the event “sticky” - sticky_obj = ev.get_dict().get(StickyEvent.FIELD_NAME, None) - if type(sticky_obj) is not dict: - continue - sticky_duration_ms = sticky_obj.get("duration_ms", None) - # MSC: Valid values are the integer range 0-MAX_STICKY_DURATION_MS - if ( - type(sticky_duration_ms) is int - and sticky_duration_ms >= 0 - and sticky_duration_ms <= MAX_STICKY_DURATION_MS - ): + sticky_duration = ev.sticky_duration() + if sticky_duration: # MSC: The start time is min(now, origin_server_ts). # This ensures that malicious origin timestamps cannot specify start times in the future. - # Calculate the end time as start_time + min(sticky.duration_ms, MAX_STICKY_DURATION_MS). + # Calculate the end time as start_time + min(sticky.duration_ms, MAX_DURATION_MS). expires_at = min(ev.origin_server_ts, now_ms) + min( ev.get_dict()[StickyEvent.FIELD_NAME]["duration_ms"], - MAX_STICKY_DURATION_MS, + StickyEvent.MAX_DURATION_MS, ) # filter out already expired sticky events if expires_at > now_ms: diff --git a/synapse/visibility.py b/synapse/visibility.py index d460d8f4c20..d0d23c54a40 100644 --- a/synapse/visibility.py +++ b/synapse/visibility.py @@ -209,6 +209,15 @@ def allowed(event: EventBase) -> Optional[EventBase]: # to the cache! 
cloned = clone_event(filtered) cloned.unsigned[EventUnsignedContentFields.MEMBERSHIP] = user_membership + if storage.main.config.experimental.msc4354_enabled: + sticky_duration = cloned.sticky_duration() + if sticky_duration: + now = storage.main.clock.time_msec() + expires_at = min(cloned.origin_server_ts, now) + sticky_duration + if sticky_duration and expires_at > now: + cloned.unsigned[EventUnsignedContentFields.STICKY_TTL] = ( + expires_at - now + ) return cloned From 651e82963234e1c853840bf6c318609366e2e734 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Wed, 1 Oct 2025 11:55:39 +0100 Subject: [PATCH 18/34] Use standard unstable identifiers --- synapse/api/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/api/constants.py b/synapse/api/constants.py index 105a3fe7c79..95ec369bc20 100644 --- a/synapse/api/constants.py +++ b/synapse/api/constants.py @@ -369,6 +369,6 @@ class StickyEventField(TypedDict): class StickyEvent: - QUERY_PARAM_NAME: Final = "msc4354_stick_duration_ms" + QUERY_PARAM_NAME: Final = "org.matrix.msc4354.sticky_duration_ms" FIELD_NAME: Final = "msc4354_sticky" MAX_DURATION_MS: Final = 3600000 # 1 hour From 4acc98d23eb30601a4f333b84a2914823a2720f0 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Wed, 1 Oct 2025 17:01:45 +0100 Subject: [PATCH 19/34] Don't persist sticky outliers --- synapse/storage/databases/main/sticky_events.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 5672a9b79f1..3dd4a699783 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -295,6 +295,9 @@ def insert_sticky_events_txn( # We shouldn't be passed rejected events, but if we do, we filter them out too. if ev.rejected_reason is not None: continue + # We can't persist outlier sticky events as we don't know the room state at that event + if ev.internal_metadata.is_outlier(): + continue # MSC: The presence of sticky.duration_ms with a valid value makes the event “sticky” sticky_duration = ev.sticky_duration() if sticky_duration: From 78c40973f4d5a658b348698e91b5e7bde909e413 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Thu, 2 Oct 2025 09:26:33 +0100 Subject: [PATCH 20/34] SQLite specific soft-failure update code --- .../storage/databases/main/sticky_events.py | 98 +++++++++++++------ 1 file changed, 66 insertions(+), 32 deletions(-) diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 3dd4a699783..1e72b434f30 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -25,6 +25,7 @@ cast, ) +from synapse.storage.engines import PostgresEngine from twisted.internet.defer import Deferred from synapse import event_auth @@ -531,39 +532,72 @@ def _update_soft_failure_status_txn( (event_id, self._sticky_events_id_gen.get_next_txn(txn)) for event_id in passing_event_ids ] - values_placeholders = ", ".join(["(?, ?)"] * len(new_stream_ids)) + # [event_id, stream_pos, event_id, stream_pos, ...] 
params = [p for pair in new_stream_ids for p in pair] - txn.execute( - f""" - UPDATE sticky_events AS se - SET - soft_failed = FALSE, - stream_id = v.stream_id - FROM (VALUES - {values_placeholders} - ) AS v(event_id, stream_id) - WHERE se.event_id = v.event_id; - """, - params, - ) - # Also update the internal metadata on the event itself, so when we filter_events_for_client - # we don't filter them out. It's a bit sad internal_metadata is TEXT and not JSONB... - clause, args = make_in_list_sql_clause( - txn.database_engine, - "event_id", - passing_event_ids, - ) - txn.execute( - """ - UPDATE event_json - SET internal_metadata = ( - jsonb_set(internal_metadata::jsonb, '{soft_failed}', 'false'::jsonb) - )::text - WHERE %s - """ - % clause, - args, - ) + if isinstance(txn.database_engine, PostgresEngine): + values_placeholders = ", ".join(["(?, ?)"] * len(new_stream_ids)) + txn.execute( + f""" + UPDATE sticky_events AS se + SET + soft_failed = FALSE, + stream_id = v.stream_id + FROM (VALUES + {values_placeholders} + ) AS v(event_id, stream_id) + WHERE se.event_id = v.event_id; + """, + params, + ) + # Also update the internal metadata on the event itself, so when we filter_events_for_client + # we don't filter them out. It's a bit sad internal_metadata is TEXT and not JSONB... + clause, args = make_in_list_sql_clause( + txn.database_engine, + "event_id", + passing_event_ids, + ) + txn.execute( + """ + UPDATE event_json + SET internal_metadata = ( + jsonb_set(internal_metadata::jsonb, '{soft_failed}', 'false'::jsonb) + )::text + WHERE %s + """ + % clause, + args, + ) + else: + # Use a CASE expression to update in bulk for sqlite + case_expr = " ".join( + [f"WHEN ? THEN ? " for _ in new_stream_ids] + ) + txn.execute( + f""" + UPDATE sticky_events + SET + soft_failed = FALSE, + stream_id = CASE event_id + {case_expr} + ELSE stream_id + END + WHERE event_id IN ({",".join("?" 
* len(new_stream_ids))}); + """, + params + [eid for eid, _ in new_stream_ids], + ) + clause, args = make_in_list_sql_clause( + txn.database_engine, + "event_id", + passing_event_ids, + ) + txn.execute( + f""" + UPDATE event_json + SET internal_metadata = json_set(internal_metadata, '$.soft_failed', 'false') + WHERE {clause} + """, + args, + ) # finally, invalidate caches for event_id in passing_event_ids: self.invalidate_get_event_cache_after_txn(txn, event_id) From 15453d4e6eed7f8f164372fa516ad0d8fd1f8c29 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Thu, 2 Oct 2025 09:30:21 +0100 Subject: [PATCH 21/34] JSON false not str --- synapse/storage/databases/main/sticky_events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 1e72b434f30..dc48d92bc21 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -593,7 +593,7 @@ def _update_soft_failure_status_txn( txn.execute( f""" UPDATE event_json - SET internal_metadata = json_set(internal_metadata, '$.soft_failed', 'false') + SET internal_metadata = json_set(internal_metadata, '$.soft_failed', json('false')) WHERE {clause} """, args, From aa45bf7c3ae05edea0de4653c15089033f95af1c Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Thu, 2 Oct 2025 10:46:44 +0100 Subject: [PATCH 22/34] Add msc4354 to /versions response --- synapse/replication/tcp/client.py | 1 - synapse/rest/client/versions.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/synapse/replication/tcp/client.py b/synapse/replication/tcp/client.py index 55156bf988b..e95eade4891 100644 --- a/synapse/replication/tcp/client.py +++ b/synapse/replication/tcp/client.py @@ -266,7 +266,6 @@ async def on_rdata( users=[row.user_id for row in rows], ) elif stream_name == StickyEventsStream.NAME: - print(f"STICKY_EVENTS on_rdata {token} => {rows}") self.notifier.on_new_event( StreamKeyType.STICKY_EVENTS, token, diff --git a/synapse/rest/client/versions.py b/synapse/rest/client/versions.py index 20395430d70..11317163701 100644 --- a/synapse/rest/client/versions.py +++ b/synapse/rest/client/versions.py @@ -182,6 +182,8 @@ async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]: "org.matrix.msc4306": self.config.experimental.msc4306_enabled, # MSC4169: Backwards-compatible redaction sending using `/send` "com.beeper.msc4169": self.config.experimental.msc4169_enabled, + # MSC4354: Sticky events + "org.matrix.msc4354": self.config.experimental.msc4354_enabled, }, }, ) From 888ab79b3b274284cb1f893e8c9c874084fae8b6 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Thu, 2 Oct 2025 15:25:47 +0100 Subject: [PATCH 23/34] Add NewServerJoined replication command Currently emits on joins and logs on receive, WIP. 
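For illustration, the command's wire body is just the space-separated pair "<server> <room_id>", with the command NAME prepended by the replication framing. A minimal standalone sketch of that round-trip follows; it uses a hypothetical stand-in class rather than Synapse's real Command base, so it runs on its own and is not part of the patch itself.

# Rough sketch only: mirrors the from_line/to_line pair added in this patch,
# using a hypothetical stand-in class so the example is self-contained.
class NewServerJoinedSketch:
    NAME = "NEW_SERVER_JOINED"

    def __init__(self, server: str, room_id: str):
        self.server = server
        self.room_id = room_id

    @classmethod
    def from_line(cls, line: str) -> "NewServerJoinedSketch":
        # body is "<server> <room_id>"; neither part may contain spaces
        server, room_id = line.split(" ")
        return cls(server, room_id)

    def to_line(self) -> str:
        return "%s %s" % (self.server, self.room_id)

cmd = NewServerJoinedSketch("remote.example.org", "!abc123:example.org")
wire = "%s %s" % (cmd.NAME, cmd.to_line())
# wire == "NEW_SERVER_JOINED remote.example.org !abc123:example.org"
assert NewServerJoinedSketch.from_line(cmd.to_line()).room_id == "!abc123:example.org"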
--- synapse/federation/send_queue.py | 5 ++++ synapse/federation/sender/__init__.py | 10 +++++++ synapse/notifier.py | 5 ++++ synapse/replication/tcp/commands.py | 29 +++++++++++++++++++ synapse/replication/tcp/handler.py | 10 +++++++ synapse/storage/databases/main/events.py | 8 +++++ .../storage/databases/main/sticky_events.py | 6 ++-- 7 files changed, 69 insertions(+), 4 deletions(-)
diff --git a/synapse/federation/send_queue.py b/synapse/federation/send_queue.py index 2fdee9ac549..caa6e956e42 100644 --- a/synapse/federation/send_queue.py +++ b/synapse/federation/send_queue.py @@ -213,6 +213,11 @@ def notify_new_events(self, max_token: RoomStreamToken) -> None: # This should never get called. raise NotImplementedError() + def notify_new_server_joined(self, server: str, room_id: str) -> None: + """As per FederationSender""" + # This should never get called. + raise NotImplementedError() + def build_and_send_edu( self, destination: str,
diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index 8e3619d1bca..f5938b20c4f 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -239,6 +239,13 @@ def notify_new_events(self, max_token: RoomStreamToken) -> None: """ raise NotImplementedError() + @abc.abstractmethod + def notify_new_server_joined(self, server: str, room_id: str) -> None: + """This gets called when a new server has joined a room. We might + want to send out some events to this server. + """ + raise NotImplementedError() + @abc.abstractmethod async def send_read_receipt(self, receipt: ReadReceipt) -> None: """Send a RR to any other servers in the room @@ -488,6 +495,9 @@ def _get_per_destination_queue( self._per_destination_queues[destination] = queue return queue + def notify_new_server_joined(self, server: str, room_id: str) -> None: + print(f"FEDSENDER: new server joined: server={server} room={room_id}") + def notify_new_events(self, max_token: RoomStreamToken) -> None: """This gets called when we have some new events we might want to send out to other servers.
diff --git a/synapse/notifier.py b/synapse/notifier.py index 136e766d681..65e41206784 100644 --- a/synapse/notifier.py +++ b/synapse/notifier.py @@ -933,6 +933,11 @@ def notify_remote_server_up(self, server: str) -> None: # that any in flight requests can be immediately retried. self._federation_client.wake_destination(server) + def notify_new_server_joined(self, server: str, room_id: str) -> None: + # Inform the federation_sender that it may need to send events to the new server. + if self.federation_sender: + self.federation_sender.notify_new_server_joined(server, room_id) + def add_lock_released_callback( self, callback: Callable[[str, str, str], None] ) -> None:
diff --git a/synapse/replication/tcp/commands.py b/synapse/replication/tcp/commands.py index 8eec68c3ddc..12ef6666a64 100644 --- a/synapse/replication/tcp/commands.py +++ b/synapse/replication/tcp/commands.py @@ -462,6 +462,32 @@ class RemoteServerUpCommand(_SimpleCommand): NAME = "REMOTE_SERVER_UP" +class NewServerJoinedCommand(Command): + """Sent when a worker has detected that a new remote server has joined a room.
+ + Format:: + + NEW_SERVER_JOINED + """ + + NAME = "NEW_SERVER_JOINED" + __slots__ = ["server", "room_id"] + + def __init__(self, server: str, room_id: str): + self.server = server + self.room_id = room_id + + @classmethod + def from_line( + cls: Type["NewServerJoinedCommand"], line: str + ) -> "NewServerJoinedCommand": + server, room_id = line.split(" ") + return cls(server, room_id) + + def to_line(self) -> str: + return "%s %s" % (self.server, self.room_id) + + class LockReleasedCommand(Command): """Sent to inform other instances that a given lock has been dropped. @@ -517,6 +543,7 @@ class NewActiveTaskCommand(_SimpleCommand): FederationAckCommand, UserIpCommand, RemoteServerUpCommand, + NewServerJoinedCommand, ClearUserSyncsCommand, LockReleasedCommand, NewActiveTaskCommand, @@ -533,6 +560,7 @@ class NewActiveTaskCommand(_SimpleCommand): ErrorCommand.NAME, PingCommand.NAME, RemoteServerUpCommand.NAME, + NewServerJoinedCommand.NAME, LockReleasedCommand.NAME, ) @@ -547,6 +575,7 @@ class NewActiveTaskCommand(_SimpleCommand): UserIpCommand.NAME, ErrorCommand.NAME, RemoteServerUpCommand.NAME, + NewServerJoinedCommand.NAME, LockReleasedCommand.NAME, ) diff --git a/synapse/replication/tcp/handler.py b/synapse/replication/tcp/handler.py index 4b01eac09d9..1df9a8dbf08 100644 --- a/synapse/replication/tcp/handler.py +++ b/synapse/replication/tcp/handler.py @@ -48,6 +48,7 @@ FederationAckCommand, LockReleasedCommand, NewActiveTaskCommand, + NewServerJoinedCommand, PositionCommand, RdataCommand, RemoteServerUpCommand, @@ -764,6 +765,12 @@ def on_REMOTE_SERVER_UP( """Called when get a new REMOTE_SERVER_UP command.""" self._notifier.notify_remote_server_up(cmd.data) + def on_NEW_SERVER_JOINED( + self, conn: IReplicationConnection, cmd: NewServerJoinedCommand + ) -> None: + """Called when get a new NEW_SERVER_JOINED command.""" + self._notifier.notify_new_server_joined(cmd.server, cmd.room_id) + def on_LOCK_RELEASED( self, conn: IReplicationConnection, cmd: LockReleasedCommand ) -> None: @@ -886,6 +893,9 @@ def send_user_ip( def send_remote_server_up(self, server: str) -> None: self.send_command(RemoteServerUpCommand(server)) + def send_new_server_joined(self, server: str, room_id: str) -> None: + self.send_command(NewServerJoinedCommand(server, room_id)) + def stream_update(self, stream_name: str, token: Optional[int], data: Any) -> None: """Called when a new update is available to stream to Redis subscribers. diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index c3b883e350e..f12e3f3ede9 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -1188,6 +1188,14 @@ def _persist_events_txn( if self.msc4354_sticky_events: self.store.insert_sticky_events_txn(txn, events_and_contexts) + for ev, _ in events_and_contexts: + if ev.type == "m.room.member" and ev.membership == "join": + print(f"GOT JOIN FOR {ev.state_key}") + domain = get_domain_from_id(ev.state_key) + self.hs.get_notifier().notify_new_server_joined(domain, ev.room_id) + self.hs.get_replication_command_handler().send_new_server_joined( + domain, ev.room_id + ) # We only update the sliding sync tables for non-backfilled events. 
self._update_sliding_sync_tables_with_new_persisted_events_txn( diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index dc48d92bc21..082df620bb4 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -25,7 +25,6 @@ cast, ) -from synapse.storage.engines import PostgresEngine from twisted.internet.defer import Deferred from synapse import event_auth @@ -44,6 +43,7 @@ from synapse.storage.databases.main.cache import CacheInvalidationWorkerStore from synapse.storage.databases.main.events import DeltaState from synapse.storage.databases.main.state import StateGroupWorkerStore +from synapse.storage.engines import PostgresEngine from synapse.storage.util.id_generators import MultiWriterIdGenerator from synapse.types.state import StateFilter from synapse.util.stringutils import shortstr @@ -569,9 +569,7 @@ def _update_soft_failure_status_txn( ) else: # Use a CASE expression to update in bulk for sqlite - case_expr = " ".join( - [f"WHEN ? THEN ? " for _ in new_stream_ids] - ) + case_expr = " ".join(["WHEN ? THEN ? " for _ in new_stream_ids]) txn.execute( f""" UPDATE sticky_events From aac3c846a8e13c22e9b3cac768075719d52d3d7f Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Thu, 2 Oct 2025 16:47:45 +0100 Subject: [PATCH 24/34] Use a tri-state for soft failed to communicate when we need to cache invalidate --- synapse/api/constants.py | 7 ++++ synapse/replication/tcp/streams/_base.py | 22 ++++++------ .../storage/databases/main/events_worker.py | 12 ++++++- .../storage/databases/main/sticky_events.py | 36 ++++++++++--------- .../schema/main/delta/93/01_sticky_events.sql | 6 +++- .../93/01_sticky_events_seq.sql.postgres | 2 +- 6 files changed, 56 insertions(+), 29 deletions(-) diff --git a/synapse/api/constants.py b/synapse/api/constants.py index 95ec369bc20..c5d59a82115 100644 --- a/synapse/api/constants.py +++ b/synapse/api/constants.py @@ -372,3 +372,10 @@ class StickyEvent: QUERY_PARAM_NAME: Final = "org.matrix.msc4354.sticky_duration_ms" FIELD_NAME: Final = "msc4354_sticky" MAX_DURATION_MS: Final = 3600000 # 1 hour + + +# for the database +class StickyEventSoftFailed(enum.IntEnum): + FALSE = 0 + TRUE = 1 + FORMER_TRUE = 2 diff --git a/synapse/replication/tcp/streams/_base.py b/synapse/replication/tcp/streams/_base.py index bdc74e4b6a5..ab3fa02a780 100644 --- a/synapse/replication/tcp/streams/_base.py +++ b/synapse/replication/tcp/streams/_base.py @@ -34,7 +34,7 @@ import attr -from synapse.api.constants import AccountDataTypes +from synapse.api.constants import AccountDataTypes, StickyEventSoftFailed from synapse.replication.http.streams import ReplicationGetStreamUpdates if TYPE_CHECKING: @@ -768,15 +768,17 @@ async def _update_function( return rows, rows[-1][0], len(updates) == limit -class StickyEventsStream(_StreamFromIdGen): - """A sticky event was changed.""" +@attr.s(slots=True, auto_attribs=True) +class StickyEventsStreamRow: + """Stream to inform workers about changes to sticky events.""" - @attr.s(slots=True, auto_attribs=True) - class StickyEventsStreamRow: - """Stream to inform workers about changes to sticky events.""" + room_id: str + event_id: str # The sticky event ID + soft_failed_status: StickyEventSoftFailed - room_id: str - event_id: str # The sticky event ID + +class StickyEventsStream(_StreamFromIdGen): + """A sticky event was changed.""" NAME = "sticky_events" ROW_TYPE = StickyEventsStreamRow @@ -799,9 +801,9 
@@ async def _update_function( ( stream_id, # These are the args to `StickyEventsStreamRow` - (room_id, event_id), + (room_id, event_id, soft_failed), ) - for stream_id, room_id, event_id in updates + for stream_id, room_id, event_id, soft_failed in updates ] if not rows: diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py index 31e23122115..17579cc4657 100644 --- a/synapse/storage/databases/main/events_worker.py +++ b/synapse/storage/databases/main/events_worker.py @@ -45,7 +45,7 @@ from twisted.internet import defer -from synapse.api.constants import Direction, EventTypes +from synapse.api.constants import Direction, EventTypes, StickyEventSoftFailed from synapse.api.errors import NotFoundError, SynapseError from synapse.api.room_versions import ( KNOWN_ROOM_VERSIONS, @@ -74,6 +74,10 @@ wrap_as_background_process, ) from synapse.replication.tcp.streams import BackfillStream, UnPartialStatedEventStream +from synapse.replication.tcp.streams._base import ( + StickyEventsStream, + StickyEventsStreamRow, +) from synapse.replication.tcp.streams.events import EventsStream from synapse.replication.tcp.streams.partial_state import UnPartialStatedEventStreamRow from synapse.storage._base import SQLBaseStore, db_to_json, make_in_list_sql_clause @@ -463,6 +467,12 @@ def process_replication_rows( # If the partial-stated event became rejected or unrejected # when it wasn't before, we need to invalidate this cache. self._invalidate_local_get_event_cache(row.event_id) + elif stream_name == StickyEventsStream.NAME: + for row in rows: + assert isinstance(row, StickyEventsStreamRow) + if row.soft_failed_status == StickyEventSoftFailed.FORMER_TRUE: + # was soft-failed, now not, so invalidate caches + self._invalidate_local_get_event_cache(row.event_id) super().process_replication_rows(stream_name, instance_name, token, rows) diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 082df620bb4..077518cfbb5 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -28,7 +28,7 @@ from twisted.internet.defer import Deferred from synapse import event_auth -from synapse.api.constants import EventTypes, StickyEvent +from synapse.api.constants import EventTypes, StickyEvent, StickyEventSoftFailed from synapse.api.errors import AuthError from synapse.events import EventBase from synapse.events.snapshot import EventPersistencePair @@ -171,15 +171,15 @@ def _get_sticky_events_in_rooms_txn( txn.execute( f""" SELECT stream_id, room_id, event_id FROM sticky_events - WHERE soft_failed=FALSE AND expires_at > ? AND stream_id > ? AND stream_id <= ? AND {clause} + WHERE soft_failed != ? AND expires_at > ? AND stream_id > ? AND stream_id <= ? AND {clause} """, - (now, from_id, to_id, *room_id_values), + (StickyEventSoftFailed.TRUE, now, from_id, to_id, *room_id_values), ) return cast(List[Tuple[int, str, str]], txn.fetchall()) async def get_updated_sticky_events( self, from_id: int, to_id: int, limit: int - ) -> List[Tuple[int, str, str]]: + ) -> List[Tuple[int, str, str, StickyEventSoftFailed]]: """Get updates to sticky events between two stream IDs. 
Args: @@ -200,14 +200,14 @@ async def get_updated_sticky_events( def _get_updated_sticky_events_txn( self, txn: LoggingTransaction, from_id: int, to_id: int, limit: int - ) -> List[Tuple[int, str, str]]: + ) -> List[Tuple[int, str, str, StickyEventSoftFailed]]: txn.execute( """ - SELECT stream_id, room_id, event_id FROM sticky_events WHERE stream_id > ? AND stream_id <= ? LIMIT ? + SELECT stream_id, room_id, event_id, soft_failed FROM sticky_events WHERE stream_id > ? AND stream_id <= ? LIMIT ? """, (from_id, to_id, limit), ) - return cast(List[Tuple[int, str, str]], txn.fetchall()) + return cast(List[Tuple[int, str, str, StickyEventSoftFailed]], txn.fetchall()) async def get_sticky_event_ids_sent_by_self( self, room_id: str, from_stream_pos: int @@ -237,9 +237,9 @@ def _get_sticky_event_ids_sent_by_self_txn( """ SELECT sticky_events.event_id, sticky_events.sender, events.stream_ordering FROM sticky_events INNER JOIN events ON events.event_id = sticky_events.event_id - WHERE soft_failed=FALSE AND expires_at > ? AND sticky_events.room_id = ? + WHERE soft_failed=? AND expires_at > ? AND sticky_events.room_id = ? """, - (now_ms, room_id), + (StickyEventSoftFailed.FALSE, now_ms, room_id), ) rows = cast(List[Tuple[str, str, int]], txn.fetchall()) return [ @@ -341,7 +341,9 @@ def insert_sticky_events_txn( ev.event_id, ev.sender, expires_at, - ev.internal_metadata.is_soft_failed(), + StickyEventSoftFailed.TRUE + if ev.internal_metadata.is_soft_failed() + else StickyEventSoftFailed.FALSE, ) for (ev, expires_at, stream_id) in sticky_events ], @@ -426,7 +428,7 @@ async def _get_soft_failed_sticky_events_to_recheck( iterable=new_membership_changes, keyvalues={ "room_id": room_id, - "soft_failed": True, + "soft_failed": StickyEventSoftFailed.TRUE, }, retcols=("event_id",), desc="_get_soft_failed_sticky_events_to_recheck_members", @@ -457,7 +459,7 @@ async def _get_soft_failed_sticky_events_to_recheck( table="sticky_events", keyvalues={ "room_id": room_id, - "soft_failed": True, + "soft_failed": StickyEventSoftFailed.TRUE, }, retcols=("event_id",), desc="_get_soft_failed_sticky_events_to_recheck", @@ -540,14 +542,14 @@ def _update_soft_failure_status_txn( f""" UPDATE sticky_events AS se SET - soft_failed = FALSE, + soft_failed = ?, stream_id = v.stream_id FROM (VALUES {values_placeholders} ) AS v(event_id, stream_id) WHERE se.event_id = v.event_id; """, - params, + [StickyEventSoftFailed.FORMER_TRUE] + params, ) # Also update the internal metadata on the event itself, so when we filter_events_for_client # we don't filter them out. It's a bit sad internal_metadata is TEXT and not JSONB... @@ -574,14 +576,16 @@ def _update_soft_failure_status_txn( f""" UPDATE sticky_events SET - soft_failed = FALSE, + soft_failed = ?, stream_id = CASE event_id {case_expr} ELSE stream_id END WHERE event_id IN ({",".join("?" 
* len(new_stream_ids))}); """, - params + [eid for eid, _ in new_stream_ids], + [StickyEventSoftFailed.FORMER_TRUE] + + params + + [eid for eid, _ in new_stream_ids], ) clause, args = make_in_list_sql_clause( txn.database_engine, diff --git a/synapse/storage/schema/main/delta/93/01_sticky_events.sql b/synapse/storage/schema/main/delta/93/01_sticky_events.sql index 0c9319a7d2f..18cce22fbc7 100644 --- a/synapse/storage/schema/main/delta/93/01_sticky_events.sql +++ b/synapse/storage/schema/main/delta/93/01_sticky_events.sql @@ -18,7 +18,11 @@ CREATE TABLE IF NOT EXISTS sticky_events( event_id TEXT NOT NULL, sender TEXT NOT NULL, expires_at BIGINT NOT NULL, - soft_failed BOOLEAN NOT NULL + -- 0=False, 1=True, 2=False-but-was-True + -- We need '2' to handle cache invalidation downstream. + -- Receiving a sticky event replication row with '2' will cause get_event + -- caches to be invalidated, so the soft-failure status can change. + soft_failed SMALLINT NOT NULL ); -- for pulling out soft failed events by room diff --git a/synapse/storage/schema/main/delta/93/01_sticky_events_seq.sql.postgres b/synapse/storage/schema/main/delta/93/01_sticky_events_seq.sql.postgres index 5a28a309d91..9ba72856bc9 100644 --- a/synapse/storage/schema/main/delta/93/01_sticky_events_seq.sql.postgres +++ b/synapse/storage/schema/main/delta/93/01_sticky_events_seq.sql.postgres @@ -15,4 +15,4 @@ CREATE SEQUENCE sticky_events_sequence; -- Synapse streams start at 2, because the default position is 1 -- so any item inserted at position 1 is ignored. -- We have to use nextval not START WITH 2, see https://github.com/element-hq/synapse/issues/18712 -SELECT nextval('thread_subscriptions_sequence'); +SELECT nextval('sticky_events_sequence'); From 075312cf2dea291a1760b64f71c4ec318507f3fe Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 3 Oct 2025 11:35:12 +0100 Subject: [PATCH 25/34] Send sticky events to newly joined servers. Process those sticky events as well. In so doing, fixes https://github.com/element-hq/synapse/issues/18563 because we now have the room_creator as an update value not insert value. --- synapse/federation/sender/__init__.py | 54 +++- synapse/handlers/federation.py | 239 ++++++++++-------- synapse/storage/databases/main/events.py | 9 +- synapse/storage/databases/main/room.py | 55 ++-- .../storage/databases/main/sticky_events.py | 6 +- 5 files changed, 235 insertions(+), 128 deletions(-) diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index f5938b20c4f..18f5aab102a 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -181,6 +181,7 @@ from synapse.util.clock import Clock from synapse.util.metrics import Measure from synapse.util.retryutils import filter_destinations_by_retry_limiter +from synapse.visibility import filter_events_for_server if TYPE_CHECKING: from synapse.events.presence_router import PresenceRouter @@ -496,7 +497,58 @@ def _get_per_destination_queue( return queue def notify_new_server_joined(self, server: str, room_id: str) -> None: - print(f"FEDSENDER: new server joined: server={server} room={room_id}") + # We currently only use this notification for MSC4354: Sticky Events. 
+ if not self.hs.config.experimental.msc4354_enabled: + return + # fire off a processing loop in the background + run_as_background_process( + "process_new_server_joined_over_federation", + self.server_name, + self._process_new_server_joined_over_federation, + server, + room_id, + ) + + async def _process_new_server_joined_over_federation( + self, new_server: str, room_id: str + ) -> None: + sticky_event_ids = await self.store.get_sticky_event_ids_sent_by_self( + room_id, + 0, + ) + sticky_events = await self.store.get_events_as_list(sticky_event_ids) + + # We must not send events that are outliers / lack a stream ordering, else we won't be able to + # satisfy /get_missing_events requests + sticky_events = [ + ev + for ev in sticky_events + if ev.internal_metadata.stream_ordering is not None + and not ev.internal_metadata.is_outlier() + ] + # order by stream ordering so we present things in the right timeline order on the receiver + sticky_events = sorted( + sticky_events, + key=lambda ev: ev.internal_metadata.stream_ordering + or 0, # not possible to be 0 + ) + + sticky_events = await filter_events_for_server( + self._storage_controllers, + new_server, + self.server_name, + sticky_events, + redact=False, + filter_out_erased_senders=True, + filter_out_remote_partial_state_events=True, + ) + if sticky_events: + logger.info("sending %d sticky events to newly joined server %s in room %s", len(sticky_events), new_server, room_id) + # we don't track that we sent up to this stream position since it won't make any difference + # since notify_new_server_joined is only called initially. + await self._transaction_manager.send_new_transaction( + new_server, sticky_events, [] + ) def notify_new_events(self, max_token: RoomStreamToken) -> None: """This gets called when we have some new events we might want to diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index 41fb3076c36..be4d173e700 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -67,6 +67,7 @@ from synapse.events.snapshot import EventContext, UnpersistedEventContextBase from synapse.events.validator import EventValidator from synapse.federation.federation_client import InvalidResponseError +from synapse.federation.federation_server import _INBOUND_EVENT_HANDLING_LOCK_NAME from synapse.handlers.pagination import PURGE_PAGINATION_LOCK_NAME from synapse.http.servlet import assert_params_in_dict from synapse.logging.context import nested_logging_context @@ -75,6 +76,7 @@ from synapse.metrics.background_process_metrics import run_as_background_process from synapse.module_api import NOT_SPAM from synapse.storage.databases.main.events_worker import EventRedactBehaviour +from synapse.storage.databases.main.lock import Lock from synapse.storage.invite_rule import InviteRule from synapse.types import JsonDict, StrCollection, get_domain_from_id from synapse.types.state import StateFilter @@ -647,125 +649,158 @@ async def do_invite_join( except ValueError: pass + lock: Optional[Lock] = None async with self._is_partial_state_room_linearizer.queue(room_id): - already_partial_state_room = await self.store.is_partial_state_room( - room_id - ) + try: + # MSC4354: Sticky Events causes existing servers in the room to send sticky events + # to the newly joined server as soon as they realise the new server is in the room. + # If they do this before we've persisted the /send_join response we will be unable to + # process those PDUs. 
Therefore, we take a lock out now for this room, and release it + # once we have processed the /send_join response, to buffer up these inbound messages. + # This may be useful to do even without MSC4354, but it's gated behind an + # experimental flag check to reduce the chance of this having unintended side-effects + # e.g accidental deadlocks. Once we're confident of this behaviour, we can probably + # drop the flag check. We take the lock AFTER we have been queued by the linearizer + # else we would just hold the lock for no reason whilst in the queue: we want to hold + # the lock for the smallest amount of time possible. + if self.config.experimental.msc4354_enabled: + lock = await self.store.try_acquire_lock( + _INBOUND_EVENT_HANDLING_LOCK_NAME, room_id + ) + # Insert the room into the rooms table now so we can process potential incoming + # /send transactions enough to be able to insert into the federation staging + # area. We won't process the staging area until we release the lock above. + await self.store.upsert_room_on_join( + room_id=room_id, + room_version=room_version_obj, + state_events=None, + ) - ret = await self.federation_client.send_join( - host_list, - event, - room_version_obj, - # Perform a full join when we are already in the room and it is a - # full state room, since we are not allowed to persist a partial - # state join event in a full state room. In the future, we could - # optimize this by always performing a partial state join and - # computing the state ourselves or retrieving it from the remote - # homeserver if necessary. - # - # There's a race where we leave the room, then perform a full join - # anyway. This should end up being fast anyway, since we would - # already have the full room state and auth chain persisted. - partial_state=not is_host_joined or already_partial_state_room, - ) + already_partial_state_room = await self.store.is_partial_state_room( + room_id + ) - event = ret.event - origin = ret.origin - state = ret.state - auth_chain = ret.auth_chain - auth_chain.sort(key=lambda e: e.depth) + ret = await self.federation_client.send_join( + host_list, + event, + room_version_obj, + # Perform a full join when we are already in the room and it is a + # full state room, since we are not allowed to persist a partial + # state join event in a full state room. In the future, we could + # optimize this by always performing a partial state join and + # computing the state ourselves or retrieving it from the remote + # homeserver if necessary. + # + # There's a race where we leave the room, then perform a full join + # anyway. This should end up being fast anyway, since we would + # already have the full room state and auth chain persisted. + partial_state=not is_host_joined or already_partial_state_room, + ) - logger.debug("do_invite_join auth_chain: %s", auth_chain) - logger.debug("do_invite_join state: %s", state) + event = ret.event + origin = ret.origin + state = ret.state + auth_chain = ret.auth_chain + auth_chain.sort(key=lambda e: e.depth) - logger.debug("do_invite_join event: %s", event) + logger.debug("do_invite_join auth_chain: %s", auth_chain) + logger.debug("do_invite_join state: %s", state) - # if this is the first time we've joined this room, it's time to add - # a row to `rooms` with the correct room version. If there's already a - # row there, we should override it, since it may have been populated - # based on an invite request which lied about the room version. 
- # - # federation_client.send_join has already checked that the room - # version in the received create event is the same as room_version_obj, - # so we can rely on it now. - # - await self.store.upsert_room_on_join( - room_id=room_id, - room_version=room_version_obj, - state_events=state, - ) + logger.debug("do_invite_join event: %s", event) - if ret.partial_state and not already_partial_state_room: - # Mark the room as having partial state. - # The background process is responsible for unmarking this flag, - # even if the join fails. - # TODO(faster_joins): - # We may want to reset the partial state info if it's from an - # old, failed partial state join. - # https://github.com/matrix-org/synapse/issues/13000 - - # FIXME: Ideally, we would store the full stream token here - # not just the minimum stream ID, so that we can compute an - # accurate list of device changes when un-partial-ing the - # room. The only side effect of this is that we may send - # extra unecessary device list outbound pokes through - # federation, which is harmless. - device_lists_stream_id = self.store.get_device_stream_token().stream - - await self.store.store_partial_state_room( + # if this is the first time we've joined this room, it's time to add + # a row to `rooms` with the correct room version. If there's already a + # row there, we should override it, since it may have been populated + # based on an invite request which lied about the room version. + # + # federation_client.send_join has already checked that the room + # version in the received create event is the same as room_version_obj, + # so we can rely on it now. + # + await self.store.upsert_room_on_join( room_id=room_id, - servers=ret.servers_in_room, - device_lists_stream_id=device_lists_stream_id, - joined_via=origin, + room_version=room_version_obj, + state_events=state, ) - try: - max_stream_id = ( - await self._federation_event_handler.process_remote_join( - origin, - room_id, - auth_chain, - state, - event, - room_version_obj, - partial_state=ret.partial_state, - ) - ) - except PartialStateConflictError: - # This should be impossible, since we hold the lock on the room's - # partial statedness. - logger.error( - "Room %s was un-partial stated while processing remote join.", - room_id, - ) - raise - else: - # Record the join event id for future use (when we finish the full - # join). We have to do this after persisting the event to keep - # foreign key constraints intact. if ret.partial_state and not already_partial_state_room: + # Mark the room as having partial state. + # The background process is responsible for unmarking this flag, + # even if the join fails. # TODO(faster_joins): - # We may want to reset the partial state info if it's from - # an old, failed partial state join. + # We may want to reset the partial state info if it's from an + # old, failed partial state join. # https://github.com/matrix-org/synapse/issues/13000 - await self.store.write_partial_state_rooms_join_event_id( - room_id, event.event_id + + # FIXME: Ideally, we would store the full stream token here + # not just the minimum stream ID, so that we can compute an + # accurate list of device changes when un-partial-ing the + # room. The only side effect of this is that we may send + # extra unecessary device list outbound pokes through + # federation, which is harmless. + device_lists_stream_id = ( + self.store.get_device_stream_token().stream ) - finally: - # Always kick off the background process that asynchronously fetches - # state for the room. 
- # If the join failed, the background process is responsible for - # cleaning up — including unmarking the room as a partial state - # room. - if ret.partial_state: - # Kick off the process of asynchronously fetching the state for - # this room. - self._start_partial_state_room_sync( - initial_destination=origin, - other_destinations=ret.servers_in_room, + + await self.store.store_partial_state_room( room_id=room_id, + servers=ret.servers_in_room, + device_lists_stream_id=device_lists_stream_id, + joined_via=origin, ) + try: + max_stream_id = ( + await self._federation_event_handler.process_remote_join( + origin, + room_id, + auth_chain, + state, + event, + room_version_obj, + partial_state=ret.partial_state, + ) + ) + except PartialStateConflictError: + # This should be impossible, since we hold the lock on the room's + # partial statedness. + logger.error( + "Room %s was un-partial stated while processing remote join.", + room_id, + ) + raise + else: + # Record the join event id for future use (when we finish the full + # join). We have to do this after persisting the event to keep + # foreign key constraints intact. + if ret.partial_state and not already_partial_state_room: + # TODO(faster_joins): + # We may want to reset the partial state info if it's from + # an old, failed partial state join. + # https://github.com/matrix-org/synapse/issues/13000 + await self.store.write_partial_state_rooms_join_event_id( + room_id, event.event_id + ) + finally: + # Always kick off the background process that asynchronously fetches + # state for the room. + # If the join failed, the background process is responsible for + # cleaning up — including unmarking the room as a partial state + # room. + if ret.partial_state: + # Kick off the process of asynchronously fetching the state for + # this room. + self._start_partial_state_room_sync( + initial_destination=origin, + other_destinations=ret.servers_in_room, + room_id=room_id, + ) + finally: + # allow inbound events which happened during the join to be processed. + # Also ensures we release the lock on unexpected errors e.g db errors from + # upsert_room_on_join or network errors from send_join. + if lock: + await lock.release() # We wait here until this instance has seen the events come down # replication (if we're using replication) as the below uses caches. await self._replication.wait_for_stream_position( diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index f12e3f3ede9..77008f16079 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -1187,7 +1187,9 @@ def _persist_events_txn( ) if self.msc4354_sticky_events: - self.store.insert_sticky_events_txn(txn, events_and_contexts) + self.store.insert_sticky_events_txn( + txn, [ev for ev, _ in events_and_contexts] + ) for ev, _ in events_and_contexts: if ev.type == "m.room.member" and ev.membership == "join": print(f"GOT JOIN FOR {ev.state_key}") @@ -2658,6 +2660,11 @@ def _update_outliers_txn( # event isn't an outlier any more. self._update_backward_extremeties(txn, [event]) + if self.msc4354_sticky_events and event.sticky_duration(): + # The de-outliered event is sticky. Update the sticky events table to ensure + # we delivery this down /sync. 
+ self.store.insert_sticky_events_txn(txn, [event]) + return [ec for ec in events_and_contexts if ec[0] not in to_remove] def _store_event_txn( diff --git a/synapse/storage/databases/main/room.py b/synapse/storage/databases/main/room.py index 9f03c084a59..d67b018ee78 100644 --- a/synapse/storage/databases/main/room.py +++ b/synapse/storage/databases/main/room.py @@ -2460,7 +2460,10 @@ def __init__( self._instance_name = hs.get_instance_name() async def upsert_room_on_join( - self, room_id: str, room_version: RoomVersion, state_events: List[EventBase] + self, + room_id: str, + room_version: RoomVersion, + state_events: Optional[List[EventBase]], ) -> None: """Ensure that the room is stored in the table @@ -2472,36 +2475,46 @@ async def upsert_room_on_join( # mark the room as having an auth chain cover index. has_auth_chain_index = await self.has_auth_chain_index(room_id) - create_event = None - for e in state_events: - if (e.type, e.state_key) == (EventTypes.Create, ""): - create_event = e - break + # We may want to insert a row into the rooms table BEFORE having the state events in the + # room, in order to correctly handle the race condition where the /send_join is processed + # remotely which causes remote servers to send us events before we've processed the /send_join + # response. Therefore, we allow state_events (and thus the creator column) to be optional. + # When we get the /send_join response, we'll patch this up. + room_creator: Optional[str] = None + if state_events: + create_event = None + for e in state_events: + if (e.type, e.state_key) == (EventTypes.Create, ""): + create_event = e + break + + if create_event is None: + # If the state doesn't have a create event then the room is + # invalid, and it would fail auth checks anyway. + raise StoreError(400, "No create event in state") - if create_event is None: - # If the state doesn't have a create event then the room is - # invalid, and it would fail auth checks anyway. - raise StoreError(400, "No create event in state") + # Before MSC2175, the room creator was a separate field. + if not room_version.implicit_room_creator: + room_creator = create_event.content.get(EventContentFields.ROOM_CREATOR) - # Before MSC2175, the room creator was a separate field. - if not room_version.implicit_room_creator: - room_creator = create_event.content.get(EventContentFields.ROOM_CREATOR) + if not isinstance(room_creator, str): + # If the create event does not have a creator then the room is + # invalid, and it would fail auth checks anyway. + raise StoreError(400, "No creator defined on the create event") + else: + room_creator = create_event.sender - if not isinstance(room_creator, str): - # If the create event does not have a creator then the room is - # invalid, and it would fail auth checks anyway. 
- raise StoreError(400, "No creator defined on the create event") - else: - room_creator = create_event.sender + update_with = {"room_version": room_version.identifier} + if room_creator: + update_with["creator"] = room_creator await self.db_pool.simple_upsert( desc="upsert_room_on_join", table="rooms", keyvalues={"room_id": room_id}, - values={"room_version": room_version.identifier}, + values=update_with, insertion_values={ "is_public": False, - "creator": room_creator, "has_auth_chain_index": has_auth_chain_index, }, ) diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 077518cfbb5..2578aad3e1a 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -212,7 +212,7 @@ def _get_updated_sticky_events_txn( async def get_sticky_event_ids_sent_by_self( self, room_id: str, from_stream_pos: int ) -> List[str]: - """Get sticky event IDs which have been sent by users on this homeserver. + """Get unexpired sticky event IDs which have been sent by users on this homeserver. Used when sending sticky events eagerly to newly joined servers, or when catching up over federation. @@ -284,12 +284,12 @@ async def reevaluate_soft_failed_sticky_events( def insert_sticky_events_txn( self, txn: LoggingTransaction, - events_and_contexts: List[EventPersistencePair], + events: List[EventBase], ) -> None: now_ms = self._now() # event, expires_at, stream_id sticky_events: List[Tuple[EventBase, int, int]] = [] - for ev, _ in events_and_contexts: + for ev in events: # MSC: Note: policy servers and other similar antispam techniques still apply to these events. if ev.internal_metadata.policy_server_spammy: continue From 7f1e057cca98b2d4b397b097e1c403cc00920466 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 3 Oct 2025 15:24:34 +0100 Subject: [PATCH 26/34] Always cache invalidate when receiving sticky event updates, so removing the need for a tri-state soft failure flag --- synapse/api/constants.py | 7 ----- synapse/federation/sender/__init__.py | 7 ++++- synapse/replication/tcp/streams/_base.py | 7 ++--- .../storage/databases/main/events_worker.py | 7 ++--- .../storage/databases/main/sticky_events.py | 30 ++++++++----------- .../schema/main/delta/93/01_sticky_events.sql | 6 +--- 6 files changed, 26 insertions(+), 38 deletions(-) diff --git a/synapse/api/constants.py b/synapse/api/constants.py index c5d59a82115..95ec369bc20 100644 --- a/synapse/api/constants.py +++ b/synapse/api/constants.py @@ -372,10 +372,3 @@ class StickyEvent: QUERY_PARAM_NAME: Final = "org.matrix.msc4354.sticky_duration_ms" FIELD_NAME: Final = "msc4354_sticky" MAX_DURATION_MS: Final = 3600000 # 1 hour - - -# for the database -class StickyEventSoftFailed(enum.IntEnum): - FALSE = 0 - TRUE = 1 - FORMER_TRUE = 2 diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index 18f5aab102a..c0bf5c599af 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -543,7 +543,12 @@ async def _process_new_server_joined_over_federation( filter_out_remote_partial_state_events=True, ) if sticky_events: - logger.info("sending %d sticky events to newly joined server %s in room %s", len(sticky_events), new_server, room_id) + logger.info( + "sending %d sticky events to newly joined server %s in room %s", + len(sticky_events), + new_server, + room_id, + ) # we don't track that we sent up to this stream position since it won't 
make any difference # since notify_new_server_joined is only called initially. await self._transaction_manager.send_new_transaction( diff --git a/synapse/replication/tcp/streams/_base.py b/synapse/replication/tcp/streams/_base.py index ab3fa02a780..48bfc6cfa25 100644 --- a/synapse/replication/tcp/streams/_base.py +++ b/synapse/replication/tcp/streams/_base.py @@ -34,7 +34,7 @@ import attr -from synapse.api.constants import AccountDataTypes, StickyEventSoftFailed +from synapse.api.constants import AccountDataTypes from synapse.replication.http.streams import ReplicationGetStreamUpdates if TYPE_CHECKING: @@ -774,7 +774,6 @@ class StickyEventsStreamRow: room_id: str event_id: str # The sticky event ID - soft_failed_status: StickyEventSoftFailed class StickyEventsStream(_StreamFromIdGen): @@ -801,9 +800,9 @@ async def _update_function( ( stream_id, # These are the args to `StickyEventsStreamRow` - (room_id, event_id, soft_failed), + (room_id, event_id), ) - for stream_id, room_id, event_id, soft_failed in updates + for stream_id, room_id, event_id, _ in updates ] if not rows: diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py index 17579cc4657..42f0c8324af 100644 --- a/synapse/storage/databases/main/events_worker.py +++ b/synapse/storage/databases/main/events_worker.py @@ -45,7 +45,7 @@ from twisted.internet import defer -from synapse.api.constants import Direction, EventTypes, StickyEventSoftFailed +from synapse.api.constants import Direction, EventTypes from synapse.api.errors import NotFoundError, SynapseError from synapse.api.room_versions import ( KNOWN_ROOM_VERSIONS, @@ -470,9 +470,8 @@ def process_replication_rows( elif stream_name == StickyEventsStream.NAME: for row in rows: assert isinstance(row, StickyEventsStreamRow) - if row.soft_failed_status == StickyEventSoftFailed.FORMER_TRUE: - # was soft-failed, now not, so invalidate caches - self._invalidate_local_get_event_cache(row.event_id) + # In case soft-failure status changed, invalidate the cache. + self._invalidate_local_get_event_cache(row.event_id) super().process_replication_rows(stream_name, instance_name, token, rows) diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 2578aad3e1a..d22e6099f83 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -28,7 +28,7 @@ from twisted.internet.defer import Deferred from synapse import event_auth -from synapse.api.constants import EventTypes, StickyEvent, StickyEventSoftFailed +from synapse.api.constants import EventTypes, StickyEvent from synapse.api.errors import AuthError from synapse.events import EventBase from synapse.events.snapshot import EventPersistencePair @@ -173,13 +173,13 @@ def _get_sticky_events_in_rooms_txn( SELECT stream_id, room_id, event_id FROM sticky_events WHERE soft_failed != ? AND expires_at > ? AND stream_id > ? AND stream_id <= ? AND {clause} """, - (StickyEventSoftFailed.TRUE, now, from_id, to_id, *room_id_values), + (True, now, from_id, to_id, *room_id_values), ) return cast(List[Tuple[int, str, str]], txn.fetchall()) async def get_updated_sticky_events( self, from_id: int, to_id: int, limit: int - ) -> List[Tuple[int, str, str, StickyEventSoftFailed]]: + ) -> List[Tuple[int, str, str, bool]]: """Get updates to sticky events between two stream IDs. 
Args: @@ -200,14 +200,14 @@ async def get_updated_sticky_events( def _get_updated_sticky_events_txn( self, txn: LoggingTransaction, from_id: int, to_id: int, limit: int - ) -> List[Tuple[int, str, str, StickyEventSoftFailed]]: + ) -> List[Tuple[int, str, str, bool]]: txn.execute( """ SELECT stream_id, room_id, event_id, soft_failed FROM sticky_events WHERE stream_id > ? AND stream_id <= ? LIMIT ? """, (from_id, to_id, limit), ) - return cast(List[Tuple[int, str, str, StickyEventSoftFailed]], txn.fetchall()) + return cast(List[Tuple[int, str, str, bool]], txn.fetchall()) async def get_sticky_event_ids_sent_by_self( self, room_id: str, from_stream_pos: int @@ -239,7 +239,7 @@ def _get_sticky_event_ids_sent_by_self_txn( INNER JOIN events ON events.event_id = sticky_events.event_id WHERE soft_failed=? AND expires_at > ? AND sticky_events.room_id = ? """, - (StickyEventSoftFailed.FALSE, now_ms, room_id), + (False, now_ms, room_id), ) rows = cast(List[Tuple[str, str, int]], txn.fetchall()) return [ @@ -341,9 +341,7 @@ def insert_sticky_events_txn( ev.event_id, ev.sender, expires_at, - StickyEventSoftFailed.TRUE - if ev.internal_metadata.is_soft_failed() - else StickyEventSoftFailed.FALSE, + ev.internal_metadata.is_soft_failed(), ) for (ev, expires_at, stream_id) in sticky_events ], @@ -428,7 +426,7 @@ async def _get_soft_failed_sticky_events_to_recheck( iterable=new_membership_changes, keyvalues={ "room_id": room_id, - "soft_failed": StickyEventSoftFailed.TRUE, + "soft_failed": True, }, retcols=("event_id",), desc="_get_soft_failed_sticky_events_to_recheck_members", @@ -459,7 +457,7 @@ async def _get_soft_failed_sticky_events_to_recheck( table="sticky_events", keyvalues={ "room_id": room_id, - "soft_failed": StickyEventSoftFailed.TRUE, + "soft_failed": True, }, retcols=("event_id",), desc="_get_soft_failed_sticky_events_to_recheck", @@ -542,14 +540,14 @@ def _update_soft_failure_status_txn( f""" UPDATE sticky_events AS se SET - soft_failed = ?, + soft_failed = FALSE, stream_id = v.stream_id FROM (VALUES {values_placeholders} ) AS v(event_id, stream_id) WHERE se.event_id = v.event_id; """, - [StickyEventSoftFailed.FORMER_TRUE] + params, + params, ) # Also update the internal metadata on the event itself, so when we filter_events_for_client # we don't filter them out. It's a bit sad internal_metadata is TEXT and not JSONB... @@ -576,16 +574,14 @@ def _update_soft_failure_status_txn( f""" UPDATE sticky_events SET - soft_failed = ?, + soft_failed = FALSE, stream_id = CASE event_id {case_expr} ELSE stream_id END WHERE event_id IN ({",".join("?" * len(new_stream_ids))}); """, - [StickyEventSoftFailed.FORMER_TRUE] - + params - + [eid for eid, _ in new_stream_ids], + params + [eid for eid, _ in new_stream_ids], ) clause, args = make_in_list_sql_clause( txn.database_engine, diff --git a/synapse/storage/schema/main/delta/93/01_sticky_events.sql b/synapse/storage/schema/main/delta/93/01_sticky_events.sql index 18cce22fbc7..0c9319a7d2f 100644 --- a/synapse/storage/schema/main/delta/93/01_sticky_events.sql +++ b/synapse/storage/schema/main/delta/93/01_sticky_events.sql @@ -18,11 +18,7 @@ CREATE TABLE IF NOT EXISTS sticky_events( event_id TEXT NOT NULL, sender TEXT NOT NULL, expires_at BIGINT NOT NULL, - -- 0=False, 1=True, 2=False-but-was-True - -- We need '2' to handle cache invalidation downstream. - -- Receiving a sticky event replication row with '2' will cause get_event - -- caches to be invalidated, so the soft-failure status can change. 
- soft_failed SMALLINT NOT NULL + soft_failed BOOLEAN NOT NULL ); -- for pulling out soft failed events by room From 58bf128581169e49699589b85c2b7387307c8707 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Mon, 6 Oct 2025 14:18:39 +0100 Subject: [PATCH 27/34] Send NEW_SERVER_JOINED at the right time --- synapse/storage/controllers/persist_events.py | 85 +++++++++++++++++++ synapse/storage/databases/main/events.py | 8 -- 2 files changed, 85 insertions(+), 8 deletions(-) diff --git a/synapse/storage/controllers/persist_events.py b/synapse/storage/controllers/persist_events.py index 120934af578..e384062ec12 100644 --- a/synapse/storage/controllers/persist_events.py +++ b/synapse/storage/controllers/persist_events.py @@ -664,6 +664,29 @@ async def _persist_event_batch( async with self._state_deletion_store.persisting_state_group_references( events_and_contexts ): + new_servers: Optional[Set[str]] = None + if self.hs.config.experimental.msc4354_enabled and state_delta_for_room: + # We specifically only consider events in `chunk` to reduce the risk of state rollbacks + # causing servers to appear to repeatedly rejoin rooms. This works because we only + # persist events once, whereas the state delta may unreliably flap between joined members + # on unrelated events. This means we may miss cases where the /first/ join event for a server + # is as a result of a state rollback and not as a result of a new join event. That is fine + # because the chance of that happening is vanishingly rare because the join event would need to be + # persisted without it affecting the current state (e.g there's a concurrent ban for that user) + # which is then revoked concurrently by a later event (e.g the user is unbanned). + # If state resolution were more reliable (in terms of state resets) then we could feasibly only + # consider the events in the state_delta_for_room, but we aren't there yet. + new_event_ids_in_current_state = set( + state_delta_for_room.to_insert.values() + ) + new_servers = await self._check_new_servers_joined( + room_id, + [ + ev + for (ev, _) in chunk + if ev.event_id in new_event_ids_in_current_state + ], + ) await self.persist_events_store._persist_events_and_state_updates( room_id, chunk, @@ -673,9 +696,71 @@ async def _persist_event_batch( inhibit_local_membership_updates=backfilled, new_event_links=new_event_links, ) + if new_servers: + # Notify other workers after the server has joined so they can take into account + # the latest events that are in `chunk`. + for server_name in new_servers: + self.hs.get_notifier().notify_new_server_joined( + server_name, room_id + ) + self.hs.get_replication_command_handler().send_new_server_joined( + server_name, room_id + ) return replaced_events + async def _check_new_servers_joined( + self, room_id: str, new_events_in_current_state: List[EventBase] + ) -> Optional[Set[str]]: + """Check if new servers have joined the given room. + + Assumes this function is called BEFORE the current_state_events table is updated. + + A new server is "joined" if this is the first join event seen from this domain. + + Args: + room_id: The room in question + new_events_in_current_state: A list of events that will become part of the current state, + but have not yet been persisted. + """ + # filter to only join events from other servers. We're obviously joined if we are getting full events + # so needn't consider ourselves. 
+ join_events = [ + ev + for ev in new_events_in_current_state + if ev.type == EventTypes.Member + and ev.is_state() + and not self.is_mine_id(ev.state_key) + and ev.membership == Membership.JOIN + ] + if not join_events: + return None + + joining_domains = {get_domain_from_id(ev.state_key) for ev in join_events} + + # load all joined members from the current_state_events table as this table is fast and has what we want. + # This is the current state prior to applying the update. + joined_members: List[ + Tuple[str] + ] = await self.main_store.db_pool.simple_select_list( + "current_state_events", + { + "room_id": room_id, + "type": EventTypes.Member, + "membership": Membership.JOIN, + }, + retcols=["state_key"], + desc="_check_new_servers_joined", + ) + joined_domains = { + get_domain_from_id(state_key) for (state_key,) in joined_members + } + + newly_joined_domains = joining_domains.difference(joined_domains) + if not newly_joined_domains: + return None + return newly_joined_domains + async def _calculate_new_forward_extremities_and_state_delta( self, room_id: str, ev_ctx_rm: List[EventPersistencePair] ) -> Tuple[Optional[Set[str]], Optional[DeltaState]]: diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index 77008f16079..cc8b95e2fe4 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -1190,14 +1190,6 @@ def _persist_events_txn( self.store.insert_sticky_events_txn( txn, [ev for ev, _ in events_and_contexts] ) - for ev, _ in events_and_contexts: - if ev.type == "m.room.member" and ev.membership == "join": - print(f"GOT JOIN FOR {ev.state_key}") - domain = get_domain_from_id(ev.state_key) - self.hs.get_notifier().notify_new_server_joined(domain, ev.room_id) - self.hs.get_replication_command_handler().send_new_server_joined( - domain, ev.room_id - ) # We only update the sliding sync tables for non-backfilled events. self._update_sliding_sync_tables_with_new_persisted_events_txn( From 686ce52723bb2bf12c292c14953dff58ec73f970 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Mon, 6 Oct 2025 14:28:21 +0100 Subject: [PATCH 28/34] Changelog --- changelog.d/18968.feature | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/18968.feature diff --git a/changelog.d/18968.feature b/changelog.d/18968.feature new file mode 100644 index 00000000000..30368b23fd9 --- /dev/null +++ b/changelog.d/18968.feature @@ -0,0 +1 @@ +Implement support for MSC4354: Sticky Events. From 4def40414e29c5d706cb86df38ae7448b8fae09e Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Mon, 6 Oct 2025 14:45:14 +0100 Subject: [PATCH 29/34] Appease the regexp on delta lint checks --- synapse/storage/schema/main/delta/93/01_sticky_events.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/storage/schema/main/delta/93/01_sticky_events.sql b/synapse/storage/schema/main/delta/93/01_sticky_events.sql index 0c9319a7d2f..c62b6f61ce5 100644 --- a/synapse/storage/schema/main/delta/93/01_sticky_events.sql +++ b/synapse/storage/schema/main/delta/93/01_sticky_events.sql @@ -11,7 +11,7 @@ -- See the GNU Affero General Public License for more details: -- . 
-CREATE TABLE IF NOT EXISTS sticky_events( +CREATE TABLE sticky_events ( stream_id INTEGER NOT NULL PRIMARY KEY, instance_name TEXT NOT NULL, room_id TEXT NOT NULL, @@ -22,7 +22,7 @@ CREATE TABLE IF NOT EXISTS sticky_events( ); -- for pulling out soft failed events by room -CREATE INDEX IF NOT EXISTS sticky_events_room_idx ON sticky_events(room_id, soft_failed); +CREATE INDEX sticky_events_room_idx ON sticky_events (room_id, soft_failed); -- A optional int for combining sticky events with delayed events. Used at send time. ALTER TABLE delayed_events ADD COLUMN sticky_duration_ms BIGINT; \ No newline at end of file From adb601b2d10099623beee73a684131f14b695ef0 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Wed, 8 Oct 2025 09:11:32 +0100 Subject: [PATCH 30/34] Add chunking for /sync, not SSS --- synapse/api/constants.py | 1 + synapse/handlers/sliding_sync/extensions.py | 4 + synapse/handlers/sync.py | 88 +++++++++---------- .../storage/databases/main/sticky_events.py | 17 ++-- 4 files changed, 60 insertions(+), 50 deletions(-) diff --git a/synapse/api/constants.py b/synapse/api/constants.py index 95ec369bc20..84f316eb23f 100644 --- a/synapse/api/constants.py +++ b/synapse/api/constants.py @@ -372,3 +372,4 @@ class StickyEvent: QUERY_PARAM_NAME: Final = "org.matrix.msc4354.sticky_duration_ms" FIELD_NAME: Final = "msc4354_sticky" MAX_DURATION_MS: Final = 3600000 # 1 hour + MAX_EVENTS_IN_SYNC: Final = 100 diff --git a/synapse/handlers/sliding_sync/extensions.py b/synapse/handlers/sliding_sync/extensions.py index 1bb7ff0d876..813b0ddcc39 100644 --- a/synapse/handlers/sliding_sync/extensions.py +++ b/synapse/handlers/sliding_sync/extensions.py @@ -1008,6 +1008,10 @@ async def get_sticky_events_extension_response( from_id, to_token.sticky_events_key, now, + # We set no limit here because the client can control when they get sticky events. + # Furthermore, it doesn't seem possible to set a limit with the internal API shape + # as given, as we cannot manipulate the to_token.sticky_events_key sent to the client... + limit=0, ) all_sticky_event_ids = { ev_id for evs in room_to_event_ids.values() for ev_id in evs diff --git a/synapse/handlers/sync.py b/synapse/handlers/sync.py index f1997070b83..6b139609cef 100644 --- a/synapse/handlers/sync.py +++ b/synapse/handlers/sync.py @@ -45,6 +45,7 @@ EventTypes, JoinRules, Membership, + StickyEvent, ) from synapse.api.filtering import FilterCollection from synapse.api.presence import UserPresenceState @@ -165,7 +166,11 @@ def __bool__(self) -> bool: to tell if room needs to be part of the sync result. """ return bool( - self.timeline or self.state or self.ephemeral or self.account_data + self.timeline + or self.state + or self.ephemeral + or self.account_data + or self.sticky # nb the notification count does not, er, count: if there's nothing # else in the result, we don't need to send it. ) @@ -639,43 +644,12 @@ async def sticky_events_by_room( from_id, now_token.sticky_events_key, now, + StickyEvent.MAX_EVENTS_IN_SYNC, ) now_token = now_token.copy_and_replace(StreamKeyType.STICKY_EVENTS, to_id) return now_token, sticky_by_room - async def _generate_sticky_events( - self, - sync_result_builder: "SyncResultBuilder", - sticky_by_room: Dict[str, Set[str]], - ) -> None: - """Generate sticky events to put into the sync response. 
- - The builder should already be populated with timeline events for joined rooms, so we can - duplicate suppress sticky events that are already going to be returned in the timeline section - of the sync response. - - Args: - sync_result_builder - sticky_by_room: Map of room ID to sticky event IDs. - """ - for joined_room in sync_result_builder.joined: - sticky_event_ids = sticky_by_room.get(joined_room.room_id, set()) - if len(sticky_event_ids) == 0: - continue - # remove sticky events that are in the timeline - timeline = {ev.event_id for ev in joined_room.timeline.events} - sticky_event_ids = sticky_event_ids.difference(timeline) - if len(sticky_event_ids) == 0: - continue - event_map = await self.store.get_events(sticky_event_ids) - joined_room.sticky = await filter_events_for_client( - self._storage_controllers, - sync_result_builder.sync_config.user.to_string(), - list(event_map.values()), - always_include_ids=frozenset(sticky_event_ids), - ) - async def _load_filtered_recents( self, room_id: str, @@ -2253,6 +2227,13 @@ async def _generate_sync_entry_for_rooms( ) sync_result_builder.now_token = now_token + sticky_by_room: Dict[str, Set[str]] = {} + if self.hs_config.experimental.msc4354_enabled: + now_token, sticky_by_room = await self.sticky_events_by_room( + sync_result_builder, now_token, since_token + ) + sync_result_builder.now_token = now_token + # 2. We check up front if anything has changed, if it hasn't then there is # no point in going further. if not sync_result_builder.full_state: @@ -2263,7 +2244,7 @@ async def _generate_sync_entry_for_rooms( tags_by_room = await self.store.get_updated_tags( user_id, since_token.account_data_key ) - if not tags_by_room: + if not tags_by_room and not sticky_by_room: logger.debug("no-oping sync") return set(), set() @@ -2283,7 +2264,6 @@ async def _generate_sync_entry_for_rooms( tags_by_room = await self.store.get_tags_for_user(user_id) log_kv({"rooms_changed": len(room_changes.room_entries)}) - room_entries = room_changes.room_entries invited = room_changes.invited knocked = room_changes.knocked @@ -2301,6 +2281,7 @@ async def handle_room_entries(room_entry: "RoomSyncResultBuilder") -> None: ephemeral=ephemeral_by_room.get(room_entry.room_id, []), tags=tags_by_room.get(room_entry.room_id), account_data=account_data_by_room.get(room_entry.room_id, {}), + sticky_event_ids=sticky_by_room.get(room_entry.room_id, set()), always_include=sync_result_builder.full_state, ) logger.debug("Generated room entry for %s", room_entry.room_id) @@ -2311,14 +2292,6 @@ async def handle_room_entries(room_entry: "RoomSyncResultBuilder") -> None: sync_result_builder.invited.extend(invited) sync_result_builder.knocked.extend(knocked) - if self.hs_config.experimental.msc4354_enabled: - now_token, sticky_by_room = await self.sticky_events_by_room( - sync_result_builder, now_token, since_token - ) - if sticky_by_room: - await self._generate_sticky_events(sync_result_builder, sticky_by_room) - sync_result_builder.now_token = now_token - return set(newly_joined_rooms), set(newly_left_rooms) async def _have_rooms_changed( @@ -2695,6 +2668,7 @@ async def _generate_room_entry( ephemeral: List[JsonDict], tags: Optional[Mapping[str, JsonMapping]], account_data: Mapping[str, JsonMapping], + sticky_event_ids: Set[str], always_include: bool = False, ) -> None: """Populates the `joined` and `archived` section of `sync_result_builder` @@ -2724,6 +2698,7 @@ async def _generate_room_entry( tags: List of *all* tags for room, or None if there has been no change. 
account_data: List of new account data for room + sticky_event_ids: MSC4354 sticky events in the room, if any. always_include: Always include this room in the sync response, even if empty. """ @@ -2734,7 +2709,13 @@ async def _generate_room_entry( events = room_builder.events # We want to shortcut out as early as possible. - if not (always_include or account_data or ephemeral or full_state): + if not ( + always_include + or account_data + or ephemeral + or full_state + or sticky_event_ids + ): if events == [] and tags is None: return @@ -2818,6 +2799,7 @@ async def _generate_room_entry( or account_data_events or ephemeral or full_state + or sticky_event_ids ): return @@ -2864,6 +2846,22 @@ async def _generate_room_entry( if room_builder.rtype == "joined": unread_notifications: Dict[str, int] = {} + sticky_events: List[EventBase] = [] + if sticky_event_ids: + # remove sticky events that are in the timeline, else we will needlessly duplicate + # events. This is particularly important given the risk of sticky events spam since + # anyone can send sticky events, so halving the bandwidth on average for each sticky + # event is helpful. + timeline = {ev.event_id for ev in batch.events} + sticky_event_ids = sticky_event_ids.difference(timeline) + if sticky_event_ids: + sticky_event_map = await self.store.get_events(sticky_event_ids) + sticky_events = await filter_events_for_client( + self._storage_controllers, + sync_result_builder.sync_config.user.to_string(), + list(sticky_event_map.values()), + always_include_ids=frozenset(sticky_event_ids), + ) room_sync = JoinedSyncResult( room_id=room_id, timeline=batch, @@ -2874,7 +2872,7 @@ async def _generate_room_entry( unread_thread_notifications={}, summary=summary, unread_count=0, - sticky=[], + sticky=sticky_events, ) if room_sync or always_include: diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index d22e6099f83..398bf3598a7 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -126,6 +126,7 @@ async def get_sticky_events_in_rooms( from_id: int, to_id: int, now: int, + limit: int, ) -> Tuple[int, Dict[str, Set[str]]]: """ Fetch all the sticky events in the given rooms, from the given sticky stream ID. @@ -135,6 +136,7 @@ async def get_sticky_events_in_rooms( from_id: The sticky stream ID that sticky events should be returned from (exclusive). to_id: The sticky stream ID that sticky events should end at (inclusive). now: The current time in unix millis, used for skipping expired events. + limit: Max sticky events to return. If <= 0, no limit is applied. Returns: A tuple of (to_id, map[room_id, event_ids]) """ @@ -145,6 +147,7 @@ async def get_sticky_events_in_rooms( from_id, to_id, now, + limit, ) new_to_id = from_id room_to_events: Dict[str, Set[str]] = {} @@ -162,19 +165,23 @@ def _get_sticky_events_in_rooms_txn( from_id: int, to_id: int, now: int, + limit: int, ) -> List[Tuple[int, str, str]]: if len(room_ids) == 0: return [] clause, room_id_values = make_in_list_sql_clause( txn.database_engine, "room_id", room_ids ) - txn.execute( - f""" + query = f""" SELECT stream_id, room_id, event_id FROM sticky_events WHERE soft_failed != ? AND expires_at > ? AND stream_id > ? AND stream_id <= ? AND {clause} - """, - (True, now, from_id, to_id, *room_id_values), - ) + ORDER BY stream_id ASC + """ + params = (True, now, from_id, to_id, *room_id_values) + if limit > 0: + query += "LIMIT ?" 
+ params += (limit,) + txn.execute(query, params) return cast(List[Tuple[int, str, str]], txn.fetchall()) async def get_updated_sticky_events( From f0689cee5e606d176b94b87df55104a94ba00838 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Wed, 8 Oct 2025 10:01:47 +0100 Subject: [PATCH 31/34] Linting --- synapse/federation/sender/__init__.py | 3 +-- synapse/storage/databases/main/sticky_events.py | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index a9dd9b35aed..6b6e8e33391 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -518,9 +518,8 @@ def notify_new_server_joined(self, server: str, room_id: str) -> None: if not self.hs.config.experimental.msc4354_enabled: return # fire off a processing loop in the background - run_as_background_process( + self.hs.run_as_background_process( "process_new_server_joined_over_federation", - self.server_name, self._process_new_server_joined_over_federation, server, room_id, diff --git a/synapse/storage/databases/main/sticky_events.py b/synapse/storage/databases/main/sticky_events.py index 398bf3598a7..08fae9c67b1 100644 --- a/synapse/storage/databases/main/sticky_events.py +++ b/synapse/storage/databases/main/sticky_events.py @@ -32,7 +32,6 @@ from synapse.api.errors import AuthError from synapse.events import EventBase from synapse.events.snapshot import EventPersistencePair -from synapse.metrics.background_process_metrics import run_as_background_process from synapse.replication.tcp.streams._base import StickyEventsStream from synapse.storage.database import ( DatabasePool, @@ -75,7 +74,7 @@ def __init__( # Technically this means we will cleanup N times, once per event persister, maybe put on master? 
if self._can_write_to_sticky_events: - self._clock.looping_call( + self.clock.looping_call( self._run_background_cleanup, DELETE_EXPIRED_STICKY_EVENTS_MS ) @@ -629,8 +628,7 @@ def _now(self) -> int: return round(time.time() * 1000) def _run_background_cleanup(self) -> Deferred: - return run_as_background_process( + return self.hs.run_as_background_process( "delete_expired_sticky_events", - self.server_name, self._delete_expired_sticky_events, ) From b2c967fd1c09ffe08747e74704963f7c4c95bbee Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Wed, 8 Oct 2025 11:37:27 +0100 Subject: [PATCH 32/34] Fix query param name --- tests/rest/client/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/rest/client/utils.py b/tests/rest/client/utils.py index c34b1c4973c..3ca3b059b5d 100644 --- a/tests/rest/client/utils.py +++ b/tests/rest/client/utils.py @@ -470,7 +470,7 @@ def send_sticky_event( if txn_id is None: txn_id = "m%s" % (str(time.time())) - path = "/_matrix/client/r0/rooms/%s/send/%s/%s?msc4354_stick_duration_ms=%d" % ( + path = "/_matrix/client/r0/rooms/%s/send/%s/%s?org.matrix.msc4354.sticky_duration_ms=%d" % ( room_id, type, txn_id, From 4d02a4cca0f80421309ef76e818848b06f6d0d31 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Wed, 8 Oct 2025 11:59:17 +0100 Subject: [PATCH 33/34] Linting --- tests/rest/client/utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/rest/client/utils.py b/tests/rest/client/utils.py index 3ca3b059b5d..1002e9ef62c 100644 --- a/tests/rest/client/utils.py +++ b/tests/rest/client/utils.py @@ -470,11 +470,14 @@ def send_sticky_event( if txn_id is None: txn_id = "m%s" % (str(time.time())) - path = "/_matrix/client/r0/rooms/%s/send/%s/%s?org.matrix.msc4354.sticky_duration_ms=%d" % ( - room_id, - type, - txn_id, - duration_ms, + path = ( + "/_matrix/client/r0/rooms/%s/send/%s/%s?org.matrix.msc4354.sticky_duration_ms=%d" + % ( + room_id, + type, + txn_id, + duration_ms, + ) ) if tok: path = path + "&access_token=%s" % tok From a4cac852a9d6a6b633427b47fdefa1ccf62988d6 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Wed, 8 Oct 2025 12:50:21 +0100 Subject: [PATCH 34/34] Run MSC4354 tests --- scripts-dev/complement.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts-dev/complement.sh b/scripts-dev/complement.sh index c4d678b142d..2e1632dfeef 100755 --- a/scripts-dev/complement.sh +++ b/scripts-dev/complement.sh @@ -231,6 +231,7 @@ test_packages=( ./tests/msc4140 ./tests/msc4155 ./tests/msc4306 + ./tests/msc4354 ) # Enable dirty runs, so tests will reuse the same container where possible.
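
As a quick illustration of the client-facing behaviour exercised by the tests above, here is a minimal sketch of sending a sticky event from a client. It is not part of the patch series: the homeserver URL, access token, room ID and message body are placeholder assumptions, while the query parameter name and the /send path follow the code in this series (the server caps the duration at MAX_DURATION_MS, i.e. one hour).

    import time

    import requests  # assumed available; any HTTP client would do

    HOMESERVER = "http://localhost:8008"   # placeholder
    ACCESS_TOKEN = "syt_example_token"     # placeholder
    ROOM_ID = "!example:localhost"         # placeholder

    # Ask the server to keep the event "sticky" for 5 minutes.
    # Anything above MAX_DURATION_MS (1 hour) would be clamped server-side.
    duration_ms = 5 * 60 * 1000
    txn_id = "m%s" % time.time()

    resp = requests.put(
        f"{HOMESERVER}/_matrix/client/r0/rooms/{ROOM_ID}/send/m.room.message/{txn_id}",
        params={"org.matrix.msc4354.sticky_duration_ms": duration_ms},
        headers={"Authorization": f"Bearer {ACCESS_TOKEN}"},
        json={"msgtype": "m.text", "body": "I am sticky for five minutes"},
    )
    resp.raise_for_status()
    print(resp.json()["event_id"])

With msc4354_enabled set, the event is recorded in the sticky_events table and surfaced to /sync clients (deduplicated against the timeline) until it expires; without the query parameter the request behaves as an ordinary /send.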