diff --git a/changelog.d/19026.misc b/changelog.d/19026.misc new file mode 100644 index 00000000000..6dfa60acf90 --- /dev/null +++ b/changelog.d/19026.misc @@ -0,0 +1,3 @@ +Reduces the SRV DNS record lookup timeout to 15 seconds. +This fixes issues when DNS lookups hang, as the default timeout of 60 seconds matches the timeout for the federation request itself, +thus we see the HTTP request timeout and not the actual DNS error. diff --git a/synapse/http/federation/srv_resolver.py b/synapse/http/federation/srv_resolver.py index 639bf309d67..642e273e6a5 100644 --- a/synapse/http/federation/srv_resolver.py +++ b/synapse/http/federation/srv_resolver.py @@ -26,6 +26,7 @@ import attr +from twisted.internet import defer from twisted.internet.error import ConnectError from twisted.names import client, dns from twisted.names.error import DNSNameError, DNSNotImplementedError, DomainError @@ -145,7 +146,37 @@ async def resolve_service(self, service_name: bytes) -> List[Server]: try: answers, _, _ = await make_deferred_yieldable( - self._dns_client.lookupService(service_name) + self._dns_client.lookupService( + service_name, + # This is a sequence of ints that represent the "number of seconds + # after which to reissue the query. When the last timeout expires, + # the query is considered failed." The default value in Twisted is + # `timeout=(1, 3, 11, 45)` (60s total) which is an "arbitrary" + # exponential backoff sequence and is too long (see below). + # + # We want the total timeout to be below the overarching HTTP request + # timeout (60s for federation requests) that spurred on this lookup. + # This way, we can see the underlying DNS failure and move on + # instead of the user ending up with a generic HTTP request timeout. + # + # Since these DNS queries are done over UDP (unreliable transport), + # by its nature, it's bound to occasionally fail (dropped packets, + # etc). 
We want a list that starts small and re-issues DNS queries + # multiple times until we get a response or timeout. + timeout=( + 1, # Quick retry for packet loss scenarios + 3, # Still reasonable for slow responders + 3, # ... + 3, # Already catching 99.9% of successful queries at 10s + # Final attempt for extreme edge cases. + # + # TODO: In the future, we could consider removing this extra + # time if we don't see complaints. For comparison, the Windows + # DNS resolver gives up after 10s using `(1, 1, 2, 4, 2)`, see + # https://learn.microsoft.com/en-us/troubleshoot/windows-server/networking/dns-client-resolution-timeouts + 5, + ), + ) ) except DNSNameError: # TODO: cache this. We can get the SOA out of the exception, and use @@ -165,6 +196,10 @@ async def resolve_service(self, service_name: bytes) -> List[Server]: return list(cache_entry) else: raise e + except defer.TimeoutError as e: + raise defer.TimeoutError( + f"Could not resolve DNS for SRV record {service_name!r} due to timeout (15s total)" + ) from e if ( len(answers) == 1 diff --git a/tests/http/federation/test_srv_resolver.py b/tests/http/federation/test_srv_resolver.py index a359b0a1415..47c5eedd81a 100644 --- a/tests/http/federation/test_srv_resolver.py +++ b/tests/http/federation/test_srv_resolver.py @@ -68,7 +68,9 @@ def do_lookup() -> Generator["Deferred[object]", object, List[Server]]: test_d = do_lookup() self.assertNoResult(test_d) - dns_client_mock.lookupService.assert_called_once_with(service_name) + dns_client_mock.lookupService.assert_called_once_with( + service_name, timeout=(1, 3, 3, 3, 5) + ) result_deferred.callback(([answer_srv], None, None)) @@ -98,7 +100,9 @@ def test_from_cache_expired_and_dns_fail( servers: List[Server] servers = yield defer.ensureDeferred(resolver.resolve_service(service_name)) # type: ignore[assignment] - dns_client_mock.lookupService.assert_called_once_with(service_name) + dns_client_mock.lookupService.assert_called_once_with( + service_name, timeout=(1, 3, 
3, 5) + ) self.assertEqual(len(servers), 1) self.assertEqual(servers, cache[service_name])