# well_known_resolver.py
  1. # -*- coding: utf-8 -*-
  2. # Copyright 2019 The Matrix.org Foundation C.I.C.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import logging
  16. import random
  17. import time
  18. from typing import Callable, Dict, Optional, Tuple
  19. import attr
  20. from twisted.internet import defer
  21. from twisted.web.client import RedirectAgent, readBody
  22. from twisted.web.http import stringToDatetime
  23. from twisted.web.http_headers import Headers
  24. from twisted.web.iweb import IResponse
  25. from synapse.logging.context import make_deferred_yieldable
  26. from synapse.util import Clock, json_decoder
  27. from synapse.util.caches.ttlcache import TTLCache
  28. from synapse.util.metrics import Measure
# period to cache .well-known results for by default
WELL_KNOWN_DEFAULT_CACHE_PERIOD = 24 * 3600

# jitter factor to add to the .well-known default cache ttls
WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER = 0.1

# period to cache failure to fetch .well-known for
WELL_KNOWN_INVALID_CACHE_PERIOD = 1 * 3600

# period to cache failure to fetch .well-known if there has recently been a
# valid well-known for that domain.
WELL_KNOWN_DOWN_CACHE_PERIOD = 2 * 60

# period to remember there was a valid well-known after valid record expires
WELL_KNOWN_REMEMBER_DOMAIN_HAD_VALID = 2 * 3600

# cap for .well-known cache period
WELL_KNOWN_MAX_CACHE_PERIOD = 48 * 3600

# lower bound for .well-known cache period
WELL_KNOWN_MIN_CACHE_PERIOD = 5 * 60

# Attempt to refetch a cached well-known N% of the TTL before it expires.
# e.g. if set to 0.2 and we have a cached entry with a TTL of 5mins, then
# we'll start trying to refetch 1 minute before it expires.
WELL_KNOWN_GRACE_PERIOD_FACTOR = 0.2

# Number of times we retry fetching a well-known for a domain we know recently
# had a valid entry.
WELL_KNOWN_RETRY_ATTEMPTS = 3


logger = logging.getLogger(__name__)

# Process-global caches, shared between all WellKnownResolver instances unless
# a specific cache is passed to the constructor.
_well_known_cache = TTLCache("well-known")
_had_valid_well_known_cache = TTLCache("had-valid-well-known")
@attr.s(slots=True, frozen=True)
class WellKnownLookupResult:
    """The outcome of a .well-known lookup for a server name."""

    # The delegated server name from the .well-known record (as bytes), or
    # None if there was no (valid) .well-known file for the server.
    delegated_server = attr.ib()
class WellKnownResolver:
    """Handles well-known lookups for matrix servers.

    Lookup results (including failures) are cached with a TTL, and a second
    cache remembers which servers recently had a *valid* .well-known so that
    transient failures for those servers are cached for a shorter period and
    retried more aggressively.
    """

    def __init__(
        self,
        reactor,
        agent,
        user_agent,
        well_known_cache=None,
        had_well_known_cache=None,
    ):
        # Args:
        #   reactor: twisted reactor, used to drive the Clock and to read the
        #       current time for cache-period calculations
        #   agent: twisted web agent used for the .well-known HTTP requests
        #   user_agent: value sent as the User-Agent header on those requests
        #   well_known_cache: TTLCache of lookup results; defaults to the
        #       process-global _well_known_cache
        #   had_well_known_cache: TTLCache recording servers that recently had
        #       a valid .well-known; defaults to the process-global cache
        self._reactor = reactor
        self._clock = Clock(reactor)

        if well_known_cache is None:
            well_known_cache = _well_known_cache

        if had_well_known_cache is None:
            had_well_known_cache = _had_valid_well_known_cache

        self._well_known_cache = well_known_cache
        self._had_valid_well_known_cache = had_well_known_cache
        # Wrap the agent so HTTP redirects on the .well-known URL are followed
        # transparently.
        self._well_known_agent = RedirectAgent(agent)
        self.user_agent = user_agent

    async def get_well_known(self, server_name: bytes) -> WellKnownLookupResult:
        """Attempt to fetch and parse a .well-known file for the given server

        Args:
            server_name: name of the server, from the requested url

        Returns:
            The result of the lookup
        """
        try:
            prev_result, expiry, ttl = self._well_known_cache.get_with_expiry(
                server_name
            )

            now = self._clock.time()
            # Use the cached result only while it is outside the "grace
            # period" at the end of its TTL; once inside it, fall through and
            # refetch, keeping prev_result as a fallback on temporary failure.
            if now < expiry - WELL_KNOWN_GRACE_PERIOD_FACTOR * ttl:
                return WellKnownLookupResult(delegated_server=prev_result)
        except KeyError:
            prev_result = None

        # TODO: should we linearise so that we don't end up doing two .well-known
        # requests for the same server in parallel?
        try:
            with Measure(self._clock, "get_well_known"):
                result, cache_period = await self._fetch_well_known(
                    server_name
                )  # type: Tuple[Optional[bytes], float]

        except _FetchWellKnownFailure as e:
            if prev_result and e.temporary:
                # This is a temporary failure and we have a still valid cached
                # result, so lets return that. Hopefully the next time we ask
                # the remote will be back up again.
                return WellKnownLookupResult(delegated_server=prev_result)

            result = None

            if self._had_valid_well_known_cache.get(server_name, False):
                # We have recently seen a valid well-known record for this
                # server, so we cache the lack of well-known for a shorter time.
                cache_period = WELL_KNOWN_DOWN_CACHE_PERIOD
            else:
                cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD

            # add some randomness to the TTL to avoid a stampeding herd
            cache_period *= random.uniform(
                1 - WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
                1 + WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
            )

        # cache_period == 0 means "do not cache" (e.g. Cache-Control no-store).
        if cache_period > 0:
            self._well_known_cache.set(server_name, result, cache_period)

        return WellKnownLookupResult(delegated_server=result)

    async def _fetch_well_known(self, server_name: bytes) -> Tuple[bytes, float]:
        """Actually fetch and parse a .well-known, without checking the cache

        Args:
            server_name: name of the server, from the requested url

        Raises:
            _FetchWellKnownFailure if we fail to lookup a result

        Returns:
            The lookup result and cache period.
        """

        # Only retry the request if this server recently served a valid
        # .well-known record (so a failure now is likely transient).
        had_valid_well_known = self._had_valid_well_known_cache.get(server_name, False)

        # We do this in two steps to differentiate between possibly transient
        # errors (e.g. can't connect to host, 503 response) and more permenant
        # errors (such as getting a 404 response).
        response, body = await self._make_well_known_request(
            server_name, retry=had_valid_well_known
        )

        try:
            if response.code != 200:
                raise Exception("Non-200 response %s" % (response.code,))

            parsed_body = json_decoder.decode(body.decode("utf-8"))
            logger.info("Response from .well-known: %s", parsed_body)

            result = parsed_body["m.server"].encode("ascii")
        except defer.CancelledError:
            # Bail if we've been cancelled
            raise
        except Exception as e:
            # Anything else (non-200, invalid JSON/UTF-8, missing "m.server")
            # is treated as a permanent failure: temporary=False.
            logger.info("Error parsing well-known for %s: %s", server_name, e)
            raise _FetchWellKnownFailure(temporary=False)

        cache_period = _cache_period_from_headers(
            response.headers, time_now=self._reactor.seconds
        )
        if cache_period is None:
            cache_period = WELL_KNOWN_DEFAULT_CACHE_PERIOD
            # add some randomness to the TTL to avoid a stampeding herd every 24 hours
            # after startup
            cache_period *= random.uniform(
                1 - WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
                1 + WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
            )
        else:
            # Clamp header-supplied periods to sane bounds.
            cache_period = min(cache_period, WELL_KNOWN_MAX_CACHE_PERIOD)
            cache_period = max(cache_period, WELL_KNOWN_MIN_CACHE_PERIOD)

        # We got a success, mark as such in the cache
        self._had_valid_well_known_cache.set(
            server_name,
            bool(result),
            cache_period + WELL_KNOWN_REMEMBER_DOMAIN_HAD_VALID,
        )

        return result, cache_period

    async def _make_well_known_request(
        self, server_name: bytes, retry: bool
    ) -> Tuple[IResponse, bytes]:
        """Make the well known request.

        This will retry the request if requested and it fails (with unable
        to connect or receives a 5xx error).

        Args:
            server_name: name of the server, from the requested url
            retry: Whether to retry the request if it fails.

        Returns:
            Returns the response object and body. Response may be a non-200 response.
        """
        uri = b"https://%s/.well-known/matrix/server" % (server_name,)
        uri_str = uri.decode("ascii")

        headers = {
            b"User-Agent": [self.user_agent],
        }

        i = 0
        while True:
            i += 1

            logger.info("Fetching %s", uri_str)
            try:
                response = await make_deferred_yieldable(
                    self._well_known_agent.request(
                        b"GET", uri, headers=Headers(headers)
                    )
                )
                body = await make_deferred_yieldable(readBody(response))

                # A 5xx is raised (and possibly retried below); any other
                # status, including 404, is returned to the caller.
                if 500 <= response.code < 600:
                    raise Exception("Non-200 response %s" % (response.code,))

                return response, body
            except defer.CancelledError:
                # Bail if we've been cancelled
                raise
            except Exception as e:
                if not retry or i >= WELL_KNOWN_RETRY_ATTEMPTS:
                    logger.info("Error fetching %s: %s", uri_str, e)
                    raise _FetchWellKnownFailure(temporary=True)

                logger.info("Error fetching %s: %s. Retrying", uri_str, e)

            # Sleep briefly in the hopes that they come back up
            await self._clock.sleep(0.5)
  212. def _cache_period_from_headers(
  213. headers: Headers, time_now: Callable[[], float] = time.time
  214. ) -> Optional[float]:
  215. cache_controls = _parse_cache_control(headers)
  216. if b"no-store" in cache_controls:
  217. return 0
  218. if b"max-age" in cache_controls:
  219. max_age = cache_controls[b"max-age"]
  220. if max_age:
  221. try:
  222. return int(max_age)
  223. except ValueError:
  224. pass
  225. expires = headers.getRawHeaders(b"expires")
  226. if expires is not None:
  227. try:
  228. expires_date = stringToDatetime(expires[-1])
  229. return expires_date - time_now()
  230. except ValueError:
  231. # RFC7234 says 'A cache recipient MUST interpret invalid date formats,
  232. # especially the value "0", as representing a time in the past (i.e.,
  233. # "already expired").
  234. return 0
  235. return None
  236. def _parse_cache_control(headers: Headers) -> Dict[bytes, Optional[bytes]]:
  237. cache_controls = {}
  238. for hdr in headers.getRawHeaders(b"cache-control", []):
  239. for directive in hdr.split(b","):
  240. splits = [x.strip() for x in directive.split(b"=", 1)]
  241. k = splits[0].lower()
  242. v = splits[1] if len(splits) > 1 else None
  243. cache_controls[k] = v
  244. return cache_controls
@attr.s()
class _FetchWellKnownFailure(Exception):
    """Raised when fetching or parsing a .well-known file fails."""

    # True if we didn't get a non-5xx HTTP response, i.e. this may or may not be
    # a temporary failure.
    temporary = attr.ib()