proxyagent.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. # Copyright 2019 The Matrix.org Foundation C.I.C.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import logging
  15. import re
  16. from typing import Any, Dict, Optional, Tuple
  17. from urllib.parse import urlparse
  18. from urllib.request import ( # type: ignore[attr-defined]
  19. getproxies_environment,
  20. proxy_bypass_environment,
  21. )
  22. from zope.interface import implementer
  23. from twisted.internet import defer
  24. from twisted.internet.endpoints import HostnameEndpoint, wrapClientTLS
  25. from twisted.internet.interfaces import IReactorCore, IStreamClientEndpoint
  26. from twisted.python.failure import Failure
  27. from twisted.web.client import (
  28. URI,
  29. BrowserLikePolicyForHTTPS,
  30. HTTPConnectionPool,
  31. _AgentBase,
  32. )
  33. from twisted.web.error import SchemeNotSupported
  34. from twisted.web.http_headers import Headers
  35. from twisted.web.iweb import IAgent, IBodyProducer, IPolicyForHTTPS
  36. from synapse.http.connectproxyclient import HTTPConnectProxyEndpoint, ProxyCredentials
  37. from synapse.types import ISynapseReactor
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Matches a non-empty byte string consisting solely of bytes 0x21-0x7e, i.e.
# printable ASCII with no spaces or control characters. `ProxyAgent.request`
# rejects any URI that does not match.
_VALID_URI = re.compile(rb"\A[\x21-\x7e]+\Z")
  40. @implementer(IAgent)
  41. class ProxyAgent(_AgentBase):
  42. """An Agent implementation which will use an HTTP proxy if one was requested
  43. Args:
  44. reactor: twisted reactor to place outgoing
  45. connections.
  46. proxy_reactor: twisted reactor to use for connections to the proxy server
  47. reactor might have some blacklisting applied (i.e. for DNS queries),
  48. but we need unblocked access to the proxy.
  49. contextFactory: A factory for TLS contexts, to control the
  50. verification parameters of OpenSSL. The default is to use a
  51. `BrowserLikePolicyForHTTPS`, so unless you have special
  52. requirements you can leave this as-is.
  53. connectTimeout: The amount of time that this Agent will wait
  54. for the peer to accept a connection, in seconds. If 'None',
  55. HostnameEndpoint's default (30s) will be used.
  56. This is used for connections to both proxies and destination servers.
  57. bindAddress: The local address for client sockets to bind to.
  58. pool: connection pool to be used. If None, a
  59. non-persistent pool instance will be created.
  60. use_proxy: Whether proxy settings should be discovered and used
  61. from conventional environment variables.
  62. Raises:
  63. ValueError if use_proxy is set and the environment variables
  64. contain an invalid proxy specification.
  65. RuntimeError if no tls_options_factory is given for a https connection
  66. """
  67. def __init__(
  68. self,
  69. reactor: IReactorCore,
  70. proxy_reactor: Optional[ISynapseReactor] = None,
  71. contextFactory: Optional[IPolicyForHTTPS] = None,
  72. connectTimeout: Optional[float] = None,
  73. bindAddress: Optional[bytes] = None,
  74. pool: Optional[HTTPConnectionPool] = None,
  75. use_proxy: bool = False,
  76. ):
  77. contextFactory = contextFactory or BrowserLikePolicyForHTTPS()
  78. _AgentBase.__init__(self, reactor, pool)
  79. if proxy_reactor is None:
  80. self.proxy_reactor = reactor
  81. else:
  82. self.proxy_reactor = proxy_reactor
  83. self._endpoint_kwargs: Dict[str, Any] = {}
  84. if connectTimeout is not None:
  85. self._endpoint_kwargs["timeout"] = connectTimeout
  86. if bindAddress is not None:
  87. self._endpoint_kwargs["bindAddress"] = bindAddress
  88. http_proxy = None
  89. https_proxy = None
  90. no_proxy = None
  91. if use_proxy:
  92. proxies = getproxies_environment()
  93. http_proxy = proxies["http"].encode() if "http" in proxies else None
  94. https_proxy = proxies["https"].encode() if "https" in proxies else None
  95. no_proxy = proxies["no"] if "no" in proxies else None
  96. self.http_proxy_endpoint, self.http_proxy_creds = http_proxy_endpoint(
  97. http_proxy, self.proxy_reactor, contextFactory, **self._endpoint_kwargs
  98. )
  99. self.https_proxy_endpoint, self.https_proxy_creds = http_proxy_endpoint(
  100. https_proxy, self.proxy_reactor, contextFactory, **self._endpoint_kwargs
  101. )
  102. self.no_proxy = no_proxy
  103. self._policy_for_https = contextFactory
  104. self._reactor = reactor
  105. def request(
  106. self,
  107. method: bytes,
  108. uri: bytes,
  109. headers: Optional[Headers] = None,
  110. bodyProducer: Optional[IBodyProducer] = None,
  111. ) -> defer.Deferred:
  112. """
  113. Issue a request to the server indicated by the given uri.
  114. Supports `http` and `https` schemes.
  115. An existing connection from the connection pool may be used or a new one may be
  116. created.
  117. See also: twisted.web.iweb.IAgent.request
  118. Args:
  119. method: The request method to use, such as `GET`, `POST`, etc
  120. uri: The location of the resource to request.
  121. headers: Extra headers to send with the request
  122. bodyProducer: An object which can generate bytes to make up the body of
  123. this request (for example, the properly encoded contents of a file for
  124. a file upload). Or, None if the request is to have no body.
  125. Returns:
  126. Deferred[IResponse]: completes when the header of the response has
  127. been received (regardless of the response status code).
  128. Can fail with:
  129. SchemeNotSupported: if the uri is not http or https
  130. twisted.internet.error.TimeoutError if the server we are connecting
  131. to (proxy or destination) does not accept a connection before
  132. connectTimeout.
  133. ... other things too.
  134. """
  135. uri = uri.strip()
  136. if not _VALID_URI.match(uri):
  137. raise ValueError(f"Invalid URI {uri!r}")
  138. parsed_uri = URI.fromBytes(uri)
  139. pool_key = f"{parsed_uri.scheme!r}{parsed_uri.host!r}{parsed_uri.port}"
  140. request_path = parsed_uri.originForm
  141. should_skip_proxy = False
  142. if self.no_proxy is not None:
  143. should_skip_proxy = proxy_bypass_environment(
  144. parsed_uri.host.decode(),
  145. proxies={"no": self.no_proxy},
  146. )
  147. if (
  148. parsed_uri.scheme == b"http"
  149. and self.http_proxy_endpoint
  150. and not should_skip_proxy
  151. ):
  152. # Determine whether we need to set Proxy-Authorization headers
  153. if self.http_proxy_creds:
  154. # Set a Proxy-Authorization header
  155. if headers is None:
  156. headers = Headers()
  157. headers.addRawHeader(
  158. b"Proxy-Authorization",
  159. self.http_proxy_creds.as_proxy_authorization_value(),
  160. )
  161. # Cache *all* connections under the same key, since we are only
  162. # connecting to a single destination, the proxy:
  163. pool_key = "http-proxy"
  164. endpoint = self.http_proxy_endpoint
  165. request_path = uri
  166. elif (
  167. parsed_uri.scheme == b"https"
  168. and self.https_proxy_endpoint
  169. and not should_skip_proxy
  170. ):
  171. endpoint = HTTPConnectProxyEndpoint(
  172. self.proxy_reactor,
  173. self.https_proxy_endpoint,
  174. parsed_uri.host,
  175. parsed_uri.port,
  176. self.https_proxy_creds,
  177. )
  178. else:
  179. # not using a proxy
  180. endpoint = HostnameEndpoint(
  181. self._reactor, parsed_uri.host, parsed_uri.port, **self._endpoint_kwargs
  182. )
  183. logger.debug("Requesting %s via %s", uri, endpoint)
  184. if parsed_uri.scheme == b"https":
  185. tls_connection_creator = self._policy_for_https.creatorForNetloc(
  186. parsed_uri.host, parsed_uri.port
  187. )
  188. endpoint = wrapClientTLS(tls_connection_creator, endpoint)
  189. elif parsed_uri.scheme == b"http":
  190. pass
  191. else:
  192. return defer.fail(
  193. Failure(
  194. SchemeNotSupported("Unsupported scheme: %r" % (parsed_uri.scheme,))
  195. )
  196. )
  197. return self._requestWithEndpoint(
  198. pool_key, endpoint, method, parsed_uri, headers, bodyProducer, request_path
  199. )
  200. def http_proxy_endpoint(
  201. proxy: Optional[bytes],
  202. reactor: IReactorCore,
  203. tls_options_factory: Optional[IPolicyForHTTPS],
  204. **kwargs: object,
  205. ) -> Tuple[Optional[IStreamClientEndpoint], Optional[ProxyCredentials]]:
  206. """Parses an http proxy setting and returns an endpoint for the proxy
  207. Args:
  208. proxy: the proxy setting in the form: [scheme://][<username>:<password>@]<host>[:<port>]
  209. This currently supports http:// and https:// proxies.
  210. A hostname without scheme is assumed to be http.
  211. reactor: reactor to be used to connect to the proxy
  212. tls_options_factory: the TLS options to use when connecting through a https proxy
  213. kwargs: other args to be passed to HostnameEndpoint
  214. Returns:
  215. a tuple of
  216. endpoint to use to connect to the proxy, or None
  217. ProxyCredentials or if no credentials were found, or None
  218. Raise:
  219. ValueError if proxy has no hostname or unsupported scheme.
  220. RuntimeError if no tls_options_factory is given for a https connection
  221. """
  222. if proxy is None:
  223. return None, None
  224. # Note: urlsplit/urlparse cannot be used here as that does not work (for Python
  225. # 3.9+) on scheme-less proxies, e.g. host:port.
  226. scheme, host, port, credentials = parse_proxy(proxy)
  227. proxy_endpoint = HostnameEndpoint(reactor, host, port, **kwargs)
  228. if scheme == b"https":
  229. if tls_options_factory:
  230. tls_options = tls_options_factory.creatorForNetloc(host, port)
  231. proxy_endpoint = wrapClientTLS(tls_options, proxy_endpoint)
  232. else:
  233. raise RuntimeError(
  234. f"No TLS options for a https connection via proxy {proxy!s}"
  235. )
  236. return proxy_endpoint, credentials
  237. def parse_proxy(
  238. proxy: bytes, default_scheme: bytes = b"http", default_port: int = 1080
  239. ) -> Tuple[bytes, bytes, int, Optional[ProxyCredentials]]:
  240. """
  241. Parse a proxy connection string.
  242. Given a HTTP proxy URL, breaks it down into components and checks that it
  243. has a hostname (otherwise it is not useful to us when trying to find a
  244. proxy) and asserts that the URL has a scheme we support.
  245. Args:
  246. proxy: The proxy connection string. Must be in the form '[scheme://][<username>:<password>@]host[:port]'.
  247. default_scheme: The default scheme to return if one is not found in `proxy`. Defaults to http
  248. default_port: The default port to return if one is not found in `proxy`. Defaults to 1080
  249. Returns:
  250. A tuple containing the scheme, hostname, port and ProxyCredentials.
  251. If no credentials were found, the ProxyCredentials instance is replaced with None.
  252. Raise:
  253. ValueError if proxy has no hostname or unsupported scheme.
  254. """
  255. # First check if we have a scheme present
  256. # Note: urlsplit/urlparse cannot be used (for Python # 3.9+) on scheme-less proxies, e.g. host:port.
  257. if b"://" not in proxy:
  258. proxy = b"".join([default_scheme, b"://", proxy])
  259. url = urlparse(proxy)
  260. if not url.hostname:
  261. raise ValueError("Proxy URL did not contain a hostname! Please specify one.")
  262. if url.scheme not in (b"http", b"https"):
  263. raise ValueError(
  264. f"Unknown proxy scheme {url.scheme!s}; only 'http' and 'https' is supported."
  265. )
  266. credentials = None
  267. if url.username and url.password:
  268. credentials = ProxyCredentials(b"".join([url.username, b":", url.password]))
  269. return url.scheme, url.hostname, url.port or default_port, credentials