client.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. # -*- coding: utf-8 -*-
  2. # Copyright 2014-2016 OpenMarket Ltd
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. from OpenSSL import SSL
  16. from OpenSSL.SSL import VERIFY_NONE
  17. from synapse.api.errors import (
  18. CodeMessageException, SynapseError, Codes,
  19. )
  20. from synapse.util.logcontext import preserve_context_over_fn
  21. import synapse.metrics
  22. from synapse.http.endpoint import SpiderEndpoint
  23. from canonicaljson import encode_canonical_json
  24. from twisted.internet import defer, reactor, ssl, protocol
  25. from twisted.internet.endpoints import SSL4ClientEndpoint, TCP4ClientEndpoint
  26. from twisted.web.client import (
  27. BrowserLikeRedirectAgent, ContentDecoderAgent, GzipDecoder, Agent,
  28. readBody, FileBodyProducer, PartialDownloadError,
  29. )
  30. from twisted.web.http import PotentialDataLoss
  31. from twisted.web.http_headers import Headers
  32. from twisted.web._newclient import ResponseDone
  33. from StringIO import StringIO
  34. import simplejson as json
  35. import logging
  36. import urllib
  37. logger = logging.getLogger(__name__)
  38. metrics = synapse.metrics.get_metrics_for(__name__)
  39. outgoing_requests_counter = metrics.register_counter(
  40. "requests",
  41. labels=["method"],
  42. )
  43. incoming_responses_counter = metrics.register_counter(
  44. "responses",
  45. labels=["method", "code"],
  46. )
  47. class SimpleHttpClient(object):
  48. """
  49. A simple, no-frills HTTP client with methods that wrap up common ways of
  50. using HTTP in Matrix
  51. """
  52. def __init__(self, hs):
  53. self.hs = hs
  54. # The default context factory in Twisted 14.0.0 (which we require) is
  55. # BrowserLikePolicyForHTTPS which will do regular cert validation
  56. # 'like a browser'
  57. self.agent = Agent(
  58. reactor,
  59. connectTimeout=15,
  60. contextFactory=hs.get_http_client_context_factory()
  61. )
  62. self.user_agent = hs.version_string
  63. if hs.config.user_agent_suffix:
  64. self.user_agent = "%s %s" % (self.user_agent, hs.config.user_agent_suffix,)
  65. def request(self, method, uri, *args, **kwargs):
  66. # A small wrapper around self.agent.request() so we can easily attach
  67. # counters to it
  68. outgoing_requests_counter.inc(method)
  69. d = preserve_context_over_fn(
  70. self.agent.request,
  71. method, uri, *args, **kwargs
  72. )
  73. logger.info("Sending request %s %s", method, uri)
  74. def _cb(response):
  75. incoming_responses_counter.inc(method, response.code)
  76. logger.info(
  77. "Received response to %s %s: %s",
  78. method, uri, response.code
  79. )
  80. return response
  81. def _eb(failure):
  82. incoming_responses_counter.inc(method, "ERR")
  83. logger.info(
  84. "Error sending request to %s %s: %s %s",
  85. method, uri, failure.type, failure.getErrorMessage()
  86. )
  87. return failure
  88. d.addCallbacks(_cb, _eb)
  89. return d
  90. @defer.inlineCallbacks
  91. def post_urlencoded_get_json(self, uri, args={}):
  92. # TODO: Do we ever want to log message contents?
  93. logger.debug("post_urlencoded_get_json args: %s", args)
  94. query_bytes = urllib.urlencode(encode_urlencode_args(args), True)
  95. response = yield self.request(
  96. "POST",
  97. uri.encode("ascii"),
  98. headers=Headers({
  99. b"Content-Type": [b"application/x-www-form-urlencoded"],
  100. b"User-Agent": [self.user_agent],
  101. }),
  102. bodyProducer=FileBodyProducer(StringIO(query_bytes))
  103. )
  104. body = yield preserve_context_over_fn(readBody, response)
  105. defer.returnValue(json.loads(body))
  106. @defer.inlineCallbacks
  107. def post_json_get_json(self, uri, post_json):
  108. json_str = encode_canonical_json(post_json)
  109. logger.debug("HTTP POST %s -> %s", json_str, uri)
  110. response = yield self.request(
  111. "POST",
  112. uri.encode("ascii"),
  113. headers=Headers({
  114. b"Content-Type": [b"application/json"],
  115. b"User-Agent": [self.user_agent],
  116. }),
  117. bodyProducer=FileBodyProducer(StringIO(json_str))
  118. )
  119. body = yield preserve_context_over_fn(readBody, response)
  120. defer.returnValue(json.loads(body))
  121. @defer.inlineCallbacks
  122. def get_json(self, uri, args={}):
  123. """ Gets some json from the given URI.
  124. Args:
  125. uri (str): The URI to request, not including query parameters
  126. args (dict): A dictionary used to create query strings, defaults to
  127. None.
  128. **Note**: The value of each key is assumed to be an iterable
  129. and *not* a string.
  130. Returns:
  131. Deferred: Succeeds when we get *any* 2xx HTTP response, with the
  132. HTTP body as JSON.
  133. Raises:
  134. On a non-2xx HTTP response. The response body will be used as the
  135. error message.
  136. """
  137. body = yield self.get_raw(uri, args)
  138. defer.returnValue(json.loads(body))
  139. @defer.inlineCallbacks
  140. def put_json(self, uri, json_body, args={}):
  141. """ Puts some json to the given URI.
  142. Args:
  143. uri (str): The URI to request, not including query parameters
  144. json_body (dict): The JSON to put in the HTTP body,
  145. args (dict): A dictionary used to create query strings, defaults to
  146. None.
  147. **Note**: The value of each key is assumed to be an iterable
  148. and *not* a string.
  149. Returns:
  150. Deferred: Succeeds when we get *any* 2xx HTTP response, with the
  151. HTTP body as JSON.
  152. Raises:
  153. On a non-2xx HTTP response.
  154. """
  155. if len(args):
  156. query_bytes = urllib.urlencode(args, True)
  157. uri = "%s?%s" % (uri, query_bytes)
  158. json_str = encode_canonical_json(json_body)
  159. response = yield self.request(
  160. "PUT",
  161. uri.encode("ascii"),
  162. headers=Headers({
  163. b"User-Agent": [self.user_agent],
  164. "Content-Type": ["application/json"]
  165. }),
  166. bodyProducer=FileBodyProducer(StringIO(json_str))
  167. )
  168. body = yield preserve_context_over_fn(readBody, response)
  169. if 200 <= response.code < 300:
  170. defer.returnValue(json.loads(body))
  171. else:
  172. # NB: This is explicitly not json.loads(body)'d because the contract
  173. # of CodeMessageException is a *string* message. Callers can always
  174. # load it into JSON if they want.
  175. raise CodeMessageException(response.code, body)
  176. @defer.inlineCallbacks
  177. def get_raw(self, uri, args={}):
  178. """ Gets raw text from the given URI.
  179. Args:
  180. uri (str): The URI to request, not including query parameters
  181. args (dict): A dictionary used to create query strings, defaults to
  182. None.
  183. **Note**: The value of each key is assumed to be an iterable
  184. and *not* a string.
  185. Returns:
  186. Deferred: Succeeds when we get *any* 2xx HTTP response, with the
  187. HTTP body at text.
  188. Raises:
  189. On a non-2xx HTTP response. The response body will be used as the
  190. error message.
  191. """
  192. if len(args):
  193. query_bytes = urllib.urlencode(args, True)
  194. uri = "%s?%s" % (uri, query_bytes)
  195. response = yield self.request(
  196. "GET",
  197. uri.encode("ascii"),
  198. headers=Headers({
  199. b"User-Agent": [self.user_agent],
  200. })
  201. )
  202. body = yield preserve_context_over_fn(readBody, response)
  203. if 200 <= response.code < 300:
  204. defer.returnValue(body)
  205. else:
  206. raise CodeMessageException(response.code, body)
  207. # XXX: FIXME: This is horribly copy-pasted from matrixfederationclient.
  208. # The two should be factored out.
  209. @defer.inlineCallbacks
  210. def get_file(self, url, output_stream, max_size=None):
  211. """GETs a file from a given URL
  212. Args:
  213. url (str): The URL to GET
  214. output_stream (file): File to write the response body to.
  215. Returns:
  216. A (int,dict,string,int) tuple of the file length, dict of the response
  217. headers, absolute URI of the response and HTTP response code.
  218. """
  219. response = yield self.request(
  220. "GET",
  221. url.encode("ascii"),
  222. headers=Headers({
  223. b"User-Agent": [self.user_agent],
  224. })
  225. )
  226. headers = dict(response.headers.getAllRawHeaders())
  227. if 'Content-Length' in headers and headers['Content-Length'] > max_size:
  228. logger.warn("Requested URL is too large > %r bytes" % (self.max_size,))
  229. raise SynapseError(
  230. 502,
  231. "Requested file is too large > %r bytes" % (self.max_size,),
  232. Codes.TOO_LARGE,
  233. )
  234. if response.code > 299:
  235. logger.warn("Got %d when downloading %s" % (response.code, url))
  236. raise SynapseError(
  237. 502,
  238. "Got error %d" % (response.code,),
  239. Codes.UNKNOWN,
  240. )
  241. # TODO: if our Content-Type is HTML or something, just read the first
  242. # N bytes into RAM rather than saving it all to disk only to read it
  243. # straight back in again
  244. try:
  245. length = yield preserve_context_over_fn(
  246. _readBodyToFile,
  247. response, output_stream, max_size
  248. )
  249. except Exception as e:
  250. logger.exception("Failed to download body")
  251. raise SynapseError(
  252. 502,
  253. ("Failed to download remote body: %s" % e),
  254. Codes.UNKNOWN,
  255. )
  256. defer.returnValue((length, headers, response.request.absoluteURI, response.code))
  257. # XXX: FIXME: This is horribly copy-pasted from matrixfederationclient.
  258. # The two should be factored out.
  259. class _ReadBodyToFileProtocol(protocol.Protocol):
  260. def __init__(self, stream, deferred, max_size):
  261. self.stream = stream
  262. self.deferred = deferred
  263. self.length = 0
  264. self.max_size = max_size
  265. def dataReceived(self, data):
  266. self.stream.write(data)
  267. self.length += len(data)
  268. if self.max_size is not None and self.length >= self.max_size:
  269. self.deferred.errback(SynapseError(
  270. 502,
  271. "Requested file is too large > %r bytes" % (self.max_size,),
  272. Codes.TOO_LARGE,
  273. ))
  274. self.deferred = defer.Deferred()
  275. self.transport.loseConnection()
  276. def connectionLost(self, reason):
  277. if reason.check(ResponseDone):
  278. self.deferred.callback(self.length)
  279. elif reason.check(PotentialDataLoss):
  280. # stolen from https://github.com/twisted/treq/pull/49/files
  281. # http://twistedmatrix.com/trac/ticket/4840
  282. self.deferred.callback(self.length)
  283. else:
  284. self.deferred.errback(reason)
  285. # XXX: FIXME: This is horribly copy-pasted from matrixfederationclient.
  286. # The two should be factored out.
  287. def _readBodyToFile(response, stream, max_size):
  288. d = defer.Deferred()
  289. response.deliverBody(_ReadBodyToFileProtocol(stream, d, max_size))
  290. return d
  291. class CaptchaServerHttpClient(SimpleHttpClient):
  292. """
  293. Separate HTTP client for talking to google's captcha servers
  294. Only slightly special because accepts partial download responses
  295. used only by c/s api v1
  296. """
  297. @defer.inlineCallbacks
  298. def post_urlencoded_get_raw(self, url, args={}):
  299. query_bytes = urllib.urlencode(encode_urlencode_args(args), True)
  300. response = yield self.request(
  301. "POST",
  302. url.encode("ascii"),
  303. bodyProducer=FileBodyProducer(StringIO(query_bytes)),
  304. headers=Headers({
  305. b"Content-Type": [b"application/x-www-form-urlencoded"],
  306. b"User-Agent": [self.user_agent],
  307. })
  308. )
  309. try:
  310. body = yield preserve_context_over_fn(readBody, response)
  311. defer.returnValue(body)
  312. except PartialDownloadError as e:
  313. # twisted dislikes google's response, no content length.
  314. defer.returnValue(e.response)
  315. class SpiderEndpointFactory(object):
  316. def __init__(self, hs):
  317. self.blacklist = hs.config.url_preview_ip_range_blacklist
  318. self.policyForHTTPS = hs.get_http_client_context_factory()
  319. def endpointForURI(self, uri):
  320. logger.info("Getting endpoint for %s", uri.toBytes())
  321. if uri.scheme == "http":
  322. return SpiderEndpoint(
  323. reactor, uri.host, uri.port, self.blacklist,
  324. endpoint=TCP4ClientEndpoint,
  325. endpoint_kw_args={
  326. 'timeout': 15
  327. },
  328. )
  329. elif uri.scheme == "https":
  330. tlsPolicy = self.policyForHTTPS.creatorForNetloc(uri.host, uri.port)
  331. return SpiderEndpoint(
  332. reactor, uri.host, uri.port, self.blacklist,
  333. endpoint=SSL4ClientEndpoint,
  334. endpoint_kw_args={
  335. 'sslContextFactory': tlsPolicy,
  336. 'timeout': 15
  337. },
  338. )
  339. else:
  340. logger.warn("Can't get endpoint for unrecognised scheme %s", uri.scheme)
  341. class SpiderHttpClient(SimpleHttpClient):
  342. """
  343. Separate HTTP client for spidering arbitrary URLs.
  344. Special in that it follows retries and has a UA that looks
  345. like a browser.
  346. used by the preview_url endpoint in the content repo.
  347. """
  348. def __init__(self, hs):
  349. SimpleHttpClient.__init__(self, hs)
  350. # clobber the base class's agent and UA:
  351. self.agent = ContentDecoderAgent(
  352. BrowserLikeRedirectAgent(
  353. Agent.usingEndpointFactory(
  354. reactor,
  355. SpiderEndpointFactory(hs)
  356. )
  357. ), [('gzip', GzipDecoder)]
  358. )
  359. # We could look like Chrome:
  360. # self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko)
  361. # Chrome Safari" % hs.version_string)
  362. def encode_urlencode_args(args):
  363. return {k: encode_urlencode_arg(v) for k, v in args.items()}
  364. def encode_urlencode_arg(arg):
  365. if isinstance(arg, unicode):
  366. return arg.encode('utf-8')
  367. elif isinstance(arg, list):
  368. return [encode_urlencode_arg(i) for i in arg]
  369. else:
  370. return arg
  371. def _print_ex(e):
  372. if hasattr(e, "reasons") and e.reasons:
  373. for ex in e.reasons:
  374. _print_ex(ex)
  375. else:
  376. logger.exception(e)
  377. class InsecureInterceptableContextFactory(ssl.ContextFactory):
  378. """
  379. Factory for PyOpenSSL SSL contexts which accepts any certificate for any domain.
  380. Do not use this since it allows an attacker to intercept your communications.
  381. """
  382. def __init__(self):
  383. self._context = SSL.Context(SSL.SSLv23_METHOD)
  384. self._context.set_verify(VERIFY_NONE, lambda *_: None)
  385. def getContext(self, hostname, port):
  386. return self._context