client.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494
  1. # -*- coding: utf-8 -*-
  2. # Copyright 2014-2016 OpenMarket Ltd
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. from OpenSSL import SSL
  16. from OpenSSL.SSL import VERIFY_NONE
  17. from synapse.api.errors import (
  18. CodeMessageException, SynapseError, Codes,
  19. )
  20. from synapse.util.logcontext import preserve_context_over_fn
  21. import synapse.metrics
  22. from synapse.http.endpoint import SpiderEndpoint
  23. from canonicaljson import encode_canonical_json
  24. from twisted.internet import defer, reactor, ssl, protocol, task
  25. from twisted.internet.endpoints import SSL4ClientEndpoint, TCP4ClientEndpoint
  26. from twisted.web.client import (
  27. BrowserLikeRedirectAgent, ContentDecoderAgent, GzipDecoder, Agent,
  28. readBody, PartialDownloadError,
  29. )
  30. from twisted.web.client import FileBodyProducer as TwistedFileBodyProducer
  31. from twisted.web.http import PotentialDataLoss
  32. from twisted.web.http_headers import Headers
  33. from twisted.web._newclient import ResponseDone
  34. from StringIO import StringIO
  35. import simplejson as json
  36. import logging
  37. import urllib
  38. logger = logging.getLogger(__name__)
  39. metrics = synapse.metrics.get_metrics_for(__name__)
  40. outgoing_requests_counter = metrics.register_counter(
  41. "requests",
  42. labels=["method"],
  43. )
  44. incoming_responses_counter = metrics.register_counter(
  45. "responses",
  46. labels=["method", "code"],
  47. )
  48. class SimpleHttpClient(object):
  49. """
  50. A simple, no-frills HTTP client with methods that wrap up common ways of
  51. using HTTP in Matrix
  52. """
  53. def __init__(self, hs):
  54. self.hs = hs
  55. # The default context factory in Twisted 14.0.0 (which we require) is
  56. # BrowserLikePolicyForHTTPS which will do regular cert validation
  57. # 'like a browser'
  58. self.agent = Agent(
  59. reactor,
  60. connectTimeout=15,
  61. contextFactory=hs.get_http_client_context_factory()
  62. )
  63. self.user_agent = hs.version_string
  64. if hs.config.user_agent_suffix:
  65. self.user_agent = "%s %s" % (self.user_agent, hs.config.user_agent_suffix,)
  66. def request(self, method, uri, *args, **kwargs):
  67. # A small wrapper around self.agent.request() so we can easily attach
  68. # counters to it
  69. outgoing_requests_counter.inc(method)
  70. d = preserve_context_over_fn(
  71. self.agent.request,
  72. method, uri, *args, **kwargs
  73. )
  74. logger.info("Sending request %s %s", method, uri)
  75. def _cb(response):
  76. incoming_responses_counter.inc(method, response.code)
  77. logger.info(
  78. "Received response to %s %s: %s",
  79. method, uri, response.code
  80. )
  81. return response
  82. def _eb(failure):
  83. incoming_responses_counter.inc(method, "ERR")
  84. logger.info(
  85. "Error sending request to %s %s: %s %s",
  86. method, uri, failure.type, failure.getErrorMessage()
  87. )
  88. return failure
  89. d.addCallbacks(_cb, _eb)
  90. return d
  91. @defer.inlineCallbacks
  92. def post_urlencoded_get_json(self, uri, args={}):
  93. # TODO: Do we ever want to log message contents?
  94. logger.debug("post_urlencoded_get_json args: %s", args)
  95. query_bytes = urllib.urlencode(encode_urlencode_args(args), True)
  96. response = yield self.request(
  97. "POST",
  98. uri.encode("ascii"),
  99. headers=Headers({
  100. b"Content-Type": [b"application/x-www-form-urlencoded"],
  101. b"User-Agent": [self.user_agent],
  102. }),
  103. bodyProducer=FileBodyProducer(StringIO(query_bytes))
  104. )
  105. body = yield preserve_context_over_fn(readBody, response)
  106. defer.returnValue(json.loads(body))
  107. @defer.inlineCallbacks
  108. def post_json_get_json(self, uri, post_json):
  109. json_str = encode_canonical_json(post_json)
  110. logger.debug("HTTP POST %s -> %s", json_str, uri)
  111. response = yield self.request(
  112. "POST",
  113. uri.encode("ascii"),
  114. headers=Headers({
  115. b"Content-Type": [b"application/json"],
  116. b"User-Agent": [self.user_agent],
  117. }),
  118. bodyProducer=FileBodyProducer(StringIO(json_str))
  119. )
  120. body = yield preserve_context_over_fn(readBody, response)
  121. defer.returnValue(json.loads(body))
  122. @defer.inlineCallbacks
  123. def get_json(self, uri, args={}):
  124. """ Gets some json from the given URI.
  125. Args:
  126. uri (str): The URI to request, not including query parameters
  127. args (dict): A dictionary used to create query strings, defaults to
  128. None.
  129. **Note**: The value of each key is assumed to be an iterable
  130. and *not* a string.
  131. Returns:
  132. Deferred: Succeeds when we get *any* 2xx HTTP response, with the
  133. HTTP body as JSON.
  134. Raises:
  135. On a non-2xx HTTP response. The response body will be used as the
  136. error message.
  137. """
  138. body = yield self.get_raw(uri, args)
  139. defer.returnValue(json.loads(body))
  140. @defer.inlineCallbacks
  141. def put_json(self, uri, json_body, args={}):
  142. """ Puts some json to the given URI.
  143. Args:
  144. uri (str): The URI to request, not including query parameters
  145. json_body (dict): The JSON to put in the HTTP body,
  146. args (dict): A dictionary used to create query strings, defaults to
  147. None.
  148. **Note**: The value of each key is assumed to be an iterable
  149. and *not* a string.
  150. Returns:
  151. Deferred: Succeeds when we get *any* 2xx HTTP response, with the
  152. HTTP body as JSON.
  153. Raises:
  154. On a non-2xx HTTP response.
  155. """
  156. if len(args):
  157. query_bytes = urllib.urlencode(args, True)
  158. uri = "%s?%s" % (uri, query_bytes)
  159. json_str = encode_canonical_json(json_body)
  160. response = yield self.request(
  161. "PUT",
  162. uri.encode("ascii"),
  163. headers=Headers({
  164. b"User-Agent": [self.user_agent],
  165. "Content-Type": ["application/json"]
  166. }),
  167. bodyProducer=FileBodyProducer(StringIO(json_str))
  168. )
  169. body = yield preserve_context_over_fn(readBody, response)
  170. if 200 <= response.code < 300:
  171. defer.returnValue(json.loads(body))
  172. else:
  173. # NB: This is explicitly not json.loads(body)'d because the contract
  174. # of CodeMessageException is a *string* message. Callers can always
  175. # load it into JSON if they want.
  176. raise CodeMessageException(response.code, body)
  177. @defer.inlineCallbacks
  178. def get_raw(self, uri, args={}):
  179. """ Gets raw text from the given URI.
  180. Args:
  181. uri (str): The URI to request, not including query parameters
  182. args (dict): A dictionary used to create query strings, defaults to
  183. None.
  184. **Note**: The value of each key is assumed to be an iterable
  185. and *not* a string.
  186. Returns:
  187. Deferred: Succeeds when we get *any* 2xx HTTP response, with the
  188. HTTP body at text.
  189. Raises:
  190. On a non-2xx HTTP response. The response body will be used as the
  191. error message.
  192. """
  193. if len(args):
  194. query_bytes = urllib.urlencode(args, True)
  195. uri = "%s?%s" % (uri, query_bytes)
  196. response = yield self.request(
  197. "GET",
  198. uri.encode("ascii"),
  199. headers=Headers({
  200. b"User-Agent": [self.user_agent],
  201. })
  202. )
  203. body = yield preserve_context_over_fn(readBody, response)
  204. if 200 <= response.code < 300:
  205. defer.returnValue(body)
  206. else:
  207. raise CodeMessageException(response.code, body)
  208. # XXX: FIXME: This is horribly copy-pasted from matrixfederationclient.
  209. # The two should be factored out.
  210. @defer.inlineCallbacks
  211. def get_file(self, url, output_stream, max_size=None):
  212. """GETs a file from a given URL
  213. Args:
  214. url (str): The URL to GET
  215. output_stream (file): File to write the response body to.
  216. Returns:
  217. A (int,dict,string,int) tuple of the file length, dict of the response
  218. headers, absolute URI of the response and HTTP response code.
  219. """
  220. response = yield self.request(
  221. "GET",
  222. url.encode("ascii"),
  223. headers=Headers({
  224. b"User-Agent": [self.user_agent],
  225. })
  226. )
  227. headers = dict(response.headers.getAllRawHeaders())
  228. if 'Content-Length' in headers and headers['Content-Length'] > max_size:
  229. logger.warn("Requested URL is too large > %r bytes" % (self.max_size,))
  230. raise SynapseError(
  231. 502,
  232. "Requested file is too large > %r bytes" % (self.max_size,),
  233. Codes.TOO_LARGE,
  234. )
  235. if response.code > 299:
  236. logger.warn("Got %d when downloading %s" % (response.code, url))
  237. raise SynapseError(
  238. 502,
  239. "Got error %d" % (response.code,),
  240. Codes.UNKNOWN,
  241. )
  242. # TODO: if our Content-Type is HTML or something, just read the first
  243. # N bytes into RAM rather than saving it all to disk only to read it
  244. # straight back in again
  245. try:
  246. length = yield preserve_context_over_fn(
  247. _readBodyToFile,
  248. response, output_stream, max_size
  249. )
  250. except Exception as e:
  251. logger.exception("Failed to download body")
  252. raise SynapseError(
  253. 502,
  254. ("Failed to download remote body: %s" % e),
  255. Codes.UNKNOWN,
  256. )
  257. defer.returnValue((length, headers, response.request.absoluteURI, response.code))
  258. # XXX: FIXME: This is horribly copy-pasted from matrixfederationclient.
  259. # The two should be factored out.
  260. class _ReadBodyToFileProtocol(protocol.Protocol):
  261. def __init__(self, stream, deferred, max_size):
  262. self.stream = stream
  263. self.deferred = deferred
  264. self.length = 0
  265. self.max_size = max_size
  266. def dataReceived(self, data):
  267. self.stream.write(data)
  268. self.length += len(data)
  269. if self.max_size is not None and self.length >= self.max_size:
  270. self.deferred.errback(SynapseError(
  271. 502,
  272. "Requested file is too large > %r bytes" % (self.max_size,),
  273. Codes.TOO_LARGE,
  274. ))
  275. self.deferred = defer.Deferred()
  276. self.transport.loseConnection()
  277. def connectionLost(self, reason):
  278. if reason.check(ResponseDone):
  279. self.deferred.callback(self.length)
  280. elif reason.check(PotentialDataLoss):
  281. # stolen from https://github.com/twisted/treq/pull/49/files
  282. # http://twistedmatrix.com/trac/ticket/4840
  283. self.deferred.callback(self.length)
  284. else:
  285. self.deferred.errback(reason)
  286. # XXX: FIXME: This is horribly copy-pasted from matrixfederationclient.
  287. # The two should be factored out.
  288. def _readBodyToFile(response, stream, max_size):
  289. d = defer.Deferred()
  290. response.deliverBody(_ReadBodyToFileProtocol(stream, d, max_size))
  291. return d
  292. class CaptchaServerHttpClient(SimpleHttpClient):
  293. """
  294. Separate HTTP client for talking to google's captcha servers
  295. Only slightly special because accepts partial download responses
  296. used only by c/s api v1
  297. """
  298. @defer.inlineCallbacks
  299. def post_urlencoded_get_raw(self, url, args={}):
  300. query_bytes = urllib.urlencode(encode_urlencode_args(args), True)
  301. response = yield self.request(
  302. "POST",
  303. url.encode("ascii"),
  304. bodyProducer=FileBodyProducer(StringIO(query_bytes)),
  305. headers=Headers({
  306. b"Content-Type": [b"application/x-www-form-urlencoded"],
  307. b"User-Agent": [self.user_agent],
  308. })
  309. )
  310. try:
  311. body = yield preserve_context_over_fn(readBody, response)
  312. defer.returnValue(body)
  313. except PartialDownloadError as e:
  314. # twisted dislikes google's response, no content length.
  315. defer.returnValue(e.response)
  316. class SpiderEndpointFactory(object):
  317. def __init__(self, hs):
  318. self.blacklist = hs.config.url_preview_ip_range_blacklist
  319. self.whitelist = hs.config.url_preview_ip_range_whitelist
  320. self.policyForHTTPS = hs.get_http_client_context_factory()
  321. def endpointForURI(self, uri):
  322. logger.info("Getting endpoint for %s", uri.toBytes())
  323. if uri.scheme == "http":
  324. return SpiderEndpoint(
  325. reactor, uri.host, uri.port, self.blacklist, self.whitelist,
  326. endpoint=TCP4ClientEndpoint,
  327. endpoint_kw_args={
  328. 'timeout': 15
  329. },
  330. )
  331. elif uri.scheme == "https":
  332. tlsPolicy = self.policyForHTTPS.creatorForNetloc(uri.host, uri.port)
  333. return SpiderEndpoint(
  334. reactor, uri.host, uri.port, self.blacklist, self.whitelist,
  335. endpoint=SSL4ClientEndpoint,
  336. endpoint_kw_args={
  337. 'sslContextFactory': tlsPolicy,
  338. 'timeout': 15
  339. },
  340. )
  341. else:
  342. logger.warn("Can't get endpoint for unrecognised scheme %s", uri.scheme)
  343. class SpiderHttpClient(SimpleHttpClient):
  344. """
  345. Separate HTTP client for spidering arbitrary URLs.
  346. Special in that it follows retries and has a UA that looks
  347. like a browser.
  348. used by the preview_url endpoint in the content repo.
  349. """
  350. def __init__(self, hs):
  351. SimpleHttpClient.__init__(self, hs)
  352. # clobber the base class's agent and UA:
  353. self.agent = ContentDecoderAgent(
  354. BrowserLikeRedirectAgent(
  355. Agent.usingEndpointFactory(
  356. reactor,
  357. SpiderEndpointFactory(hs)
  358. )
  359. ), [('gzip', GzipDecoder)]
  360. )
  361. # We could look like Chrome:
  362. # self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko)
  363. # Chrome Safari" % hs.version_string)
  364. def encode_urlencode_args(args):
  365. return {k: encode_urlencode_arg(v) for k, v in args.items()}
  366. def encode_urlencode_arg(arg):
  367. if isinstance(arg, unicode):
  368. return arg.encode('utf-8')
  369. elif isinstance(arg, list):
  370. return [encode_urlencode_arg(i) for i in arg]
  371. else:
  372. return arg
  373. def _print_ex(e):
  374. if hasattr(e, "reasons") and e.reasons:
  375. for ex in e.reasons:
  376. _print_ex(ex)
  377. else:
  378. logger.exception(e)
  379. class InsecureInterceptableContextFactory(ssl.ContextFactory):
  380. """
  381. Factory for PyOpenSSL SSL contexts which accepts any certificate for any domain.
  382. Do not use this since it allows an attacker to intercept your communications.
  383. """
  384. def __init__(self):
  385. self._context = SSL.Context(SSL.SSLv23_METHOD)
  386. self._context.set_verify(VERIFY_NONE, lambda *_: None)
  387. def getContext(self, hostname=None, port=None):
  388. return self._context
  389. def creatorForNetloc(self, hostname, port):
  390. return self
  391. class FileBodyProducer(TwistedFileBodyProducer):
  392. """Workaround for https://twistedmatrix.com/trac/ticket/8473
  393. We override the pauseProducing and resumeProducing methods in twisted's
  394. FileBodyProducer so that they do not raise exceptions if the task has
  395. already completed.
  396. """
  397. def pauseProducing(self):
  398. try:
  399. super(FileBodyProducer, self).pauseProducing()
  400. except task.TaskDone:
  401. # task has already completed
  402. pass
  403. def resumeProducing(self):
  404. try:
  405. super(FileBodyProducer, self).resumeProducing()
  406. except task.NotPaused:
  407. # task was not paused (probably because it had already completed)
  408. pass