# preview_url_resource.py

# -*- coding: utf-8 -*-
# Copyright 2016 OpenMarket Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .base_resource import BaseMediaResource
from twisted.web.server import NOT_DONE_YET
from twisted.internet import defer
from urlparse import urlparse, urlsplit, urlunparse
from synapse.api.errors import (
    SynapseError, Codes,
)
from synapse.util.stringutils import is_ascii, random_string
from synapse.util.caches.expiringcache import ExpiringCache
from synapse.http.client import SpiderHttpClient
from synapse.http.server import (
    request_handler, respond_with_json_bytes
)
from synapse.util.async import ObservableDeferred

import cgi
import fnmatch
import logging
import os
import re
import urllib
import ujson as json

logger = logging.getLogger(__name__)

try:
    from lxml import html
except ImportError:
    # flag the missing dependency; __init__ below refuses to start without it
    html = None


class PreviewUrlResource(BaseMediaResource):
    isLeaf = True

    def __init__(self, hs, filepaths):
        if html is None:
            raise RuntimeError("Disabling PreviewUrlResource as lxml not available")

        if not hasattr(hs.config, "url_preview_ip_range_blacklist"):
            logger.warn(
                "For security, you must specify an explicit target IP address "
                "blacklist in url_preview_ip_range_blacklist for url previewing "
                "to work"
            )
            raise RuntimeError(
                "Disabling PreviewUrlResource as "
                "url_preview_ip_range_blacklist not specified"
            )

        BaseMediaResource.__init__(self, hs, filepaths)
        self.client = SpiderHttpClient(hs)
        if hasattr(hs.config, "url_preview_url_blacklist"):
            self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist

        # simple memory cache mapping urls to OG metadata
        self.cache = ExpiringCache(
            cache_name="url_previews",
            clock=self.clock,
            # don't spider URLs more often than once an hour
            expiry_ms=60 * 60 * 1000,
        )
        self.cache.start()

        self.downloads = {}

    def render_GET(self, request):
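        # Twisted entry point: start the async handler and return NOT_DONE_YET
        # so Twisted keeps the connection open until the handler finishes the
        # request itself.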
        self._async_render_GET(request)
        return NOT_DONE_YET

    @request_handler
    @defer.inlineCallbacks
    def _async_render_GET(self, request):
        # XXX: if get_user_by_req fails, what should we do in an async render?
        requester = yield self.auth.get_user_by_req(request)
        url = request.args.get("url")[0]
        if "ts" in request.args:
            ts = int(request.args.get("ts")[0])
        else:
            ts = self.clock.time_msec()

        # impose the URL pattern blacklist
        if hasattr(self, "url_preview_url_blacklist"):
            url_tuple = urlsplit(url)
            for entry in self.url_preview_url_blacklist:
                match = True
                for attrib in entry:
                    pattern = entry[attrib]
                    value = getattr(url_tuple, attrib)
                    logger.debug((
                        "Matching attrib '%s' with value '%s' against"
                        " pattern '%s'"
                    ) % (attrib, value, pattern))

                    if value is None:
                        match = False
                        continue

                    if pattern.startswith('^'):
                        # patterns beginning with '^' are treated as regexps
                        if not re.match(pattern, value):
                            match = False
                            continue
                    else:
                        # anything else is a shell-style glob
                        if not fnmatch.fnmatch(value, pattern):
                            match = False
                            continue

                if match:
                    logger.warn(
                        "URL %s blocked by url_blacklist entry %s", url, entry
                    )
                    raise SynapseError(
                        403, "URL blocked by url pattern blacklist entry",
                        Codes.UNKNOWN
                    )
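        # For illustration (the exact schema lives in the homeserver config
        # docs), each blacklist entry maps urlsplit() attribute names to
        # patterns, e.g.:
        #   url_preview_url_blacklist:
        #     - netloc: "www.example.com"
        #       path: "/private/*"
        #     - scheme: "^https?$"
        # and a URL is blocked if every attrib in some entry matches.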

        # first check the memory cache - good to handle all the clients on this
        # HS thundering away to preview the same URL at the same time.
        og = self.cache.get(url)
        if og:
            respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
            return

        # then check the URL cache in the DB (which will also provide us with
        # historical previews, if we have any)
        cache_result = yield self.store.get_url_cache(url, ts)
        if (
            cache_result and
            cache_result["download_ts"] + cache_result["expires"] > ts and
            cache_result["response_code"] / 100 == 2  # any 2xx response
        ):
            respond_with_json_bytes(
                request, 200, cache_result["og"].encode('utf-8'),
                send_cors=True
            )
            return

        # Ensure only one download for a given URL is active at a time
        download = self.downloads.get(url)
        if download is None:
            download = self._download_url(url, requester.user)
            download = ObservableDeferred(
                download,
                consumeErrors=True
            )
            self.downloads[url] = download

            @download.addBoth
            def callback(media_info):
                del self.downloads[url]
                return media_info
        media_info = yield download.observe()
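        # (All concurrent requests for the same URL observe() the one
        # ObservableDeferred, so each page is fetched at most once; the
        # callback above drops the in-flight entry once the download settles
        # either way.)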

        # FIXME: we should probably update our cache now anyway, so that
        # even if the OG calculation raises, we don't keep hammering on the
        # remote server. For now, leave it uncached to aid debugging OG
        # calculation problems

        logger.debug("got media_info of '%s'" % media_info)

        if self._is_media(media_info['media_type']):
            dims = yield self._generate_local_thumbnails(
                media_info['filesystem_id'], media_info
            )

            # define our OG response for this media
            og = {
                "og:description": media_info['download_name'],
                "og:image": "mxc://%s/%s" % (
                    self.server_name, media_info['filesystem_id']
                ),
                "og:image:type": media_info['media_type'],
                "matrix:image:size": media_info['media_length'],
            }

            if dims:
                og["og:image:width"] = dims['width']
                og["og:image:height"] = dims['height']
            else:
                logger.warn("Couldn't get dims for %s" % url)
        elif self._is_html(media_info['media_type']):
            # TODO: somehow stop a big HTML tree from exploding synapse's RAM
            try:
                tree = html.parse(media_info['filename'])
                og = yield self._calc_og(tree, media_info, requester)
            except UnicodeDecodeError:
                # XXX: evil evil bodge
                # Empirically, sites like google.com mix Latin-1 and utf-8
                # encodings in the same page. The rogue Latin-1 characters
                # cause lxml to choke with a UnicodeDecodeError, so if we
                # see this we go and do a manual decode of the HTML before
                # handing it to lxml as utf-8, which, counter-intuitively,
                # seems to make it happier...
                with open(media_info['filename']) as file:
                    body = file.read()
                tree = html.fromstring(body.decode('utf-8', 'ignore'))
                og = yield self._calc_og(tree, media_info, requester)
        else:
            logger.warn("Failed to find any OG data in %s", url)
            og = {}

        logger.debug("Calculated OG for %s as %s" % (url, og))

        # store OG in ephemeral in-memory cache
        self.cache[url] = og

        # store OG in history-aware DB cache
        yield self.store.store_url_cache(
            url,
            media_info["response_code"],
            media_info["etag"],
            media_info["expires"],
            json.dumps(og),
            media_info["filesystem_id"],
            media_info["created_ts"],
        )

        respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
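        # The body sent to the client is just the OG map as JSON, along the
        # lines of (illustrative):
        #   {"og:title": "Some page", "og:description": "...",
        #    "og:image": "mxc://example.com/abcDEF123",
        #    "og:image:type": "image/png", "og:image:width": 640,
        #    "og:image:height": 480, "matrix:image:size": 12345}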

    @defer.inlineCallbacks
    def _calc_og(self, tree, media_info, requester):
        # suck our tree into lxml and define our OG response.
        # if we see any image URLs in the OG response, then spider them
        # (although the client could choose to do this by asking for previews of
        # those URLs to avoid DoSing the server)

        # "og:type" : "video",
        # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
        # "og:site_name" : "YouTube",
        # "og:video:type" : "application/x-shockwave-flash",
        # "og:description" : "Fun stuff happening here",
        # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon",
        # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg",
        # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
        # "og:video:width" : "1280",
        # "og:video:height" : "720",
        # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",

        og = {}
        for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
            og[tag.attrib['property']] = tag.attrib['content']
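        # e.g. a page head containing (illustrative):
        #   <meta property="og:title" content="The Rock" />
        #   <meta property="og:type" content="video.movie" />
        # produces og = {"og:title": "The Rock", "og:type": "video.movie"}.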

        # TODO: grab article: meta tags too, e.g.:
        # "article:publisher" : "https://www.facebook.com/thethudonline"
        # "article:author" : "https://www.facebook.com/thethudonline"
        # "article:tag" : "baby"
        # "article:section" : "Breaking News"
        # "article:published_time" : "2016-03-31T19:58:24+00:00"
        # "article:modified_time" : "2016-04-01T18:31:53+00:00"

        if 'og:title' not in og:
            # do some basic spidering of the HTML
            title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
            og['og:title'] = title[0].text.strip() if title else None
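            # (the xpath union collects the first <title>, <h1>, <h2> and <h3>;
            # indexing [0] picks whichever appears earliest in document order,
            # normally the <title>)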

        if 'og:image' not in og:
            # TODO: extract a favicon failing all else
            meta_image = tree.xpath(
                "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
            )
            if meta_image:
                og['og:image'] = self._rebase_url(meta_image[0], media_info['uri'])
            else:
                # TODO: consider inlined CSS styles as well as width & height attribs
                images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
                # prefer the largest image by pixel area
                images = sorted(images, key=lambda i: (
                    -1 * int(i.attrib['width']) * int(i.attrib['height'])
                ))
                if not images:
                    images = tree.xpath("//img[@src]")
                if images:
                    og['og:image'] = images[0].attrib['src']

        # pre-cache the image for posterity
        # FIXME: it might be cleaner to use the same flow as the main /preview_url
        # request itself and benefit from the same caching etc. But for now we just
        # rely on the caching on the master request to speed things up.
        if 'og:image' in og and og['og:image']:
            image_info = yield self._download_url(
                self._rebase_url(og['og:image'], media_info['uri']), requester.user
            )

            if self._is_media(image_info['media_type']):
                # TODO: make sure we don't choke on white-on-transparent images
                dims = yield self._generate_local_thumbnails(
                    image_info['filesystem_id'], image_info
                )
                if dims:
                    og["og:image:width"] = dims['width']
                    og["og:image:height"] = dims['height']
                else:
                    logger.warn("Couldn't get dims for %s" % og["og:image"])

                og["og:image"] = "mxc://%s/%s" % (
                    self.server_name, image_info['filesystem_id']
                )
                og["og:image:type"] = image_info['media_type']
                og["matrix:image:size"] = image_info['media_length']
            else:
                del og["og:image"]

        if 'og:description' not in og:
            meta_description = tree.xpath(
                "//*/meta"
                "[translate(@name, 'DESCRIPTION', 'description')='description']"
                "/@content")
            if meta_description:
                og['og:description'] = meta_description[0]
            else:
                # grab any text nodes which are inside the <body/> tag...
                # unless they are within an HTML5 semantic markup tag...
                # <header/>, <nav/>, <aside/>, <footer/>
                # ...or if they are within a <script/> or <style/> tag.
                # This is a very very very coarse approximation to a plain text
                # render of the page.
                text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | "
                                        "ancestor::aside | ancestor::footer | "
                                        "ancestor::script | ancestor::style)]"
                                        "[ancestor::body]")
                text = ''
                for text_node in text_nodes:
                    if len(text) < 500:
                        text += text_node + ' '
                    else:
                        break
                # collapse runs of whitespace and cap the description at 500 chars
                text = re.sub(r'[\t ]+', ' ', text)
                text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text)
                text = text.strip()[:500]
                og['og:description'] = text if text else None

        # TODO: delete the url downloads to stop diskfilling,
        # as we only ever cared about its OG
        defer.returnValue(og)

    def _rebase_url(self, url, base):
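        # Resolve a possibly-relative URL against the page it was found on.
        # For example (illustrative):
        #   _rebase_url("images/logo.png", "http://example.com/a/page.html")
        # gives "http://example.com/a/images/logo.png".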
        base = list(urlparse(base))
        url = list(urlparse(url))
        if not url[0]:  # fix up scheme
            url[0] = base[0] or "http"
        if not url[1]:  # fix up hostname
            url[1] = base[1]
        if not url[2].startswith('/'):
            url[2] = re.sub(r'/[^/]+$', '/', base[2]) + url[2]
        return urlunparse(url)

    @defer.inlineCallbacks
    def _download_url(self, url, user):
        # TODO: we should probably honour robots.txt... except in practice
        # we're most likely being explicitly triggered by a human rather than a
        # bot, so are we really a robot?

        # XXX: horrible duplication with base_resource's _download_remote_file()
        file_id = random_string(24)

        fname = self.filepaths.local_media_filepath(file_id)
        self._makedirs(fname)

        try:
            with open(fname, "wb") as f:
                logger.debug("Trying to get url '%s'" % url)
                length, headers, uri, code = yield self.client.get_file(
                    url, output_stream=f, max_size=self.max_spider_size,
                )
                # FIXME: pass through 404s and other error messages nicely

            media_type = headers["Content-Type"][0]
            time_now_ms = self.clock.time_msec()

            content_disposition = headers.get("Content-Disposition", None)
            if content_disposition:
                _, params = cgi.parse_header(content_disposition[0],)
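                # e.g. a header like (illustrative):
                #   Content-Disposition: attachment; filename*=utf-8''big%20cat.png
                # parses to params = {"filename*": "utf-8''big%20cat.png"} (the
                # RFC 5987 percent-encoded UTF-8 form), while a plain ASCII name
                # arrives as filename="cat.png".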
                download_name = None

                # First check if there is a valid UTF-8 filename
                download_name_utf8 = params.get("filename*", None)
                if download_name_utf8:
                    if download_name_utf8.lower().startswith("utf-8''"):
                        download_name = download_name_utf8[7:]

                # If there isn't one, check for an ASCII name.
                if not download_name:
                    download_name_ascii = params.get("filename", None)
                    if download_name_ascii and is_ascii(download_name_ascii):
                        download_name = download_name_ascii

                if download_name:
                    download_name = urllib.unquote(download_name)
                    try:
                        download_name = download_name.decode("utf-8")
                    except UnicodeDecodeError:
                        download_name = None
            else:
                download_name = None

            yield self.store.store_local_media(
                media_id=file_id,
                media_type=media_type,
                time_now_ms=self.clock.time_msec(),
                upload_name=download_name,
                media_length=length,
                user_id=user,
            )
        except Exception as e:
            os.remove(fname)
            raise SynapseError(
                500, ("Failed to download content: %s" % e),
                Codes.UNKNOWN
            )

        defer.returnValue({
            "media_type": media_type,
            "media_length": length,
            "download_name": download_name,
            "created_ts": time_now_ms,
            "filesystem_id": file_id,
            "filename": fname,
            "uri": uri,
            "response_code": code,
            # FIXME: we should calculate a proper expiration based on the
            # Cache-Control and Expire headers. But for now, assume 1 hour.
            "expires": 60 * 60 * 1000,
            "etag": headers["ETag"][0] if "ETag" in headers else None,
        })

    def _is_media(self, content_type):
        return content_type.lower().startswith("image/")

    def _is_html(self, content_type):
        content_type = content_type.lower()
        return (
            content_type.startswith("text/html") or
            content_type.startswith("application/xhtml")
        )