test_url_preview.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564
  1. # -*- coding: utf-8 -*-
  2. # Copyright 2018 New Vector Ltd
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import os
  16. import attr
  17. from twisted.internet._resolver import HostResolution
  18. from twisted.internet.address import IPv4Address, IPv6Address
  19. from twisted.internet.error import DNSLookupError
  20. from twisted.python.failure import Failure
  21. from twisted.test.proto_helpers import AccumulatingProtocol
  22. from twisted.web._newclient import ResponseDone
  23. from tests import unittest
  24. from tests.server import FakeTransport
  @attr.s
  class FakeResponse(object):
      """
      Bare-bones stand-in for an HTTP response object used by these tests.

      All six fields are plain ``attr.ib()`` attributes supplied by the caller;
      nothing in this class validates or interprets them.
      """

      version = attr.ib()
      code = attr.ib()
      phrase = attr.ib()
      headers = attr.ib()
      body = attr.ib()
      absoluteURI = attr.ib()

      @property
      def request(self):
          """
          Return a stub request object whose only attribute is this response's
          ``absoluteURI``.
          """

          # Deliberately NOT called ``FakeTransport``: that name is imported at
          # module level from ``tests.server`` and shadowing it here made the
          # property confusing to read (and the object models a request, not a
          # transport).
          @attr.s
          class _FakeRequest(object):
              absoluteURI = self.absoluteURI

          return _FakeRequest()

      def deliverBody(self, protocol):
          """
          Hand the whole body to ``protocol`` in one ``dataReceived`` call, then
          close it with a ``ResponseDone`` failure.
          """
          protocol.dataReceived(self.body)
          protocol.connectionLost(Failure(ResponseDone()))
  42. class URLPreviewTests(unittest.HomeserverTestCase):
  43. hijack_auth = True
  44. user_id = "@test:user"
  45. end_content = (
  46. b"<html><head>"
  47. b'<meta property="og:title" content="~matrix~" />'
  48. b'<meta property="og:description" content="hi" />'
  49. b"</head></html>"
  50. )
  def make_homeserver(self, reactor, clock):
      """
      Build the homeserver under test with URL previews switched on.

      Creates two temporary directories (media store and storage provider),
      configures IP black/whitelists and the Accept-Language list, and
      returns the result of ``setup_test_homeserver``.
      """
      # Allocate and create the two on-disk locations first; the order of the
      # mktemp() calls is kept the same as it has always been.
      self.storage_path = self.mktemp()
      self.media_store_path = self.mktemp()
      for directory in (self.storage_path, self.media_store_path):
          os.mkdir(directory)

      blacklisted_ranges = (
          "192.168.1.1",
          "1.0.0.0/8",
          "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
          "2001:800::/21",
      )
      accepted_languages = [
          "en-UK",
          "en-US;q=0.9",
          "fr;q=0.8",
          "*;q=0.7",
      ]

      config = self.default_config()
      config["url_preview_enabled"] = True
      config["max_spider_size"] = 9999999
      config["url_preview_ip_range_blacklist"] = blacklisted_ranges
      # 1.1.1.1 sits inside the blacklisted 1.0.0.0/8 range but is whitelisted.
      config["url_preview_ip_range_whitelist"] = ("1.1.1.1",)
      config["url_preview_url_blacklist"] = []
      config["url_preview_accept_language"] = accepted_languages
      config["media_store_path"] = self.media_store_path
      config["media_storage_providers"] = [
          {
              "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
              "store_local": True,
              "store_synchronous": False,
              "store_remote": True,
              "config": {"directory": self.storage_path},
          }
      ]

      return self.setup_test_homeserver(config=config)
  def prepare(self, reactor, clock, hs):
      """
      Grab the ``preview_url`` resource and install a fake name resolver.

      ``self.lookups`` maps a hostname to a list of
      ``(address class, ip string)`` pairs; tests fill it in to control what
      the fake resolver answers. Hostnames missing from it fail resolution.
      """
      self.lookups = {}

      self.media_repo = hs.get_media_repository_resource()
      self.preview_url = self.media_repo.children[b"preview_url"]

      test_case = self

      class Resolver(object):
          def resolveHostName(
              _self,
              resolutionReceiver,
              hostName,
              portNumber=0,
              addressTypes=None,
              transportSemantics="TCP",
          ):
              # Announce the lookup before deciding whether it can succeed.
              resolutionReceiver.resolutionBegan(HostResolution(hostName))

              # Read the table off the test case on every call so entries
              # added by a test after prepare() are honoured.
              if hostName not in test_case.lookups:
                  raise DNSLookupError("OH NO")

              for entry in test_case.lookups[hostName]:
                  address_type, host = entry[0], entry[1]
                  resolutionReceiver.addressResolved(
                      address_type("TCP", host, portNumber)
                  )

              resolutionReceiver.resolutionComplete()
              return resolutionReceiver

      self.reactor.nameResolver = Resolver()
  def test_cache_returns_correct_type(self):
      """
      A preview is fetched once from the remote, then served again with the
      same content from the in-memory cache and from the database cache.
      """
      self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]
      expected_preview = {"og:title": "~matrix~", "og:description": "hi"}

      def request_preview():
          # Issue the preview request, let the reactor run, return the channel.
          req, chan = self.make_request(
              "GET", "url_preview?url=http://matrix.org", shorthand=False
          )
          req.render(self.preview_url)
          self.pump()
          return chan

      def check_preview(chan):
          self.assertEqual(chan.code, 200)
          self.assertEqual(chan.json_body, expected_preview)

      # First request: nothing cached, so we play the remote server ourselves.
      first_channel = request_preview()

      remote_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      remote_server = AccumulatingProtocol()
      remote_server.makeConnection(FakeTransport(remote_client, self.reactor))
      remote_client.makeConnection(FakeTransport(remote_server, self.reactor))

      response_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
          % (len(self.end_content),)
      )
      remote_client.dataReceived(response_head + self.end_content)

      self.pump()
      check_preview(first_channel)

      # Second request: should be answered by the cache with the same content.
      check_preview(request_preview())

      # Drop the entry from the in-memory cache.
      memory_cache = self.preview_url._cache
      self.assertIn("http://matrix.org", memory_cache)
      memory_cache.pop("http://matrix.org")
      self.assertNotIn("http://matrix.org", memory_cache)

      # Third request: should be answered by the database cache, again with
      # the same content.
      check_preview(request_preview())
  def test_non_ascii_preview_httpequiv(self):
      """
      The page declares windows-1251 through an ``http-equiv`` meta tag while
      the HTTP Content-Type header says utf8; the og:title must come back as
      the Cyrillic text.
      """
      self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]

      page = (
          b"<html><head>"
          b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
          b'<meta property="og:title" content="\xe4\xea\xe0" />'
          b'<meta property="og:description" content="hi" />'
          b"</head></html>"
      )

      req, chan = self.make_request(
          "GET", "url_preview?url=http://matrix.org", shorthand=False
      )
      req.render(self.preview_url)
      self.pump()

      # Wire Synapse's outbound client to a fake remote server.
      remote_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      remote_server = AccumulatingProtocol()
      remote_server.makeConnection(FakeTransport(remote_client, self.reactor))
      remote_client.makeConnection(FakeTransport(remote_server, self.reactor))

      response_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="utf8"\r\n\r\n'
      ) % (len(page),)
      remote_client.dataReceived(response_head + page)

      self.pump()
      self.assertEqual(chan.code, 200)
      self.assertEqual(chan.json_body["og:title"], "\u0434\u043a\u0430")
  def test_non_ascii_preview_content_type(self):
      """
      The page carries no charset declaration of its own; the HTTP
      Content-Type header says windows-1251 and the og:title must come back
      as the Cyrillic text.
      """
      self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]

      page = (
          b"<html><head>"
          b'<meta property="og:title" content="\xe4\xea\xe0" />'
          b'<meta property="og:description" content="hi" />'
          b"</head></html>"
      )

      req, chan = self.make_request(
          "GET", "url_preview?url=http://matrix.org", shorthand=False
      )
      req.render(self.preview_url)
      self.pump()

      # Wire Synapse's outbound client to a fake remote server.
      remote_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      remote_server = AccumulatingProtocol()
      remote_server.makeConnection(FakeTransport(remote_client, self.reactor))
      remote_client.makeConnection(FakeTransport(remote_server, self.reactor))

      response_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
      ) % (len(page),)
      remote_client.dataReceived(response_head + page)

      self.pump()
      self.assertEqual(chan.code, 200)
      self.assertEqual(chan.json_body["og:title"], "\u0434\u043a\u0430")
  def test_overlong_title(self):
      """
      A page with a 2000-character ``<title>`` produces a preview containing
      only ``og:description``.
      """
      self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]

      page = b"".join(
          [
              b"<html><head>",
              b"<title>",
              b"x" * 2000,
              b"</title>",
              b'<meta property="og:description" content="hi" />',
              b"</head></html>",
          ]
      )

      req, chan = self.make_request(
          "GET", "url_preview?url=http://matrix.org", shorthand=False
      )
      req.render(self.preview_url)
      self.pump()

      # Wire Synapse's outbound client to a fake remote server.
      remote_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      remote_server = AccumulatingProtocol()
      remote_server.makeConnection(FakeTransport(remote_client, self.reactor))
      remote_client.makeConnection(FakeTransport(remote_server, self.reactor))

      response_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
      ) % (len(page),)
      remote_client.dataReceived(response_head + page)

      self.pump()
      self.assertEqual(chan.code, 200)

      # The title is too long and should have been stripped out, leaving the
      # description as the only key.
      preview = chan.json_body
      self.assertCountEqual(["og:description"], preview.keys())
  def test_ipaddr(self):
      """
      A hostname that resolves to 8.8.8.8 is fetched and previewed.
      """
      self.lookups["example.com"] = [(IPv4Address, "8.8.8.8")]

      req, chan = self.make_request(
          "GET", "url_preview?url=http://example.com", shorthand=False
      )
      req.render(self.preview_url)
      self.pump()

      # Wire Synapse's outbound client to a fake remote server.
      remote_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      remote_server = AccumulatingProtocol()
      remote_server.makeConnection(FakeTransport(remote_client, self.reactor))
      remote_client.makeConnection(FakeTransport(remote_server, self.reactor))

      response_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
          % (len(self.end_content),)
      )
      remote_client.dataReceived(response_head + self.end_content)

      self.pump()
      self.assertEqual(chan.code, 200)

      expected_preview = {"og:title": "~matrix~", "og:description": "hi"}
      self.assertEqual(chan.json_body, expected_preview)
  def test_blacklisted_ip_specific(self):
      """
      A hostname resolving to the individually blacklisted 192.168.1.1 is not
      spidered; the request fails with a 502 DNS error.
      """
      self.lookups["example.com"] = [(IPv4Address, "192.168.1.1")]

      req, chan = self.make_request(
          "GET", "url_preview?url=http://example.com", shorthand=False
      )
      req.render(self.preview_url)
      self.pump()

      # Synapse must not have opened any outbound connection.
      outbound_connections = len(self.reactor.tcpClients)
      self.assertEqual(outbound_connections, 0)

      self.assertEqual(chan.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(chan.json_body, expected_error)
  def test_blacklisted_ip_range(self):
      """
      A hostname resolving to 1.1.1.2, which falls in the blacklisted
      1.0.0.0/8 range, is rejected with a 502 DNS error.
      """
      self.lookups["example.com"] = [(IPv4Address, "1.1.1.2")]

      req, chan = self.make_request(
          "GET", "url_preview?url=http://example.com", shorthand=False
      )
      req.render(self.preview_url)
      self.pump()

      self.assertEqual(chan.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(chan.json_body, expected_error)
  def test_blacklisted_ip_specific_direct(self):
      """
      A URL that names the blacklisted address 192.168.1.1 literally is
      refused with a 403 and no outbound connection is made.
      """
      req, chan = self.make_request(
          "GET", "url_preview?url=http://192.168.1.1", shorthand=False
      )
      req.render(self.preview_url)
      self.pump()

      # Synapse must not have opened any outbound connection.
      outbound_connections = len(self.reactor.tcpClients)
      self.assertEqual(outbound_connections, 0)

      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "IP address blocked by IP blacklist entry",
      }
      self.assertEqual(chan.json_body, expected_error)
      self.assertEqual(chan.code, 403)
  def test_blacklisted_ip_range_direct(self):
      """
      A URL that names 1.1.1.2 literally (inside the blacklisted 1.0.0.0/8
      range) is refused with a 403.
      """
      req, chan = self.make_request(
          "GET", "url_preview?url=http://1.1.1.2", shorthand=False
      )
      req.render(self.preview_url)
      self.pump()

      self.assertEqual(chan.code, 403)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "IP address blocked by IP blacklist entry",
      }
      self.assertEqual(chan.json_body, expected_error)
  def test_blacklisted_ip_range_whitelisted_ip(self):
      """
      1.1.1.1 lies in a blacklisted range but is explicitly whitelisted, so a
      hostname resolving to it can still be spidered.
      """
      self.lookups["example.com"] = [(IPv4Address, "1.1.1.1")]

      req, chan = self.make_request(
          "GET", "url_preview?url=http://example.com", shorthand=False
      )
      req.render(self.preview_url)
      self.pump()

      # Wire Synapse's outbound client to a fake remote server.
      remote_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      remote_server = AccumulatingProtocol()
      remote_server.makeConnection(FakeTransport(remote_client, self.reactor))
      remote_client.makeConnection(FakeTransport(remote_server, self.reactor))

      response_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
          % (len(self.end_content),)
      )
      remote_client.dataReceived(response_head + self.end_content)

      self.pump()
      self.assertEqual(chan.code, 200)

      expected_preview = {"og:title": "~matrix~", "og:description": "hi"}
      self.assertEqual(chan.json_body, expected_preview)
  def test_blacklisted_ip_with_external_ip(self):
      """
      When a hostname resolves to several addresses and one of them is
      blacklisted, the request is rejected even though another address is
      not blacklisted.
      """
      # One blacklisted address (1.1.1.2) alongside one that is not (8.8.8.8).
      resolved_addresses = [
          (IPv4Address, "1.1.1.2"),
          (IPv4Address, "8.8.8.8"),
      ]
      self.lookups["example.com"] = resolved_addresses

      req, chan = self.make_request(
          "GET", "url_preview?url=http://example.com", shorthand=False
      )
      req.render(self.preview_url)
      self.pump()

      self.assertEqual(chan.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(chan.json_body, expected_error)
  def test_blacklisted_ipv6_specific(self):
      """
      A hostname resolving to the individually blacklisted IPv6 address is
      not spidered; the request fails with a 502 DNS error.
      """
      blocked_address = "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"
      self.lookups["example.com"] = [(IPv6Address, blocked_address)]

      req, chan = self.make_request(
          "GET", "url_preview?url=http://example.com", shorthand=False
      )
      req.render(self.preview_url)
      self.pump()

      # Synapse must not have opened any outbound connection.
      outbound_connections = len(self.reactor.tcpClients)
      self.assertEqual(outbound_connections, 0)

      self.assertEqual(chan.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(chan.json_body, expected_error)
  def test_blacklisted_ipv6_range(self):
      """
      A hostname resolving to 2001:800::1, which falls in the blacklisted
      2001:800::/21 range, is rejected with a 502 DNS error.
      """
      self.lookups["example.com"] = [(IPv6Address, "2001:800::1")]

      req, chan = self.make_request(
          "GET", "url_preview?url=http://example.com", shorthand=False
      )
      req.render(self.preview_url)
      self.pump()

      self.assertEqual(chan.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(chan.json_body, expected_error)
  def test_OPTIONS(self):
      """
      An OPTIONS request to the preview endpoint answers 200 with an empty
      JSON object.
      """
      req, chan = self.make_request(
          "OPTIONS", "url_preview?url=http://example.com", shorthand=False
      )
      req.render(self.preview_url)
      self.pump()

      self.assertEqual(chan.code, 200)
      empty_body = {}
      self.assertEqual(chan.json_body, empty_body)
  def test_accept_language_config_option(self):
      """
      The configured Accept-Language values are sent, one header line each,
      in the request Synapse makes to the remote server.
      """
      self.lookups["example.com"] = [(IPv4Address, "8.8.8.8")]

      # Kick off the preview request against Synapse.
      req, chan = self.make_request(
          "GET", "url_preview?url=http://example.com", shorthand=False
      )
      req.render(self.preview_url)
      self.pump()

      # Synapse's outbound TCP client, and a fake remote server that simply
      # accumulates whatever it is sent; connect the two to each other.
      remote_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      remote_server = AccumulatingProtocol()
      remote_server.makeConnection(FakeTransport(remote_client, self.reactor))
      remote_client.makeConnection(FakeTransport(remote_server, self.reactor))

      # Feed Synapse the remote server's reply.
      response_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
          % (len(self.end_content),)
      )
      remote_client.dataReceived(response_head + self.end_content)

      # Run the reactor until the original channel has its response.
      self.pump()
      self.assertEqual(chan.code, 200)

      expected_preview = {"og:title": "~matrix~", "og:description": "hi"}
      self.assertEqual(chan.json_body, expected_preview)

      # The bytes the fake server received must contain the Accept-Language
      # headers, in configuration order.
      expected_headers = (
          b"Accept-Language: en-UK\r\n"
          b"Accept-Language: en-US;q=0.9\r\n"
          b"Accept-Language: fr;q=0.8\r\n"
          b"Accept-Language: *;q=0.7"
      )
      self.assertIn(expected_headers, remote_server.data)