_base.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. # -*- coding: utf-8 -*-
  2. # Copyright 2014-2016 OpenMarket Ltd
  3. # Copyright 2019 New Vector Ltd
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. import logging
  17. import os
  18. from six.moves import urllib
  19. from twisted.internet import defer
  20. from twisted.protocols.basic import FileSender
  21. from synapse.api.errors import Codes, SynapseError, cs_error
  22. from synapse.http.server import finish_request, respond_with_json
  23. from synapse.logging.context import make_deferred_yieldable
  24. from synapse.util.stringutils import is_ascii
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Content types that are treated as text. When media is served with one of
# these exact types (compared case-insensitively, with no parameters attached),
# add_file_headers appends "; charset=UTF-8" to the Content-Type header.
TEXT_CONTENT_TYPES = [
    "text/css",
    "text/csv",
    "text/html",
    "text/calendar",
    "text/plain",
    "text/javascript",
    "application/json",
    "application/ld+json",
    "application/rtf",
    "image/svg+xml",
    "text/xml",
]
  41. def parse_media_id(request):
  42. try:
  43. # This allows users to append e.g. /test.png to the URL. Useful for
  44. # clients that parse the URL to see content type.
  45. server_name, media_id = request.postpath[:2]
  46. if isinstance(server_name, bytes):
  47. server_name = server_name.decode("utf-8")
  48. media_id = media_id.decode("utf8")
  49. file_name = None
  50. if len(request.postpath) > 2:
  51. try:
  52. file_name = urllib.parse.unquote(request.postpath[-1].decode("utf-8"))
  53. except UnicodeDecodeError:
  54. pass
  55. return server_name, media_id, file_name
  56. except Exception:
  57. raise SynapseError(
  58. 404, "Invalid media id token %r" % (request.postpath,), Codes.UNKNOWN
  59. )
  60. def respond_404(request):
  61. respond_with_json(
  62. request,
  63. 404,
  64. cs_error("Not found %r" % (request.postpath,), code=Codes.NOT_FOUND),
  65. send_cors=True,
  66. )
  67. @defer.inlineCallbacks
  68. def respond_with_file(request, media_type, file_path, file_size=None, upload_name=None):
  69. logger.debug("Responding with %r", file_path)
  70. if os.path.isfile(file_path):
  71. if file_size is None:
  72. stat = os.stat(file_path)
  73. file_size = stat.st_size
  74. add_file_headers(request, media_type, file_size, upload_name)
  75. with open(file_path, "rb") as f:
  76. yield make_deferred_yieldable(FileSender().beginFileTransfer(f, request))
  77. finish_request(request)
  78. else:
  79. respond_404(request)
  80. def add_file_headers(request, media_type, file_size, upload_name):
  81. """Adds the correct response headers in preparation for responding with the
  82. media.
  83. Args:
  84. request (twisted.web.http.Request)
  85. media_type (str): The media/content type.
  86. file_size (int): Size in bytes of the media, if known.
  87. upload_name (str): The name of the requested file, if any.
  88. """
  89. def _quote(x):
  90. return urllib.parse.quote(x.encode("utf-8"))
  91. # Default to a UTF-8 charset for text content types.
  92. # ex, uses UTF-8 for 'text/css' but not 'text/css; charset=UTF-16'
  93. if media_type.lower() in TEXT_CONTENT_TYPES:
  94. content_type = media_type + "; charset=UTF-8"
  95. else:
  96. content_type = media_type
  97. request.setHeader(b"Content-Type", content_type.encode("UTF-8"))
  98. if upload_name:
  99. # RFC6266 section 4.1 [1] defines both `filename` and `filename*`.
  100. #
  101. # `filename` is defined to be a `value`, which is defined by RFC2616
  102. # section 3.6 [2] to be a `token` or a `quoted-string`, where a `token`
  103. # is (essentially) a single US-ASCII word, and a `quoted-string` is a
  104. # US-ASCII string surrounded by double-quotes, using backslash as an
  105. # escape charater. Note that %-encoding is *not* permitted.
  106. #
  107. # `filename*` is defined to be an `ext-value`, which is defined in
  108. # RFC5987 section 3.2.1 [3] to be `charset "'" [ language ] "'" value-chars`,
  109. # where `value-chars` is essentially a %-encoded string in the given charset.
  110. #
  111. # [1]: https://tools.ietf.org/html/rfc6266#section-4.1
  112. # [2]: https://tools.ietf.org/html/rfc2616#section-3.6
  113. # [3]: https://tools.ietf.org/html/rfc5987#section-3.2.1
  114. # We avoid the quoted-string version of `filename`, because (a) synapse didn't
  115. # correctly interpret those as of 0.99.2 and (b) they are a bit of a pain and we
  116. # may as well just do the filename* version.
  117. if _can_encode_filename_as_token(upload_name):
  118. disposition = "inline; filename=%s" % (upload_name,)
  119. else:
  120. disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name),)
  121. request.setHeader(b"Content-Disposition", disposition.encode("ascii"))
  122. # cache for at least a day.
  123. # XXX: we might want to turn this off for data we don't want to
  124. # recommend caching as it's sensitive or private - or at least
  125. # select private. don't bother setting Expires as all our
  126. # clients are smart enough to be happy with Cache-Control
  127. request.setHeader(b"Cache-Control", b"public,max-age=86400,s-maxage=86400")
  128. request.setHeader(b"Content-Length", b"%d" % (file_size,))
  129. # separators as defined in RFC2616. SP and HT are handled separately.
  130. # see _can_encode_filename_as_token.
  131. _FILENAME_SEPARATOR_CHARS = {
  132. "(",
  133. ")",
  134. "<",
  135. ">",
  136. "@",
  137. ",",
  138. ";",
  139. ":",
  140. "\\",
  141. '"',
  142. "/",
  143. "[",
  144. "]",
  145. "?",
  146. "=",
  147. "{",
  148. "}",
  149. }
  150. def _can_encode_filename_as_token(x):
  151. for c in x:
  152. # from RFC2616:
  153. #
  154. # token = 1*<any CHAR except CTLs or separators>
  155. #
  156. # separators = "(" | ")" | "<" | ">" | "@"
  157. # | "," | ";" | ":" | "\" | <">
  158. # | "/" | "[" | "]" | "?" | "="
  159. # | "{" | "}" | SP | HT
  160. #
  161. # CHAR = <any US-ASCII character (octets 0 - 127)>
  162. #
  163. # CTL = <any US-ASCII control character
  164. # (octets 0 - 31) and DEL (127)>
  165. #
  166. if ord(c) >= 127 or ord(c) <= 32 or c in _FILENAME_SEPARATOR_CHARS:
  167. return False
  168. return True
  169. @defer.inlineCallbacks
  170. def respond_with_responder(request, responder, media_type, file_size, upload_name=None):
  171. """Responds to the request with given responder. If responder is None then
  172. returns 404.
  173. Args:
  174. request (twisted.web.http.Request)
  175. responder (Responder|None)
  176. media_type (str): The media/content type.
  177. file_size (int|None): Size in bytes of the media. If not known it should be None
  178. upload_name (str|None): The name of the requested file, if any.
  179. """
  180. if not responder:
  181. respond_404(request)
  182. return
  183. logger.debug("Responding to media request with responder %s", responder)
  184. add_file_headers(request, media_type, file_size, upload_name)
  185. try:
  186. with responder:
  187. yield responder.write_to_consumer(request)
  188. except Exception as e:
  189. # The majority of the time this will be due to the client having gone
  190. # away. Unfortunately, Twisted simply throws a generic exception at us
  191. # in that case.
  192. logger.warning("Failed to write to consumer: %s %s", type(e), e)
  193. # Unregister the producer, if it has one, so Twisted doesn't complain
  194. if request.producer:
  195. request.unregisterProducer()
  196. finish_request(request)
  197. class Responder(object):
  198. """Represents a response that can be streamed to the requester.
  199. Responder is a context manager which *must* be used, so that any resources
  200. held can be cleaned up.
  201. """
  202. def write_to_consumer(self, consumer):
  203. """Stream response into consumer
  204. Args:
  205. consumer (IConsumer)
  206. Returns:
  207. Deferred: Resolves once the response has finished being written
  208. """
  209. pass
  210. def __enter__(self):
  211. pass
  212. def __exit__(self, exc_type, exc_val, exc_tb):
  213. pass
  214. class FileInfo(object):
  215. """Details about a requested/uploaded file.
  216. Attributes:
  217. server_name (str): The server name where the media originated from,
  218. or None if local.
  219. file_id (str): The local ID of the file. For local files this is the
  220. same as the media_id
  221. url_cache (bool): If the file is for the url preview cache
  222. thumbnail (bool): Whether the file is a thumbnail or not.
  223. thumbnail_width (int)
  224. thumbnail_height (int)
  225. thumbnail_method (str)
  226. thumbnail_type (str): Content type of thumbnail, e.g. image/png
  227. """
  228. def __init__(
  229. self,
  230. server_name,
  231. file_id,
  232. url_cache=False,
  233. thumbnail=False,
  234. thumbnail_width=None,
  235. thumbnail_height=None,
  236. thumbnail_method=None,
  237. thumbnail_type=None,
  238. ):
  239. self.server_name = server_name
  240. self.file_id = file_id
  241. self.url_cache = url_cache
  242. self.thumbnail = thumbnail
  243. self.thumbnail_width = thumbnail_width
  244. self.thumbnail_height = thumbnail_height
  245. self.thumbnail_method = thumbnail_method
  246. self.thumbnail_type = thumbnail_type
  247. def get_filename_from_headers(headers):
  248. """
  249. Get the filename of the downloaded file by inspecting the
  250. Content-Disposition HTTP header.
  251. Args:
  252. headers (dict[bytes, list[bytes]]): The HTTP request headers.
  253. Returns:
  254. A Unicode string of the filename, or None.
  255. """
  256. content_disposition = headers.get(b"Content-Disposition", [b""])
  257. # No header, bail out.
  258. if not content_disposition[0]:
  259. return
  260. _, params = _parse_header(content_disposition[0])
  261. upload_name = None
  262. # First check if there is a valid UTF-8 filename
  263. upload_name_utf8 = params.get(b"filename*", None)
  264. if upload_name_utf8:
  265. if upload_name_utf8.lower().startswith(b"utf-8''"):
  266. upload_name_utf8 = upload_name_utf8[7:]
  267. # We have a filename*= section. This MUST be ASCII, and any UTF-8
  268. # bytes are %-quoted.
  269. try:
  270. # Once it is decoded, we can then unquote the %-encoded
  271. # parts strictly into a unicode string.
  272. upload_name = urllib.parse.unquote(
  273. upload_name_utf8.decode("ascii"), errors="strict"
  274. )
  275. except UnicodeDecodeError:
  276. # Incorrect UTF-8.
  277. pass
  278. # If there isn't check for an ascii name.
  279. if not upload_name:
  280. upload_name_ascii = params.get(b"filename", None)
  281. if upload_name_ascii and is_ascii(upload_name_ascii):
  282. upload_name = upload_name_ascii.decode("ascii")
  283. # This may be None here, indicating we did not find a matching name.
  284. return upload_name
  285. def _parse_header(line):
  286. """Parse a Content-type like header.
  287. Cargo-culted from `cgi`, but works on bytes rather than strings.
  288. Args:
  289. line (bytes): header to be parsed
  290. Returns:
  291. Tuple[bytes, dict[bytes, bytes]]:
  292. the main content-type, followed by the parameter dictionary
  293. """
  294. parts = _parseparam(b";" + line)
  295. key = next(parts)
  296. pdict = {}
  297. for p in parts:
  298. i = p.find(b"=")
  299. if i >= 0:
  300. name = p[:i].strip().lower()
  301. value = p[i + 1 :].strip()
  302. # strip double-quotes
  303. if len(value) >= 2 and value[0:1] == value[-1:] == b'"':
  304. value = value[1:-1]
  305. value = value.replace(b"\\\\", b"\\").replace(b'\\"', b'"')
  306. pdict[name] = value
  307. return key, pdict
  308. def _parseparam(s):
  309. """Generator which splits the input on ;, respecting double-quoted sequences
  310. Cargo-culted from `cgi`, but works on bytes rather than strings.
  311. Args:
  312. s (bytes): header to be parsed
  313. Returns:
  314. Iterable[bytes]: the split input
  315. """
  316. while s[:1] == b";":
  317. s = s[1:]
  318. # look for the next ;
  319. end = s.find(b";")
  320. # if there is an odd number of " marks between here and the next ;, skip to the
  321. # next ; instead
  322. while end > 0 and (s.count(b'"', 0, end) - s.count(b'\\"', 0, end)) % 2:
  323. end = s.find(b";", end + 1)
  324. if end < 0:
  325. end = len(s)
  326. f = s[:end]
  327. yield f.strip()
  328. s = s[end:]