_base.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401
  1. # -*- coding: utf-8 -*-
  2. # Copyright 2014-2016 OpenMarket Ltd
  3. # Copyright 2019 New Vector Ltd
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. import logging
  17. import os
  18. import urllib
  19. from twisted.internet import defer
  20. from twisted.protocols.basic import FileSender
  21. from synapse.api.errors import Codes, SynapseError, cs_error
  22. from synapse.http.server import finish_request, respond_with_json
  23. from synapse.logging.context import make_deferred_yieldable
  24. from synapse.util.stringutils import is_ascii
  25. logger = logging.getLogger(__name__)
# Content types that are treated as textual: when one of these is served without
# an explicit charset parameter, add_file_headers appends "; charset=UTF-8" to the
# Content-Type header. Note that the list is not limited to "text/*" types; it
# also covers JSON, RTF and SVG.
TEXT_CONTENT_TYPES = [
    "text/css",
    "text/csv",
    "text/html",
    "text/calendar",
    "text/plain",
    "text/javascript",
    "application/json",
    "application/ld+json",
    "application/rtf",
    "image/svg+xml",
    "text/xml",
]
  41. def parse_media_id(request):
  42. try:
  43. # This allows users to append e.g. /test.png to the URL. Useful for
  44. # clients that parse the URL to see content type.
  45. server_name, media_id = request.postpath[:2]
  46. if isinstance(server_name, bytes):
  47. server_name = server_name.decode("utf-8")
  48. media_id = media_id.decode("utf8")
  49. file_name = None
  50. if len(request.postpath) > 2:
  51. try:
  52. file_name = urllib.parse.unquote(request.postpath[-1].decode("utf-8"))
  53. except UnicodeDecodeError:
  54. pass
  55. return server_name, media_id, file_name
  56. except Exception:
  57. raise SynapseError(
  58. 404, "Invalid media id token %r" % (request.postpath,), Codes.UNKNOWN
  59. )
  60. def respond_404(request):
  61. respond_with_json(
  62. request,
  63. 404,
  64. cs_error("Not found %r" % (request.postpath,), code=Codes.NOT_FOUND),
  65. send_cors=True,
  66. )
  67. @defer.inlineCallbacks
  68. def respond_with_file(request, media_type, file_path, file_size=None, upload_name=None):
  69. logger.debug("Responding with %r", file_path)
  70. if os.path.isfile(file_path):
  71. if file_size is None:
  72. stat = os.stat(file_path)
  73. file_size = stat.st_size
  74. add_file_headers(request, media_type, file_size, upload_name)
  75. with open(file_path, "rb") as f:
  76. yield make_deferred_yieldable(FileSender().beginFileTransfer(f, request))
  77. finish_request(request)
  78. else:
  79. respond_404(request)
  80. def add_file_headers(request, media_type, file_size, upload_name):
  81. """Adds the correct response headers in preparation for responding with the
  82. media.
  83. Args:
  84. request (twisted.web.http.Request)
  85. media_type (str): The media/content type.
  86. file_size (int): Size in bytes of the media, if known.
  87. upload_name (str): The name of the requested file, if any.
  88. """
  89. def _quote(x):
  90. return urllib.parse.quote(x.encode("utf-8"))
  91. # Default to a UTF-8 charset for text content types.
  92. # ex, uses UTF-8 for 'text/css' but not 'text/css; charset=UTF-16'
  93. if media_type.lower() in TEXT_CONTENT_TYPES:
  94. content_type = media_type + "; charset=UTF-8"
  95. else:
  96. content_type = media_type
  97. request.setHeader(b"Content-Type", content_type.encode("UTF-8"))
  98. if upload_name:
  99. # RFC6266 section 4.1 [1] defines both `filename` and `filename*`.
  100. #
  101. # `filename` is defined to be a `value`, which is defined by RFC2616
  102. # section 3.6 [2] to be a `token` or a `quoted-string`, where a `token`
  103. # is (essentially) a single US-ASCII word, and a `quoted-string` is a
  104. # US-ASCII string surrounded by double-quotes, using backslash as an
  105. # escape charater. Note that %-encoding is *not* permitted.
  106. #
  107. # `filename*` is defined to be an `ext-value`, which is defined in
  108. # RFC5987 section 3.2.1 [3] to be `charset "'" [ language ] "'" value-chars`,
  109. # where `value-chars` is essentially a %-encoded string in the given charset.
  110. #
  111. # [1]: https://tools.ietf.org/html/rfc6266#section-4.1
  112. # [2]: https://tools.ietf.org/html/rfc2616#section-3.6
  113. # [3]: https://tools.ietf.org/html/rfc5987#section-3.2.1
  114. # We avoid the quoted-string version of `filename`, because (a) synapse didn't
  115. # correctly interpret those as of 0.99.2 and (b) they are a bit of a pain and we
  116. # may as well just do the filename* version.
  117. if _can_encode_filename_as_token(upload_name):
  118. disposition = "inline; filename=%s" % (upload_name,)
  119. else:
  120. disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name),)
  121. request.setHeader(b"Content-Disposition", disposition.encode("ascii"))
  122. # cache for at least a day.
  123. # XXX: we might want to turn this off for data we don't want to
  124. # recommend caching as it's sensitive or private - or at least
  125. # select private. don't bother setting Expires as all our
  126. # clients are smart enough to be happy with Cache-Control
  127. request.setHeader(b"Cache-Control", b"public,max-age=86400,s-maxage=86400")
  128. request.setHeader(b"Content-Length", b"%d" % (file_size,))
  129. # separators as defined in RFC2616. SP and HT are handled separately.
  130. # see _can_encode_filename_as_token.
  131. _FILENAME_SEPARATOR_CHARS = {
  132. "(",
  133. ")",
  134. "<",
  135. ">",
  136. "@",
  137. ",",
  138. ";",
  139. ":",
  140. "\\",
  141. '"',
  142. "/",
  143. "[",
  144. "]",
  145. "?",
  146. "=",
  147. "{",
  148. "}",
  149. }
  150. def _can_encode_filename_as_token(x):
  151. for c in x:
  152. # from RFC2616:
  153. #
  154. # token = 1*<any CHAR except CTLs or separators>
  155. #
  156. # separators = "(" | ")" | "<" | ">" | "@"
  157. # | "," | ";" | ":" | "\" | <">
  158. # | "/" | "[" | "]" | "?" | "="
  159. # | "{" | "}" | SP | HT
  160. #
  161. # CHAR = <any US-ASCII character (octets 0 - 127)>
  162. #
  163. # CTL = <any US-ASCII control character
  164. # (octets 0 - 31) and DEL (127)>
  165. #
  166. if ord(c) >= 127 or ord(c) <= 32 or c in _FILENAME_SEPARATOR_CHARS:
  167. return False
  168. return True
  169. @defer.inlineCallbacks
  170. def respond_with_responder(request, responder, media_type, file_size, upload_name=None):
  171. """Responds to the request with given responder. If responder is None then
  172. returns 404.
  173. Args:
  174. request (twisted.web.http.Request)
  175. responder (Responder|None)
  176. media_type (str): The media/content type.
  177. file_size (int|None): Size in bytes of the media. If not known it should be None
  178. upload_name (str|None): The name of the requested file, if any.
  179. """
  180. if not responder:
  181. respond_404(request)
  182. return
  183. logger.debug("Responding to media request with responder %s", responder)
  184. add_file_headers(request, media_type, file_size, upload_name)
  185. try:
  186. with responder:
  187. yield responder.write_to_consumer(request)
  188. except Exception as e:
  189. # The majority of the time this will be due to the client having gone
  190. # away. Unfortunately, Twisted simply throws a generic exception at us
  191. # in that case.
  192. logger.warning("Failed to write to consumer: %s %s", type(e), e)
  193. # Unregister the producer, if it has one, so Twisted doesn't complain
  194. if request.producer:
  195. request.unregisterProducer()
  196. finish_request(request)
  197. class Responder(object):
  198. """Represents a response that can be streamed to the requester.
  199. Responder is a context manager which *must* be used, so that any resources
  200. held can be cleaned up.
  201. """
  202. def write_to_consumer(self, consumer):
  203. """Stream response into consumer
  204. Args:
  205. consumer (IConsumer)
  206. Returns:
  207. Deferred: Resolves once the response has finished being written
  208. """
  209. pass
  210. def __enter__(self):
  211. pass
  212. def __exit__(self, exc_type, exc_val, exc_tb):
  213. pass
  214. class FileInfo(object):
  215. """Details about a requested/uploaded file.
  216. Attributes:
  217. server_name (str): The server name where the media originated from,
  218. or None if local.
  219. file_id (str): The local ID of the file. For local files this is the
  220. same as the media_id
  221. url_cache (bool): If the file is for the url preview cache
  222. thumbnail (bool): Whether the file is a thumbnail or not.
  223. thumbnail_width (int)
  224. thumbnail_height (int)
  225. thumbnail_method (str)
  226. thumbnail_type (str): Content type of thumbnail, e.g. image/png
  227. """
  228. def __init__(
  229. self,
  230. server_name,
  231. file_id,
  232. url_cache=False,
  233. thumbnail=False,
  234. thumbnail_width=None,
  235. thumbnail_height=None,
  236. thumbnail_method=None,
  237. thumbnail_type=None,
  238. ):
  239. self.server_name = server_name
  240. self.file_id = file_id
  241. self.url_cache = url_cache
  242. self.thumbnail = thumbnail
  243. self.thumbnail_width = thumbnail_width
  244. self.thumbnail_height = thumbnail_height
  245. self.thumbnail_method = thumbnail_method
  246. self.thumbnail_type = thumbnail_type
  247. def get_filename_from_headers(headers):
  248. """
  249. Get the filename of the downloaded file by inspecting the
  250. Content-Disposition HTTP header.
  251. Args:
  252. headers (dict[bytes, list[bytes]]): The HTTP request headers.
  253. Returns:
  254. A Unicode string of the filename, or None.
  255. """
  256. content_disposition = headers.get(b"Content-Disposition", [b""])
  257. # No header, bail out.
  258. if not content_disposition[0]:
  259. return
  260. _, params = _parse_header(content_disposition[0])
  261. upload_name = None
  262. # First check if there is a valid UTF-8 filename
  263. upload_name_utf8 = params.get(b"filename*", None)
  264. if upload_name_utf8:
  265. if upload_name_utf8.lower().startswith(b"utf-8''"):
  266. upload_name_utf8 = upload_name_utf8[7:]
  267. # We have a filename*= section. This MUST be ASCII, and any UTF-8
  268. # bytes are %-quoted.
  269. try:
  270. # Once it is decoded, we can then unquote the %-encoded
  271. # parts strictly into a unicode string.
  272. upload_name = urllib.parse.unquote(
  273. upload_name_utf8.decode("ascii"), errors="strict"
  274. )
  275. except UnicodeDecodeError:
  276. # Incorrect UTF-8.
  277. pass
  278. # If there isn't check for an ascii name.
  279. if not upload_name:
  280. upload_name_ascii = params.get(b"filename", None)
  281. if upload_name_ascii and is_ascii(upload_name_ascii):
  282. upload_name = upload_name_ascii.decode("ascii")
  283. # This may be None here, indicating we did not find a matching name.
  284. return upload_name
  285. def _parse_header(line):
  286. """Parse a Content-type like header.
  287. Cargo-culted from `cgi`, but works on bytes rather than strings.
  288. Args:
  289. line (bytes): header to be parsed
  290. Returns:
  291. Tuple[bytes, dict[bytes, bytes]]:
  292. the main content-type, followed by the parameter dictionary
  293. """
  294. parts = _parseparam(b";" + line)
  295. key = next(parts)
  296. pdict = {}
  297. for p in parts:
  298. i = p.find(b"=")
  299. if i >= 0:
  300. name = p[:i].strip().lower()
  301. value = p[i + 1 :].strip()
  302. # strip double-quotes
  303. if len(value) >= 2 and value[0:1] == value[-1:] == b'"':
  304. value = value[1:-1]
  305. value = value.replace(b"\\\\", b"\\").replace(b'\\"', b'"')
  306. pdict[name] = value
  307. return key, pdict
  308. def _parseparam(s):
  309. """Generator which splits the input on ;, respecting double-quoted sequences
  310. Cargo-culted from `cgi`, but works on bytes rather than strings.
  311. Args:
  312. s (bytes): header to be parsed
  313. Returns:
  314. Iterable[bytes]: the split input
  315. """
  316. while s[:1] == b";":
  317. s = s[1:]
  318. # look for the next ;
  319. end = s.find(b";")
  320. # if there is an odd number of " marks between here and the next ;, skip to the
  321. # next ; instead
  322. while end > 0 and (s.count(b'"', 0, end) - s.count(b'\\"', 0, end)) % 2:
  323. end = s.find(b";", end + 1)
  324. if end < 0:
  325. end = len(s)
  326. f = s[:end]
  327. yield f.strip()
  328. s = s[end:]