#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

# Copyright (c) 2019 The ungoogled-chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
Module for the downloading, checking, and unpacking of necessary files into the source tree.
"""

import argparse
import configparser
import enum
import hashlib
import shutil
import ssl
import subprocess
import sys
import urllib.request
from pathlib import Path

from _common import ENCODING, USE_REGISTRY, ExtractorEnum, PlatformEnum, \
    get_logger, get_chromium_version, get_running_platform, add_common_params
from _extraction import extract_tar_file, extract_with_7z, extract_with_winrar

sys.path.insert(0, str(Path(__file__).parent / 'third_party'))
import schema #pylint: disable=wrong-import-position, wrong-import-order
sys.path.pop(0)
# Constants


class HashesURLEnum(str, enum.Enum):
    """Enum for supported hash URL schemes"""
    CHROMIUM = 'chromium'


class HashMismatchError(Exception):
    """Exception for computed hashes not matching expected hashes"""


class DownloadInfo: #pylint: disable=too-few-public-methods
    """Representation of a downloads.ini file for downloading files"""
    _hashes = ('md5', 'sha1', 'sha256', 'sha512')
    hash_url_delimiter = '|'
    _nonempty_keys = ('url', 'download_filename')
    _optional_keys = (
        'version',
        'strip_leading_dirs',
    )
    _passthrough_properties = (*_nonempty_keys, *_optional_keys, 'extractor', 'output_path')
    _ini_vars = {
        '_chromium_version': get_chromium_version(),
    }

    @staticmethod
    def _is_hash_url(value):
        return value.count(DownloadInfo.hash_url_delimiter) == 2 and value.split(
            DownloadInfo.hash_url_delimiter)[0] in iter(HashesURLEnum)
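
    # A hash_url value is three fields joined by the "|" delimiter:
    # <processor>|<hash file name>|<hash file URL>, where the processor must be
    # a HashesURLEnum member ("chromium" is the only one defined above).
    # A hypothetical example (the file name and URL are illustrative only):
    #   hash_url = chromium|example.tar.xz.hashes|https://example.com/example.tar.xz.hashes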
    _schema = schema.Schema({
        schema.Optional(schema.And(str, len)): {
            **{x: schema.And(str, len)
               for x in _nonempty_keys},
            'output_path': (lambda x: str(Path(x).relative_to(''))),
            **{schema.Optional(x): schema.And(str, len)
               for x in _optional_keys},
            schema.Optional('extractor'): schema.Or(ExtractorEnum.TAR, ExtractorEnum.SEVENZIP,
                                                    ExtractorEnum.WINRAR),
            schema.Optional(schema.Or(*_hashes)): schema.And(str, len),
            schema.Optional('hash_url'): lambda x: DownloadInfo._is_hash_url(x), #pylint: disable=unnecessary-lambda
        }
    })
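
    # A minimal sketch of a downloads.ini section this schema accepts. The
    # section name, URL, digest, and paths are hypothetical, not real entries;
    # %(version)s relies on configparser's default interpolation:
    #   [example-component]
    #   version = 1.0.0
    #   url = https://example.com/example-%(version)s.tar.gz
    #   download_filename = example-%(version)s.tar.gz
    #   output_path = third_party/example
    #   sha256 = e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
    #   strip_leading_dirs = example-1.0.0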
    class _DownloadsProperties: #pylint: disable=too-few-public-methods
        def __init__(self, section_dict, passthrough_properties, hashes):
            self._section_dict = section_dict
            self._passthrough_properties = passthrough_properties
            self._hashes = hashes

        def has_hash_url(self):
            """Returns a boolean indicating whether the current download has a hash URL"""
            return 'hash_url' in self._section_dict

        def __getattr__(self, name):
            if name in self._passthrough_properties:
                return self._section_dict.get(name, fallback=None)
            if name == 'hashes':
                hashes_dict = {}
                for hash_name in (*self._hashes, 'hash_url'):
                    value = self._section_dict.get(hash_name, fallback=None)
                    if value:
                        if hash_name == 'hash_url':
                            value = value.split(DownloadInfo.hash_url_delimiter)
                        hashes_dict[hash_name] = value
                return hashes_dict
            raise AttributeError('"{}" has no attribute "{}"'.format(type(self).__name__, name))
    def _parse_data(self, path):
        """
        Parses an INI file located at path

        Raises schema.SchemaError if validation fails
        """
        def _section_generator(data):
            for section in data:
                if section == configparser.DEFAULTSECT:
                    continue
                yield section, dict(
                    filter(lambda x: x[0] not in self._ini_vars, data.items(section)))

        new_data = configparser.ConfigParser(defaults=self._ini_vars)
        with path.open(encoding=ENCODING) as ini_file:
            new_data.read_file(ini_file, source=str(path))
        try:
            self._schema.validate(dict(_section_generator(new_data)))
        except schema.SchemaError as exc:
            get_logger().error('downloads.ini failed schema validation (located in %s)', path)
            raise exc
        return new_data

    def __init__(self, ini_paths):
        """Reads an iterable of pathlib.Path to downloads.ini files"""
        self._data = configparser.ConfigParser()
        for path in ini_paths:
            self._data.read_dict(self._parse_data(path))

    def __getitem__(self, section):
        """
        Returns an object with keys as attributes and
        values already pre-processed strings
        """
        return self._DownloadsProperties(self._data[section], self._passthrough_properties,
                                         self._hashes)

    def __contains__(self, item):
        """
        Returns True if item is a name of a section; False otherwise.
        """
        return self._data.has_section(item)

    def __iter__(self):
        """Returns an iterator over the section names"""
        return iter(self._data.sections())

    def properties_iter(self):
        """Iterator for the download properties sorted by output path"""
        return sorted(map(lambda x: (x, self[x]), self),
                      key=(lambda x: str(Path(x[1].output_path))))

    def check_sections_exist(self, section_names):
        """Raises KeyError if any of the given section names are not defined"""
        if not section_names:
            return
        for name in section_names:
            if name not in self:
                raise KeyError('"{}" has no section "{}"'.format(type(self).__name__, name))


class _UrlRetrieveReportHook: #pylint: disable=too-few-public-methods
    """Hook for urllib.request.urlretrieve to log progress information to console"""
    def __init__(self):
        self._max_len_printed = 0
        self._last_percentage = None

    def __call__(self, block_count, block_size, total_size):
        # Use total_blocks to handle case total_size < block_size
        # total_blocks is ceiling of total_size / block_size
        # Ceiling division from: https://stackoverflow.com/a/17511341
        total_blocks = -(-total_size // block_size)
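        # For example, with total_size=10 and block_size=4, -(-10 // 4) == 3
        # blocks, whereas plain floor division (10 // 4 == 2) would undercount.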
        if total_blocks > 0:
            # Do not needlessly update the console. Since the console is
            # updated synchronously, we don't want updating the console to
            # bottleneck downloading. Thus, only refresh the output when the
            # displayed value should change.
            percentage = round(block_count / total_blocks, ndigits=3)
            if percentage == self._last_percentage:
                return
            self._last_percentage = percentage
            print('\r' + ' ' * self._max_len_printed, end='')
            status_line = 'Progress: {:.1%} of {:,d} B'.format(percentage, total_size)
        else:
            downloaded_estimate = block_count * block_size
            status_line = 'Progress: {:,d} B of unknown size'.format(downloaded_estimate)
        self._max_len_printed = len(status_line)
        print('\r' + status_line, end='')


def _download_via_urllib(url, file_path, show_progress, disable_ssl_verification):
    reporthook = None
    if show_progress:
        reporthook = _UrlRetrieveReportHook()
    if disable_ssl_verification:
        # TODO: Remove this or properly implement disabling SSL certificate verification
        orig_https_context = ssl._create_default_https_context #pylint: disable=protected-access
        ssl._create_default_https_context = ssl._create_unverified_context #pylint: disable=protected-access
    try:
        urllib.request.urlretrieve(url, str(file_path), reporthook=reporthook)
    finally:
        # Try to reduce damage of hack by reverting original HTTPS context ASAP
        if disable_ssl_verification:
            ssl._create_default_https_context = orig_https_context #pylint: disable=protected-access
    if show_progress:
        print()


def _download_if_needed(file_path, url, show_progress, disable_ssl_verification):
    """
    Downloads a file from url to the specified path file_path if necessary.

    If show_progress is True, download progress is printed to the console.
    """
    if file_path.exists():
        get_logger().info('%s already exists. Skipping download.', file_path)
        return

    # File name for the partially downloaded file
    tmp_file_path = file_path.with_name(file_path.name + '.partial')

    if tmp_file_path.exists():
        get_logger().debug('Resuming downloading URL %s ...', url)
    else:
        get_logger().debug('Downloading URL %s ...', url)

    # Perform download
    if shutil.which('curl'):
        get_logger().debug('Using curl')
        try:
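            # curl flags: -f fails on HTTP errors, -L follows redirects, and
            # "-C -" resumes from the length of the existing .partial file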
            subprocess.run(['curl', '-fL', '-o', str(tmp_file_path), '-C', '-', url], check=True)
        except subprocess.CalledProcessError as exc:
            get_logger().error('curl failed. Re-run the download command to resume downloading.')
            raise exc
    else:
        get_logger().debug('Using urllib')
        _download_via_urllib(url, tmp_file_path, show_progress, disable_ssl_verification)

    # Download complete; rename file
    tmp_file_path.rename(file_path)


def _chromium_hashes_generator(hashes_path):
    with hashes_path.open(encoding=ENCODING) as hashes_file:
        hash_lines = hashes_file.read().splitlines()
    for hash_name, hash_hex, _ in map(lambda x: x.lower().split(' '), hash_lines):
        if hash_name in hashlib.algorithms_available:
            yield hash_name, hash_hex
        else:
            get_logger().warning('Skipping unknown hash algorithm: %s', hash_name)
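

# A minimal sketch of the hashes file layout the generator above expects:
# one algorithm per line, fields split on spaces, with the third field
# (typically a file name) ignored. The digests and name are hypothetical:
#   md5 0cc175b9c0f1b6a831c399e269772661 example.tar.xz
#   sha256 e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 example.tar.xz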


def _get_hash_pairs(download_properties, cache_dir):
    """Generator of (hash_name, hash_hex) for the given download"""
    for entry_type, entry_value in download_properties.hashes.items():
        if entry_type == 'hash_url':
            hash_processor, hash_filename, _ = entry_value
            if hash_processor == 'chromium':
                yield from _chromium_hashes_generator(cache_dir / hash_filename)
            else:
                raise ValueError('Unknown hash_url processor: %s' % hash_processor)
        else:
            yield entry_type, entry_value


def retrieve_downloads(download_info,
                       cache_dir,
                       components,
                       show_progress,
                       disable_ssl_verification=False):
    """
    Retrieve downloads into the downloads cache.

    download_info is the DownloadInfo of downloads to retrieve.
    cache_dir is the pathlib.Path to the downloads cache.
    components is a list of component names to download, if not empty.
    show_progress is a boolean indicating if download progress is printed to the console.
    disable_ssl_verification is a boolean indicating if certificate verification
        should be disabled for downloads using HTTPS.

    Raises FileNotFoundError if the downloads path does not exist.
    Raises NotADirectoryError if the downloads path is not a directory.
    """
    if not cache_dir.exists():
        raise FileNotFoundError(cache_dir)
    if not cache_dir.is_dir():
        raise NotADirectoryError(cache_dir)
    for download_name, download_properties in download_info.properties_iter():
        if components and download_name not in components:
            continue
        get_logger().info('Downloading "%s" to "%s" ...', download_name,
                          download_properties.download_filename)
        download_path = cache_dir / download_properties.download_filename
        _download_if_needed(download_path, download_properties.url, show_progress,
                            disable_ssl_verification)
        if download_properties.has_hash_url():
            get_logger().info('Downloading hashes for "%s"', download_name)
            _, hash_filename, hash_url = download_properties.hashes['hash_url']
            _download_if_needed(cache_dir / hash_filename, hash_url, show_progress,
                                disable_ssl_verification)


def check_downloads(download_info, cache_dir, components, chunk_bytes=262144):
    """
    Check integrity of the downloads cache.

    download_info is the DownloadInfo of downloads to check.
    cache_dir is the pathlib.Path to the downloads cache.
    components is a list of component names to check, if not empty.
    chunk_bytes is the size in bytes of each chunk to read when hashing.

    Raises HashMismatchError when the computed and expected hashes do not match.
    """
    logger = get_logger()
    for download_name, download_properties in download_info.properties_iter():
        if components and download_name not in components:
            continue
        logger.info('Verifying hashes for "%s" ...', download_name)
        download_path = cache_dir / download_properties.download_filename
        for hash_name, hash_hex in _get_hash_pairs(download_properties, cache_dir):
            logger.info('Verifying %s hash...', hash_name)
            hasher = hashlib.new(hash_name)
            with download_path.open('rb') as file_obj:
                # Read the file in chunks (262144 bytes by default) to avoid
                # loading it into memory all at once
                chunk = file_obj.read(chunk_bytes)
                while chunk:
                    hasher.update(chunk)
                    chunk = file_obj.read(chunk_bytes)
            if hasher.hexdigest().lower() != hash_hex.lower():
                raise HashMismatchError(download_path)
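

# To compute a digest for a new downloads.ini entry, any standard checksum tool
# works; a minimal sketch with the stdlib (the file name is hypothetical):
#   python3 -c "import hashlib; print(hashlib.sha256(open('example.tar.xz', 'rb').read()).hexdigest())"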


def unpack_downloads(download_info,
                     cache_dir,
                     components,
                     output_dir,
                     skip_unused,
                     sysroot,
                     extractors=None):
    """
    Unpack downloads in the downloads cache to output_dir. Assumes all downloads are retrieved.

    download_info is the DownloadInfo of downloads to unpack.
    cache_dir is the pathlib.Path directory containing the download cache.
    components is a list of component names to unpack, if not empty.
    output_dir is the pathlib.Path directory to unpack the downloads to.
    skip_unused is a boolean indicating whether to skip extraction of unused paths.
    sysroot is a string containing the sysroot to unpack, if any.
    extractors is a dictionary of PlatformEnum to a command or path to the
        extractor binary. Defaults to 'tar' for tar, and '_use_registry' for 7-Zip and WinRAR.

    May raise undetermined exceptions during archive unpacking.
    """
    for download_name, download_properties in download_info.properties_iter():
        if components and download_name not in components:
            continue
        download_path = cache_dir / download_properties.download_filename
        get_logger().info('Unpacking "%s" to %s ...', download_name,
                          download_properties.output_path)
        extractor_name = download_properties.extractor or ExtractorEnum.TAR
        if extractor_name == ExtractorEnum.SEVENZIP:
            extractor_func = extract_with_7z
        elif extractor_name == ExtractorEnum.WINRAR:
            extractor_func = extract_with_winrar
        elif extractor_name == ExtractorEnum.TAR:
            extractor_func = extract_tar_file
        else:
            raise NotImplementedError(extractor_name)
        if download_properties.strip_leading_dirs is None:
            strip_leading_dirs_path = None
        else:
            strip_leading_dirs_path = Path(download_properties.strip_leading_dirs)
        extractor_func(archive_path=download_path,
                       output_dir=output_dir / Path(download_properties.output_path),
                       relative_to=strip_leading_dirs_path,
                       skip_unused=skip_unused,
                       sysroot=sysroot,
                       extractors=extractors)


def _add_common_args(parser):
    parser.add_argument(
        '-i',
        '--ini',
        type=Path,
        nargs='+',
        help='The downloads INI to parse for downloads. Can be specified multiple times.')
    parser.add_argument('-c',
                        '--cache',
                        type=Path,
                        required=True,
                        help='Path to the directory to cache downloads.')


def _retrieve_callback(args):
    info = DownloadInfo(args.ini)
    info.check_sections_exist(args.components)
    retrieve_downloads(info, args.cache, args.components, args.show_progress,
                       args.disable_ssl_verification)
    try:
        check_downloads(info, args.cache, args.components)
    except HashMismatchError as exc:
        get_logger().error('File checksum does not match: %s', exc)
        sys.exit(1)


def _unpack_callback(args):
    extractors = {
        ExtractorEnum.SEVENZIP: args.sevenz_path,
        ExtractorEnum.WINRAR: args.winrar_path,
        ExtractorEnum.TAR: args.tar_path,
    }
    info = DownloadInfo(args.ini)
    info.check_sections_exist(args.components)
    unpack_downloads(info, args.cache, args.components, args.output, args.skip_unused, args.sysroot,
                     extractors)


def main():
    """CLI Entrypoint"""
    parser = argparse.ArgumentParser(description=__doc__)
    add_common_params(parser)
    subparsers = parser.add_subparsers(title='Download actions', dest='action')

    # retrieve
    retrieve_parser = subparsers.add_parser(
        'retrieve',
        help='Retrieve and check download files',
        description=('Retrieves and checks downloads without unpacking. '
                     'The downloader will attempt to use the CLI command "curl". '
                     'If it is not present, Python\'s urllib will be used. However, only '
                     'the CLI-based downloaders can be resumed if the download is aborted.'))
    _add_common_args(retrieve_parser)
    retrieve_parser.add_argument('--components',
                                 nargs='+',
                                 metavar='COMP',
                                 help='Retrieve only these components. Default: all')
    retrieve_parser.add_argument('--hide-progress-bar',
                                 action='store_false',
                                 dest='show_progress',
                                 help='Hide the download progress.')
    retrieve_parser.add_argument(
        '--disable-ssl-verification',
        action='store_true',
        help='Disables certificate verification for downloads using HTTPS.')
    retrieve_parser.set_defaults(callback=_retrieve_callback)

    def _default_extractor_path(name):
        return USE_REGISTRY if get_running_platform() == PlatformEnum.WINDOWS else name

    # unpack
    unpack_parser = subparsers.add_parser(
        'unpack',
        help='Unpack download files',
        description='Verifies hashes of and unpacks download files into the specified directory.')
    _add_common_args(unpack_parser)
    unpack_parser.add_argument('--components',
                               nargs='+',
                               metavar='COMP',
                               help='Unpack only these components. Default: all')
    unpack_parser.add_argument('--tar-path',
                               default='tar',
                               help=('(Linux and macOS only) Command or path to the BSD or GNU tar '
                                     'binary for extraction. Default: %(default)s'))
    unpack_parser.add_argument(
        '--7z-path',
        dest='sevenz_path',
        default=_default_extractor_path('7z'),
        help=('Command or path to 7-Zip\'s "7z" binary. If "_use_registry" is '
              'specified, determine the path from the registry. Default: %(default)s'))
    unpack_parser.add_argument(
        '--winrar-path',
        dest='winrar_path',
        default=USE_REGISTRY,
        help=('Command or path to WinRAR\'s "winrar" binary. If "_use_registry" is '
              'specified, determine the path from the registry. Default: %(default)s'))
    unpack_parser.add_argument('output', type=Path, help='The directory to unpack to.')
    unpack_parser.add_argument('--skip-unused',
                               action='store_true',
                               help='Skip extraction of unused directories (CONTINGENT_PATHS).')
    unpack_parser.add_argument('--sysroot',
                               choices=('amd64', 'i386'),
                               help=('Extracts the sysroot for the given architecture '
                                     'when --skip-unused is set.'))
    unpack_parser.set_defaults(callback=_unpack_callback)

    args = parser.parse_args()
    args.callback(args)


if __name__ == '__main__':
    main()
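
# Example invocations, assuming a downloads.ini and the build paths used by the
# ungoogled-chromium docs (the paths are illustrative, not part of this script):
#   python3 downloads.py retrieve -i downloads.ini -c build/download_cache
#   python3 downloads.py unpack -i downloads.ini -c build/download_cache build/src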