_extraction.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 The ungoogled-chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
Archive extraction utilities
"""
  8. import os
  9. import shutil
  10. import subprocess
  11. import tarfile
  12. from pathlib import Path, PurePosixPath
  13. from _common import (USE_REGISTRY, PlatformEnum, ExtractorEnum, get_logger, get_running_platform)
  14. from prune_binaries import CONTINGENT_PATHS
  15. DEFAULT_EXTRACTORS = {
  16. ExtractorEnum.SEVENZIP: USE_REGISTRY,
  17. ExtractorEnum.TAR: 'tar',
  18. ExtractorEnum.WINRAR: USE_REGISTRY,
  19. }
  20. def _find_7z_by_registry():
  21. """
  22. Return a string to 7-zip's 7z.exe from the Windows Registry.
  23. """
  24. import winreg #pylint: disable=import-error, import-outside-toplevel
  25. sub_key_7zfm = 'SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\App Paths\\7zFM.exe'
  26. try:
  27. with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, sub_key_7zfm) as key_handle:
  28. sevenzipfm_dir = winreg.QueryValueEx(key_handle, 'Path')[0]
  29. except OSError:
  30. get_logger().exception('Unable to locate 7-zip from the Windows Registry')
  31. raise
  32. sevenzip_path = Path(sevenzipfm_dir, '7z.exe')
  33. if not sevenzip_path.is_file():
  34. get_logger().error('7z.exe not found at path from registry: %s', sevenzip_path)
  35. return sevenzip_path
  36. def _find_winrar_by_registry():
  37. """
  38. Return a string to WinRAR's WinRAR.exe from the Windows Registry.
  39. """
  40. import winreg #pylint: disable=import-error, import-outside-toplevel
  41. sub_key_winrar = 'SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\App Paths\\WinRAR.exe'
  42. try:
  43. with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, sub_key_winrar) as key_handle:
  44. winrar_dir = winreg.QueryValueEx(key_handle, 'Path')[0]
  45. except OSError:
  46. get_logger().exception('Unable to locale WinRAR from the Windows Registry')
  47. raise
  48. winrar_path = Path(winrar_dir, 'WinRAR.exe')
  49. if not winrar_path.is_file():
  50. get_logger().error('WinRAR.exe not found at path from registry: %s', winrar_path)
  51. return winrar_path
  52. def _find_extractor_by_cmd(extractor_cmd):
  53. """Returns a string path to the binary; None if it couldn't be found"""
  54. if not extractor_cmd:
  55. return None
  56. if Path(extractor_cmd).is_file():
  57. return extractor_cmd
  58. return shutil.which(extractor_cmd)
  59. def _process_relative_to(unpack_root, relative_to):
  60. """
  61. For an extractor that doesn't support an automatic transform, move the extracted
  62. contents from the relative_to/ directory to the unpack_root
  63. If relative_to is None, nothing is done.
  64. """
  65. if relative_to is None:
  66. return
  67. relative_root = unpack_root / relative_to
  68. if not relative_root.is_dir():
  69. get_logger().error('Could not find relative_to directory in extracted files: %s',
  70. relative_to)
  71. raise Exception()
  72. for src_path in relative_root.iterdir():
  73. dest_path = unpack_root / src_path.name
  74. src_path.rename(dest_path)
  75. relative_root.rmdir()
  76. def _extract_tar_with_7z(binary, archive_path, output_dir, relative_to, skip_unused, sysroot):
  77. get_logger().debug('Using 7-zip extractor')
  78. if not relative_to is None and (output_dir / relative_to).exists():
  79. get_logger().error('Temporary unpacking directory already exists: %s',
  80. output_dir / relative_to)
  81. raise Exception()
  82. cmd1 = (binary, 'x', str(archive_path), '-so')
  83. cmd2 = (binary, 'x', '-si', '-aoa', '-ttar', '-o{}'.format(str(output_dir)))
  84. if skip_unused:
  85. for cpath in CONTINGENT_PATHS:
  86. if sysroot and f'{sysroot}-sysroot' in cpath:
  87. continue
  88. cmd2 += ('-x!%s/%s' % (str(relative_to), cpath[:-1]), )
  89. get_logger().debug('7z command line: %s | %s', ' '.join(cmd1), ' '.join(cmd2))
  90. proc1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE)
  91. proc2 = subprocess.Popen(cmd2, stdin=proc1.stdout, stdout=subprocess.PIPE)
  92. proc1.stdout.close()
  93. (stdout_data, stderr_data) = proc2.communicate()
  94. if proc2.returncode != 0:
  95. get_logger().error('7z commands returned non-zero status: %s', proc2.returncode)
  96. get_logger().debug('stdout: %s', stdout_data)
  97. get_logger().debug('stderr: %s', stderr_data)
  98. raise Exception()
  99. _process_relative_to(output_dir, relative_to)
  100. def _extract_tar_with_tar(binary, archive_path, output_dir, relative_to, skip_unused, sysroot):
  101. get_logger().debug('Using BSD or GNU tar extractor')
  102. output_dir.mkdir(exist_ok=True)
  103. cmd = (binary, '-xf', str(archive_path), '-C', str(output_dir))
  104. if skip_unused:
  105. for cpath in CONTINGENT_PATHS:
  106. if sysroot and f'{sysroot}-sysroot' in cpath:
  107. continue
  108. cmd += ('--exclude=%s/%s' % (str(relative_to), cpath[:-1]), )
  109. get_logger().debug('tar command line: %s', ' '.join(cmd))
  110. result = subprocess.run(cmd, check=False)
  111. if result.returncode != 0:
  112. get_logger().error('tar command returned %s', result.returncode)
  113. raise Exception()
  114. # for gnu tar, the --transform option could be used. but to keep compatibility with
  115. # bsdtar on macos, we just do this ourselves
  116. _process_relative_to(output_dir, relative_to)
  117. def _extract_tar_with_winrar(binary, archive_path, output_dir, relative_to, skip_unused, sysroot):
  118. get_logger().debug('Using WinRAR extractor')
  119. output_dir.mkdir(exist_ok=True)
  120. cmd = (binary, 'x', '-o+', str(archive_path), str(output_dir))
  121. if skip_unused:
  122. for cpath in CONTINGENT_PATHS:
  123. if sysroot and f'{sysroot}-sysroot' in cpath:
  124. continue
  125. cmd += ('-x%s%s%s' % (str(relative_to), os.sep, cpath[:-1].replace('/')), )
  126. get_logger().debug('WinRAR command line: %s', ' '.join(cmd))
  127. result = subprocess.run(cmd, check=False)
  128. if result.returncode != 0:
  129. get_logger().error('WinRAR command returned %s', result.returncode)
  130. raise Exception()
  131. _process_relative_to(output_dir, relative_to)
  132. def _extract_tar_with_python(archive_path, output_dir, relative_to, skip_unused, sysroot):
  133. get_logger().debug('Using pure Python tar extractor')
  134. class NoAppendList(list):
  135. """Hack to workaround memory issues with large tar files"""
  136. def append(self, obj):
  137. pass
  138. # Simple hack to check if symlinks are supported
  139. try:
  140. os.symlink('', '')
  141. except FileNotFoundError:
  142. # Symlinks probably supported
  143. symlink_supported = True
  144. except OSError:
  145. # Symlinks probably not supported
  146. get_logger().info('System does not support symlinks. Ignoring them.')
  147. symlink_supported = False
  148. except BaseException:
  149. # Unexpected exception
  150. get_logger().exception('Unexpected exception during symlink support check.')
  151. raise
  152. with tarfile.open(str(archive_path), 'r|%s' % archive_path.suffix[1:]) as tar_file_obj:
  153. tar_file_obj.members = NoAppendList()
  154. for tarinfo in tar_file_obj:
  155. try:
  156. if skip_unused and [
  157. cpath for cpath in CONTINGENT_PATHS
  158. if tarinfo.name.startswith(str(relative_to) + '/' + cpath)
  159. and not (sysroot and f'{sysroot}-sysroot' in cpath)
  160. ]:
  161. continue
  162. if relative_to is None:
  163. destination = output_dir / PurePosixPath(tarinfo.name)
  164. else:
  165. destination = output_dir / PurePosixPath(tarinfo.name).relative_to(relative_to)
  166. if tarinfo.issym() and not symlink_supported:
  167. # In this situation, TarFile.makelink() will try to create a copy of the
  168. # target. But this fails because TarFile.members is empty
  169. # But if symlinks are not supported, it's safe to assume that symlinks
  170. # aren't needed. The only situation where this happens is on Windows.
  171. continue
  172. if tarinfo.islnk():
  173. # Derived from TarFile.extract()
  174. new_target = output_dir / PurePosixPath(
  175. tarinfo.linkname).relative_to(relative_to)
  176. tarinfo._link_target = new_target.as_posix() # pylint: disable=protected-access
  177. if destination.is_symlink():
  178. destination.unlink()
  179. tar_file_obj._extract_member(tarinfo, str(destination)) # pylint: disable=protected-access
  180. except BaseException:
  181. get_logger().exception('Exception thrown for tar member: %s', tarinfo.name)
  182. raise
  183. def extract_tar_file(archive_path, output_dir, relative_to, skip_unused, sysroot, extractors=None):
  184. """
  185. Extract regular or compressed tar archive into the output directory.
  186. archive_path is the pathlib.Path to the archive to unpack
  187. output_dir is a pathlib.Path to the directory to unpack. It must already exist.
  188. relative_to is a pathlib.Path for directories that should be stripped relative to the
  189. root of the archive, or None if no path components should be stripped.
  190. extractors is a dictionary of PlatformEnum to a command or path to the
  191. extractor binary. Defaults to 'tar' for tar, and '_use_registry' for 7-Zip and WinRAR.
  192. """
  193. if extractors is None:
  194. extractors = DEFAULT_EXTRACTORS
  195. current_platform = get_running_platform()
  196. if current_platform == PlatformEnum.WINDOWS:
  197. # Try to use 7-zip first
  198. sevenzip_cmd = extractors.get(ExtractorEnum.SEVENZIP)
  199. if sevenzip_cmd == USE_REGISTRY:
  200. sevenzip_cmd = str(_find_7z_by_registry())
  201. sevenzip_bin = _find_extractor_by_cmd(sevenzip_cmd)
  202. if sevenzip_bin is not None:
  203. _extract_tar_with_7z(sevenzip_bin, archive_path, output_dir, relative_to, skip_unused,
  204. sysroot)
  205. return
  206. # Use WinRAR if 7-zip is not found
  207. winrar_cmd = extractors.get(ExtractorEnum.WINRAR)
  208. if winrar_cmd == USE_REGISTRY:
  209. winrar_cmd = str(_find_winrar_by_registry())
  210. winrar_bin = _find_extractor_by_cmd(winrar_cmd)
  211. if winrar_bin is not None:
  212. _extract_tar_with_winrar(winrar_bin, archive_path, output_dir, relative_to, skip_unused,
  213. sysroot)
  214. return
  215. get_logger().warning(
  216. 'Neither 7-zip nor WinRAR were found. Falling back to Python extractor...')
  217. elif current_platform == PlatformEnum.UNIX:
  218. # NOTE: 7-zip isn't an option because it doesn't preserve file permissions
  219. tar_bin = _find_extractor_by_cmd(extractors.get(ExtractorEnum.TAR))
  220. if not tar_bin is None:
  221. _extract_tar_with_tar(tar_bin, archive_path, output_dir, relative_to, skip_unused,
  222. sysroot)
  223. return
  224. else:
  225. # This is not a normal code path, so make it clear.
  226. raise NotImplementedError(current_platform)
  227. # Fallback to Python-based extractor on all platforms
  228. _extract_tar_with_python(archive_path, output_dir, relative_to, skip_unused, sysroot)
  229. def extract_with_7z(archive_path, output_dir, relative_to, skip_unused, sysroot, extractors=None):
  230. """
  231. Extract archives with 7-zip into the output directory.
  232. Only supports archives with one layer of unpacking, so compressed tar archives don't work.
  233. archive_path is the pathlib.Path to the archive to unpack
  234. output_dir is a pathlib.Path to the directory to unpack. It must already exist.
  235. relative_to is a pathlib.Path for directories that should be stripped relative to the
  236. root of the archive.
  237. extractors is a dictionary of PlatformEnum to a command or path to the
  238. extractor binary. Defaults to 'tar' for tar, and '_use_registry' for 7-Zip.
  239. """
  240. # TODO: It would be nice to extend this to support arbitrary standard IO chaining of 7z
  241. # instances, so _extract_tar_with_7z and other future formats could use this.
  242. if extractors is None:
  243. extractors = DEFAULT_EXTRACTORS
  244. sevenzip_cmd = extractors.get(ExtractorEnum.SEVENZIP)
  245. if sevenzip_cmd == USE_REGISTRY:
  246. if not get_running_platform() == PlatformEnum.WINDOWS:
  247. get_logger().error('"%s" for 7-zip is only available on Windows', sevenzip_cmd)
  248. raise Exception()
  249. sevenzip_cmd = str(_find_7z_by_registry())
  250. sevenzip_bin = _find_extractor_by_cmd(sevenzip_cmd)
  251. if not relative_to is None and (output_dir / relative_to).exists():
  252. get_logger().error('Temporary unpacking directory already exists: %s',
  253. output_dir / relative_to)
  254. raise Exception()
  255. cmd = (sevenzip_bin, 'x', str(archive_path), '-aoa', '-o{}'.format(str(output_dir)))
  256. if skip_unused:
  257. for cpath in CONTINGENT_PATHS:
  258. if sysroot and f'{sysroot}-sysroot' in cpath:
  259. continue
  260. cmd += ('-x!%s/%s' % (str(relative_to), cpath[:-1]), )
  261. get_logger().debug('7z command line: %s', ' '.join(cmd))
  262. result = subprocess.run(cmd, check=False)
  263. if result.returncode != 0:
  264. get_logger().error('7z command returned %s', result.returncode)
  265. raise Exception()
  266. _process_relative_to(output_dir, relative_to)
  267. def extract_with_winrar(archive_path,
  268. output_dir,
  269. relative_to,
  270. skip_unused,
  271. sysroot,
  272. extractors=None):
  273. """
  274. Extract archives with WinRAR into the output directory.
  275. Only supports archives with one layer of unpacking, so compressed tar archives don't work.
  276. archive_path is the pathlib.Path to the archive to unpack
  277. output_dir is a pathlib.Path to the directory to unpack. It must already exist.
  278. relative_to is a pathlib.Path for directories that should be stripped relative to the
  279. root of the archive.
  280. extractors is a dictionary of PlatformEnum to a command or path to the
  281. extractor binary. Defaults to 'tar' for tar, and '_use_registry' for WinRAR.
  282. """
  283. if extractors is None:
  284. extractors = DEFAULT_EXTRACTORS
  285. winrar_cmd = extractors.get(ExtractorEnum.WINRAR)
  286. if winrar_cmd == USE_REGISTRY:
  287. if not get_running_platform() == PlatformEnum.WINDOWS:
  288. get_logger().error('"%s" for WinRAR is only available on Windows', winrar_cmd)
  289. raise Exception()
  290. winrar_cmd = str(_find_winrar_by_registry())
  291. winrar_bin = _find_extractor_by_cmd(winrar_cmd)
  292. if not relative_to is None and (output_dir / relative_to).exists():
  293. get_logger().error('Temporary unpacking directory already exists: %s',
  294. output_dir / relative_to)
  295. raise Exception()
  296. cmd = (winrar_bin, 'x', '-o+', str(archive_path), str(output_dir))
  297. if skip_unused:
  298. for cpath in CONTINGENT_PATHS:
  299. if sysroot and f'{sysroot}-sysroot' in cpath:
  300. continue
  301. cmd += ('-x%s%s%s' % (str(relative_to), os.sep, cpath[:-1].replace('/', os.sep)), )
  302. get_logger().debug('WinRAR command line: %s', ' '.join(cmd))
  303. result = subprocess.run(cmd, check=False)
  304. if result.returncode != 0:
  305. get_logger().error('WinRAR command returned %s', result.returncode)
  306. raise Exception()
  307. _process_relative_to(output_dir, relative_to)