  #!/usr/bin/env python
  #
  # Copyright (c) 2018 Yousong Zhou <yszhou4tech@gmail.com>
  #
  # This is free software, licensed under the GNU General Public License v2.
  # See /LICENSE for more information.
  import argparse
  import calendar
  import datetime
  import errno
  import fcntl
  import hashlib
  import json
  import os
  import os.path
  import re
  import shutil
  import ssl
  import subprocess
  import sys
  import time

  try:
      import urllib2
  except ImportError:
      # Python 3 moved Request/urlopen into urllib.request; keep the old name
      # so the rest of the file can keep calling urllib2.Request/urlopen.
      import urllib.request as urllib2
  # Scratch area for downloads: $TMP_DIR when set and non-empty, else /tmp.
  TMPDIR = os.environ.get('TMP_DIR') or '/tmp'
  # Sub-directory of TMPDIR holding in-flight tarballs and the timestamp cache.
  TMPDIR_DL = os.path.join(TMPDIR, 'dl')
  class PathException(Exception):
      """Raised when a filesystem or tarball helper cannot do its job."""
  class DownloadGitHubError(Exception):
      """Raised when downloading or verifying a GitHub archive fails."""
  class Path(object):
      """Context class for preparing and cleaning up directories.

      If ``preclean`` is ``True``, ``path`` is removed on context enter; if it
      is ``False``, ``path`` will NOT be removed on context enter.
      If ``isdir`` is ``True``, ``path`` is created as a directory on context
      enter.
      If ``keep`` is ``True``, ``path`` will NOT be removed on context exit.
      """

      def __init__(self, path, isdir=True, preclean=False, keep=False):
          self.path = path
          self.isdir = isdir
          self.preclean = preclean
          self.keep = keep

      def __enter__(self):
          if self.preclean:
              self.rm_all(self.path)
          if self.isdir:
              self.mkdir_all(self.path)
          return self

      def __exit__(self, exc_type, exc_value, traceback):
          # Implicitly returns None, so an exception raised inside the ``with``
          # body keeps propagating after the cleanup.
          if not self.keep:
              self.rm_all(self.path)

      @staticmethod
      def mkdir_all(path):
          """Same as mkdir -p.

          Every missing ancestor is created. An already existing ``path`` is
          not an error.
          """
          # os.path.split() only yields (head, tail), so walking its result
          # cannot create more than one missing level; os.makedirs() can.
          Path._os_func(os.makedirs, path, errno.EEXIST)

      @staticmethod
      def _rmdir_dir(dir_):
          """Remove directory ``dir_`` together with everything below it."""
          for name in Path._listdir(dir_):
              Path.rm_all(os.path.join(dir_, name))
          Path._rmdir(dir_)

      @staticmethod
      def _mkdir(path):
          """Create a single directory, ignoring "already exists"."""
          Path._os_func(os.mkdir, path, errno.EEXIST)

      @staticmethod
      def _rmdir(path):
          """Remove an empty directory, ignoring "no such file"."""
          Path._os_func(os.rmdir, path, errno.ENOENT)

      @staticmethod
      def _remove(path):
          """Remove a file or symlink, ignoring "no such file"."""
          Path._os_func(os.remove, path, errno.ENOENT)

      @staticmethod
      def _listdir(path):
          """List ``path``; a missing directory yields an empty list."""
          return Path._os_func(os.listdir, path, errno.ENOENT, default=[])

      @staticmethod
      def _os_func(func, path, errno, default=None):
          """Call func(path) in an idempotent way.

          On exception ``ex``, if the type is OSError and ``ex.errno == errno``,
          return ``default``, otherwise, re-raise.

          Note that the ``errno`` parameter shadows the ``errno`` module inside
          this function only.
          """
          try:
              return func(path)
          except OSError as e:
              if e.errno != errno:
                  raise
              return default

      @staticmethod
      def rm_all(path):
          """Same as rm -r."""
          # Test for a symlink first so that a link to a directory is unlinked
          # instead of having its target emptied.
          if not os.path.islink(path) and os.path.isdir(path):
              Path._rmdir_dir(path)
          else:
              Path._remove(path)

      @staticmethod
      def untar(path, into=None):
          """Extract gzip'ed tarball at ``path`` into directory ``into``.

          Return the name of the single entry found in ``into`` afterwards.
          Raise PathException if ``into`` is not given or if ``into`` does not
          contain exactly one entry after extraction.
          """
          if into is None:
              raise PathException('untar %s: target directory is required' % path)
          args = ('tar', '-C', into, '-xzf', path, '--no-same-permissions')
          # Run tar with a fixed umask in the child process only.
          subprocess.check_call(args, preexec_fn=lambda: os.umask(0o22))
          dirs = os.listdir(into)
          if len(dirs) != 1:
              raise PathException('untar %s: expecting a single subdir, got %s' % (path, dirs))
          return dirs[0]

      @staticmethod
      def tar(path, subdir, into=None, ts=None):
          """Pack ``subdir`` found under ``path`` into tarball ``into``.

          The compression is chosen from the suffix of ``into`` (``.xz``,
          ``.bz2`` or ``.gz``). If ``ts`` is given it is passed to tar as the
          mtime (seconds since the epoch) of the archive members.
          Raise PathException if ``into`` is missing or has an unknown suffix.
          """
          if into is None:
              raise PathException('tar %s: output tarball name is required' % path)
          # --sort=name requires a recent build of GNU tar
          args = ['tar', '--numeric-owner', '--owner=0', '--group=0', '--sort=name']
          args += ['-C', path, '-cf', into, subdir]
          envs = os.environ.copy()
          if ts is not None:
              args.append('--mtime=@%d' % ts)
          if into.endswith('.xz'):
              envs['XZ_OPT'] = '-7e'
              args.append('-J')
          elif into.endswith('.bz2'):
              args.append('-j')
          elif into.endswith('.gz'):
              args.append('-z')
              # -n: keep original name and timestamp out of the gzip header
              envs['GZIP'] = '-n'
          else:
              raise PathException('unknown compression type %s' % into)
          subprocess.check_call(args, env=envs)
  class GitHubCommitTsCache(object):
      """File-backed cache of commit timestamps shared between processes.

      The cache file holds one ``<key> <timestamp> <updated>`` entry per line,
      separated by whitespace. Readers take a shared lock and writers an
      exclusive lock on the file via ``fcntl.lockf``. At most ``__cachen``
      most recently updated entries are kept on every write.
      """

      __cachef = 'github.commit.ts.cache'
      __cachen = 2048

      def __init__(self):
          Path.mkdir_all(TMPDIR_DL)
          self.cachef = os.path.join(TMPDIR_DL, self.__cachef)
          self.cache = {}

      def get(self, k):
          """Get timestamp with key ``k``.

          Return ``None`` when ``k`` is not in the cache.
          """
          # O_CREAT so that a first-time lookup does not fail on a missing file.
          fileno = os.open(self.cachef, os.O_RDONLY | os.O_CREAT)
          with os.fdopen(fileno) as fin:
              try:
                  fcntl.lockf(fileno, fcntl.LOCK_SH)
                  self._cache_init(fin)
                  if k in self.cache:
                      return self.cache[k][0]
              finally:
                  fcntl.lockf(fileno, fcntl.LOCK_UN)
          return None

      def set(self, k, v):
          """Update timestamp with ``k``.

          The file is re-read under an exclusive lock before being rewritten,
          so entries stored by other processes are merged in.
          """
          fileno = os.open(self.cachef, os.O_RDWR | os.O_CREAT)
          with os.fdopen(fileno, 'w+') as f:
              try:
                  fcntl.lockf(fileno, fcntl.LOCK_EX)
                  self._cache_init(f)
                  self.cache[k] = (v, int(time.time()))
                  self._cache_flush(f)
              finally:
                  fcntl.lockf(fileno, fcntl.LOCK_UN)

      def _cache_init(self, fin):
          """Load entries from open file ``fin`` into ``self.cache``.

          Lines that do not consist of exactly three fields with integer
          timestamps are skipped, so that a damaged cache file degrades to a
          cache miss instead of an error.
          """
          for line in fin:
              fields = line.split()
              if len(fields) != 3:
                  continue
              k, ts, updated = fields
              try:
                  entry = (int(ts), int(updated))
              except ValueError:
                  continue
              self.cache[k] = entry

      def _cache_flush(self, fout):
          """Rewrite open file ``fout`` with the newest entries.

          Entries are ordered by their ``updated`` time, newest first, and
          truncated to ``__cachen``. The in-memory cache is emptied afterwards.
          """
          # key=/reverse= works on both Python 2 and 3, unlike cmp= and
          # dict.iteritems().
          entries = sorted(self.cache.items(), key=lambda kv: kv[1][1], reverse=True)
          entries = entries[:self.__cachen]
          self.cache = {}
          os.ftruncate(fout.fileno(), 0)
          fout.seek(0, os.SEEK_SET)
          for k, (ts, updated) in entries:
              fout.write('{0} {1} {2}\n'.format(k, ts, updated))
  class DownloadGitHubTarball(object):
      """Download and repack archive tarball from GitHub.

      Compared with the method of packing after cloning the whole repo, this
      method is more friendly to users with fragile internet connection.

      However, there are limitations with this method

       - GitHub imposes a 60 reqs/hour limit for unauthenticated API access.
         This affects fetching commit date for reproducible tarballs. Download
         through the archive link is not affected.
       - GitHub archives do not contain source codes for submodules.
       - GitHub archives seem to respect .gitattributes and ignore paths with
         export-ignore attributes.

      For the first two issues, the method will fail loudly to allow fallback to
      clone-then-pack method.

      As for the 3rd issue, to make sure that this method only produces identical
      tarballs as the fallback method, we require the expected hash value to be
      supplied. That means the first tarball will need to be prepared by the
      clone-then-pack method
      """

      __repo_url_regex = re.compile(r'^(?:https|git)://github\.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)')

      def __init__(self, args):
          """Set up the download from parsed command line ``args``.

          ``args`` must provide ``dl_dir``, ``version``, ``subdir``, ``source``,
          ``url`` and ``hash`` attributes. Raise DownloadGitHubError when the
          URL is not a GitHub one or the hash has an unsupported length.
          """
          self.dl_dir = args.dl_dir
          self.version = args.version
          self.subdir = args.subdir
          self.source = args.source
          self.url = args.url
          self._init_owner_repo()
          self.xhash = args.hash
          self._init_hasher()
          self.commit_ts = None  # lazy load commit timestamp
          self.commit_ts_cache = GitHubCommitTsCache()
          self.name = 'github-tarball'

      def download(self):
          """Download and repack GitHub archive tarball."""
          self._init_commit_ts()
          with Path(TMPDIR_DL, keep=True) as dir_dl:
              # fetch tarball from GitHub
              tarball_path = os.path.join(dir_dl.path, self.subdir + '.tar.gz.dl')
              with Path(tarball_path, isdir=False):
                  self._fetch(tarball_path)
                  # unpack
                  d = os.path.join(dir_dl.path, self.subdir + '.untar')
                  with Path(d, preclean=True) as dir_untar:
                      tarball_prefix = Path.untar(tarball_path, into=dir_untar.path)
                      dir0 = os.path.join(dir_untar.path, tarball_prefix)
                      dir1 = os.path.join(dir_untar.path, self.subdir)
                      # submodules check
                      if self._has_submodule(dir0):
                          raise self._error('Fetching submodules is not yet supported')
                      # rename subdir
                      os.rename(dir0, dir1)
                      # repack
                      into = os.path.join(TMPDIR_DL, self.source)
                      Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts)
                      try:
                          self._hash_check(into)
                      except Exception:
                          # do not leave an unverified tarball behind
                          Path.rm_all(into)
                          raise
                      # move to target location
                      file1 = os.path.join(self.dl_dir, self.source)
                      if into != file1:
                          shutil.move(into, file1)

      def _has_submodule(self, dir_):
          """Tell whether the tree at ``dir_`` declares git submodules.

          A non-empty ``.gitmodules`` counts as "yes". So does any stat error
          other than "no such file", which errs on the side of falling back.
          """
          m = os.path.join(dir_, '.gitmodules')
          try:
              st = os.stat(m)
              return st.st_size > 0
          except OSError as e:
              return e.errno != errno.ENOENT

      def _init_owner_repo(self):
          """Derive ``self.owner`` and ``self.repo`` from ``self.url``."""
          m = self.__repo_url_regex.search(self.url)
          if m is None:
              raise self._error('Invalid github url: {}'.format(self.url))
          owner = m.group('owner')
          repo = m.group('repo')
          if repo.endswith('.git'):
              repo = repo[:-4]
          self.owner = owner
          self.repo = repo

      def _init_hasher(self):
          """Pick the hash algorithm from the length of the expected hash.

          64 hex digits select sha256 and 32 select md5. Anything else,
          including a missing hash, raises DownloadGitHubError.
          """
          # hexdigest() is lower case, so normalise the expected value too.
          xhash = (self.xhash or '').lower()
          if len(xhash) == 64:
              self.hasher = hashlib.sha256()
          elif len(xhash) == 32:
              self.hasher = hashlib.md5()
          else:
              raise self._error('Requires sha256sum for verification')
          self.xhash = xhash

      def _hash_check(self, f):
          """Raise DownloadGitHubError unless file ``f`` has the expected hash."""
          # Work on a copy so that self.hasher stays pristine and the check can
          # be repeated.
          hasher = self.hasher.copy()
          with open(f, 'rb') as fin:
              while True:
                  d = fin.read(4096)
                  if not d:
                      break
                  hasher.update(d)
          xhash = hasher.hexdigest()
          if xhash != self.xhash:
              raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash))

      def _init_commit_ts(self):
          """Set ``self.commit_ts`` from the cache or from the GitHub API.

          Raise DownloadGitHubError when no API yields a timestamp.
          """
          if self.commit_ts is not None:
              return
          # GitHub provides 2 APIs for fetching commit data. The "Git Data" one
          # (git/commits) is more terse while the "Repositories" one (commits)
          # provides more verbose info such as commit diff etc. That's the main
          # reason why the former is preferred: the response size is
          # predictable.
          #
          # However, git/commits only accepts complete commit sha1sum as the
          # parameter while commits is more liberal accepting also partial
          # commit id and tags, etc.
          #
          # Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit
          # Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit
          apis = [
              {
                  'url': self._make_repo_url_path('git', 'commits', self.version),
                  'attr_path': ('committer', 'date'),
              }, {
                  'url': self._make_repo_url_path('commits', self.version),
                  'attr_path': ('commit', 'committer', 'date'),
              },
          ]
          version_is_sha1sum = len(self.version) == 40
          if not version_is_sha1sum:
              # try the liberal API first for tags and abbreviated ids
              apis.insert(0, apis.pop())
          last_error = None
          for api in apis:
              url = api['url']
              try:
                  ct = self.commit_ts_cache.get(url)
              except Exception:
                  # an unreadable cache must not prevent asking GitHub
                  ct = None
              if ct is None:
                  try:
                      ct = self._init_commit_ts_remote_get(url, api['attr_path'])
                  except Exception as ex:
                      # remember why and fall through to the next API
                      last_error = ex
                      continue
                  try:
                      self.commit_ts_cache.set(url, ct)
                  except Exception:
                      # caching is best effort; the timestamp itself is valid
                      pass
              self.commit_ts = ct
              return
          msg = 'Cannot fetch commit ts: {}'.format(url)
          if last_error is not None:
              msg += ' ({})'.format(last_error)
          raise self._error(msg)

      def _init_commit_ts_remote_get(self, url, attrpath):
          """Fetch JSON from API path ``url`` and return the commit time.

          ``attrpath`` is the sequence of keys leading to the ISO 8601 UTC
          date string in the response. Return seconds since the epoch.
          """
          resp = self._make_request(url)
          try:
              data = resp.read()
          finally:
              resp.close()
          date = json.loads(data.decode('utf-8'))
          for attr in attrpath:
              date = date[attr]
          date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
          date = date.timetuple()
          ct = calendar.timegm(date)
          return ct

      def _fetch(self, path):
          """Fetch tarball of the specified version ref into file ``path``."""
          ref = self.version
          url = self._make_repo_url_path('tarball', ref)
          resp = self._make_request(url)
          try:
              with open(path, 'wb') as fout:
                  while True:
                      d = resp.read(4096)
                      if not d:
                          break
                      fout.write(d)
          finally:
              resp.close()

      def _make_repo_url_path(self, *args):
          """Build ``/repos/<owner>/<repo>[/<args joined by />]``."""
          url = '/repos/{0}/{1}'.format(self.owner, self.repo)
          if args:
              url += '/' + '/'.join(args)
          return url

      def _make_request(self, path):
          """Request GitHub API endpoint on ``path``."""
          url = 'https://api.github.com' + path
          headers = {
              'Accept': 'application/vnd.github.v3+json',
              'User-Agent': 'libreCMC',
          }
          req = urllib2.Request(url, headers=headers)
          # NOTE(review): certificate verification is switched off here. The
          # repacked tarball is still checked against the expected hash in
          # download(), but the commit timestamp response is not authenticated.
          # Confirm this is intended before relying on it elsewhere.
          sslcontext = ssl._create_unverified_context()
          fileobj = urllib2.urlopen(req, context=sslcontext)
          return fileobj

      def _error(self, msg):
          """Return (not raise) a DownloadGitHubError prefixed with the source name."""
          return DownloadGitHubError('{}: {}'.format(self.source, msg))
  def main():
      """Parse the command line and run the GitHub tarball download.

      On any failure the source name, the URL and the error are written to
      stderr and the process exits with status 1.
      """
      parser = argparse.ArgumentParser()
      parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir')
      parser.add_argument('--url', help='Download URL')
      parser.add_argument('--subdir', help='Source code subdir name')
      parser.add_argument('--version', help='Source code version')
      parser.add_argument('--source', help='Source tarball filename')
      parser.add_argument('--hash', help='Source tarball\'s expected sha256sum')
      args = parser.parse_args()
      try:
          method = DownloadGitHubTarball(args)
          method.download()
      except Exception as ex:
          # Top-level boundary: report and turn every failure into exit code 1.
          sys.stderr.write('{}: Download from {} failed\n'.format(args.source, args.url))
          sys.stderr.write('{}\n'.format(ex))
          sys.exit(1)
  # Run the downloader only when executed as a script, not when imported.
  if __name__ == '__main__':
      main()