domain_substitution.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370
  1. #!/usr/bin/env python3
  2. # -*- coding: UTF-8 -*-
  3. # Copyright (c) 2019 The ungoogled-chromium Authors. All rights reserved.
  4. # Use of this source code is governed by a BSD-style license that can be
  5. # found in the LICENSE file.
  6. """
  7. Substitute domain names in the source tree with blockable strings.
  8. """
  9. from pathlib import Path
  10. import argparse
  11. import collections
  12. import contextlib
  13. import io
  14. import os
  15. import stat
  16. import re
  17. import tarfile
  18. import tempfile
  19. import zlib
  20. from _extraction import extract_tar_file
  21. from _common import ENCODING, get_logger, add_common_params
  22. # Encodings to try on source tree files
  23. TREE_ENCODINGS = ('UTF-8', 'ISO-8859-1')
  24. # Constants for domain substitution cache
  25. _INDEX_LIST = 'cache_index.list'
  26. _INDEX_HASH_DELIMITER = '|'
  27. _ORIG_DIR = 'orig'
  28. # Constants for timestamp manipulation
  29. # Delta between all file timestamps in nanoseconds
  30. _TIMESTAMP_DELTA = 1 * 10**9
  31. class DomainRegexList:
  32. """Representation of a domain_regex.list file"""
  33. _regex_pair_tuple = collections.namedtuple('DomainRegexPair', ('pattern', 'replacement'))
  34. # Constants for format:
  35. _PATTERN_REPLACE_DELIM = '#'
  36. def __init__(self, path):
  37. self._data = tuple(filter(len, path.read_text().splitlines()))
  38. # Cache of compiled regex pairs
  39. self._compiled_regex = None
  40. def _compile_regex(self, line):
  41. """Generates a regex pair tuple for the given line"""
  42. pattern, replacement = line.split(self._PATTERN_REPLACE_DELIM)
  43. return self._regex_pair_tuple(re.compile(pattern), replacement)
  44. @property
  45. def regex_pairs(self):
  46. """
  47. Returns a tuple of compiled regex pairs
  48. """
  49. if not self._compiled_regex:
  50. self._compiled_regex = tuple(map(self._compile_regex, self._data))
  51. return self._compiled_regex
  52. @property
  53. def search_regex(self):
  54. """
  55. Returns a single expression to search for domains
  56. """
  57. return re.compile('|'.join(
  58. map(lambda x: x.split(self._PATTERN_REPLACE_DELIM, 1)[0], self._data)))
  59. # Private Methods
  60. def _substitute_path(path, regex_iter):
  61. """
  62. Perform domain substitution on path and add it to the domain substitution cache.
  63. path is a pathlib.Path to the file to be domain substituted.
  64. regex_iter is an iterable of regular expression namedtuple like from
  65. config.DomainRegexList.regex_pairs()
  66. Returns a tuple of the CRC32 hash of the substituted raw content and the
  67. original raw content; None for both entries if no substitutions were made.
  68. Raises FileNotFoundError if path does not exist.
  69. Raises UnicodeDecodeError if path's contents cannot be decoded.
  70. """
  71. if not os.access(path, os.W_OK):
  72. # If the patch cannot be written to, it cannot be opened for updating
  73. print(str(path) + " cannot be opened for writing! Adding write permission...")
  74. path.chmod(path.stat().st_mode | stat.S_IWUSR)
  75. with path.open('r+b') as input_file:
  76. original_content = input_file.read()
  77. if not original_content:
  78. return (None, None)
  79. content = None
  80. encoding = None
  81. for encoding in TREE_ENCODINGS:
  82. try:
  83. content = original_content.decode(encoding)
  84. break
  85. except UnicodeDecodeError:
  86. continue
  87. if not content:
  88. raise UnicodeDecodeError('Unable to decode with any encoding: %s' % path)
  89. file_subs = 0
  90. for regex_pair in regex_iter:
  91. content, sub_count = regex_pair.pattern.subn(regex_pair.replacement, content)
  92. file_subs += sub_count
  93. if file_subs > 0:
  94. substituted_content = content.encode(encoding)
  95. input_file.seek(0)
  96. input_file.write(content.encode(encoding))
  97. input_file.truncate()
  98. return (zlib.crc32(substituted_content), original_content)
  99. return (None, None)
  100. def _validate_file_index(index_file, resolved_tree, cache_index_files):
  101. """
  102. Validation of file index and hashes against the source tree.
  103. Updates cache_index_files
  104. Returns True if the file index is valid; False otherwise
  105. """
  106. all_hashes_valid = True
  107. crc32_regex = re.compile(r'^[a-zA-Z0-9]{8}$')
  108. for entry in index_file.read().decode(ENCODING).splitlines():
  109. try:
  110. relative_path, file_hash = entry.split(_INDEX_HASH_DELIMITER)
  111. except ValueError as exc:
  112. get_logger().error('Could not split entry "%s": %s', entry, exc)
  113. continue
  114. if not relative_path or not file_hash:
  115. get_logger().error('Entry %s of domain substitution cache file index is not valid',
  116. _INDEX_HASH_DELIMITER.join((relative_path, file_hash)))
  117. all_hashes_valid = False
  118. continue
  119. if not crc32_regex.match(file_hash):
  120. get_logger().error('File index hash for %s does not appear to be a CRC32 hash',
  121. relative_path)
  122. all_hashes_valid = False
  123. continue
  124. if zlib.crc32((resolved_tree / relative_path).read_bytes()) != int(file_hash, 16):
  125. get_logger().error('Hashes do not match for: %s', relative_path)
  126. all_hashes_valid = False
  127. continue
  128. if relative_path in cache_index_files:
  129. get_logger().error('File %s shows up at least twice in the file index', relative_path)
  130. all_hashes_valid = False
  131. continue
  132. cache_index_files.add(relative_path)
  133. return all_hashes_valid
  134. @contextlib.contextmanager
  135. def _update_timestamp(path: os.PathLike, set_new: bool) -> None:
  136. """
  137. Context manager to set the timestamp of the path to plus or
  138. minus a fixed delta, regardless of modifications within the context.
  139. if set_new is True, the delta is added. Otherwise, the delta is subtracted.
  140. """
  141. stats = os.stat(path)
  142. if set_new:
  143. new_timestamp = (stats.st_atime_ns + _TIMESTAMP_DELTA, stats.st_mtime_ns + _TIMESTAMP_DELTA)
  144. else:
  145. new_timestamp = (stats.st_atime_ns - _TIMESTAMP_DELTA, stats.st_mtime_ns - _TIMESTAMP_DELTA)
  146. try:
  147. yield
  148. finally:
  149. os.utime(path, ns=new_timestamp)
  150. # Public Methods
  151. def apply_substitution(regex_path, files_path, source_tree, domainsub_cache):
  152. """
  153. Substitute domains in source_tree with files and substitutions,
  154. and save the pre-domain substitution archive to presubdom_archive.
  155. regex_path is a pathlib.Path to domain_regex.list
  156. files_path is a pathlib.Path to domain_substitution.list
  157. source_tree is a pathlib.Path to the source tree.
  158. domainsub_cache is a pathlib.Path to the domain substitution cache.
  159. Raises NotADirectoryError if the patches directory is not a directory or does not exist
  160. Raises FileNotFoundError if the source tree or required directory does not exist.
  161. Raises FileExistsError if the domain substitution cache already exists.
  162. Raises ValueError if an entry in the domain substitution list contains the file index
  163. hash delimiter.
  164. """
  165. if not source_tree.exists():
  166. raise FileNotFoundError(source_tree)
  167. if not regex_path.exists():
  168. raise FileNotFoundError(regex_path)
  169. if not files_path.exists():
  170. raise FileNotFoundError(files_path)
  171. if domainsub_cache and domainsub_cache.exists():
  172. raise FileExistsError(domainsub_cache)
  173. resolved_tree = source_tree.resolve()
  174. regex_pairs = DomainRegexList(regex_path).regex_pairs
  175. fileindex_content = io.BytesIO()
  176. with tarfile.open(
  177. str(domainsub_cache), 'w:%s' % domainsub_cache.suffix[1:],
  178. compresslevel=1) if domainsub_cache else open(os.devnull, 'w') as cache_tar:
  179. for relative_path in filter(len, files_path.read_text().splitlines()):
  180. if _INDEX_HASH_DELIMITER in relative_path:
  181. if domainsub_cache:
  182. # Cache tar will be incomplete; remove it for convenience
  183. cache_tar.close()
  184. domainsub_cache.unlink()
  185. raise ValueError(
  186. 'Path "%s" contains the file index hash delimiter "%s"' % relative_path,
  187. _INDEX_HASH_DELIMITER)
  188. path = resolved_tree / relative_path
  189. if not path.exists():
  190. get_logger().warning('Skipping non-existant path: %s', path)
  191. continue
  192. if path.is_symlink():
  193. get_logger().warning('Skipping path that has become a symlink: %s', path)
  194. continue
  195. with _update_timestamp(path, set_new=True):
  196. crc32_hash, orig_content = _substitute_path(path, regex_pairs)
  197. if crc32_hash is None:
  198. get_logger().info('Path has no substitutions: %s', relative_path)
  199. continue
  200. if domainsub_cache:
  201. fileindex_content.write('{}{}{:08x}\n'.format(relative_path, _INDEX_HASH_DELIMITER,
  202. crc32_hash).encode(ENCODING))
  203. orig_tarinfo = tarfile.TarInfo(str(Path(_ORIG_DIR) / relative_path))
  204. orig_tarinfo.size = len(orig_content)
  205. with io.BytesIO(orig_content) as orig_file:
  206. cache_tar.addfile(orig_tarinfo, orig_file)
  207. if domainsub_cache:
  208. fileindex_tarinfo = tarfile.TarInfo(_INDEX_LIST)
  209. fileindex_tarinfo.size = fileindex_content.tell()
  210. fileindex_content.seek(0)
  211. cache_tar.addfile(fileindex_tarinfo, fileindex_content)
  212. def revert_substitution(domainsub_cache, source_tree):
  213. """
  214. Revert domain substitution on source_tree using the pre-domain
  215. substitution archive presubdom_archive.
  216. It first checks if the hashes of the substituted files match the hashes
  217. computed during the creation of the domain substitution cache, raising
  218. KeyError if there are any mismatches. Then, it proceeds to
  219. reverting files in the source_tree.
  220. domainsub_cache is removed only if all the files from the domain substitution cache
  221. were relocated to the source tree.
  222. domainsub_cache is a pathlib.Path to the domain substitution cache.
  223. source_tree is a pathlib.Path to the source tree.
  224. Raises KeyError if:
  225. * There is a hash mismatch while validating the cache
  226. * The cache's file index is corrupt or missing
  227. * The cache is corrupt or is not consistent with the file index
  228. Raises FileNotFoundError if the source tree or domain substitution cache do not exist.
  229. """
  230. # This implementation trades disk space/wear for performance (unless a ramdisk is used
  231. # for the source tree)
  232. # Assumptions made for this process:
  233. # * The correct tar file was provided (so no huge amount of space is wasted)
  234. # * The tar file is well-behaved (e.g. no files extracted outside of destination path)
  235. # * Cache file index and cache contents are already consistent (i.e. no files exclusive to
  236. # one or the other)
  237. if not domainsub_cache:
  238. get_logger().error('Cache file must be specified.')
  239. if not domainsub_cache.exists():
  240. raise FileNotFoundError(domainsub_cache)
  241. if not source_tree.exists():
  242. raise FileNotFoundError(source_tree)
  243. resolved_tree = source_tree.resolve()
  244. cache_index_files = set() # All files in the file index
  245. with tempfile.TemporaryDirectory(
  246. prefix='domsubcache_files', dir=str(resolved_tree)) as tmp_extract_name:
  247. extract_path = Path(tmp_extract_name)
  248. get_logger().debug('Extracting domain substitution cache...')
  249. extract_tar_file(domainsub_cache, extract_path, None)
  250. # Validate source tree file hashes match
  251. get_logger().debug('Validating substituted files in source tree...')
  252. with (extract_path / _INDEX_LIST).open('rb') as index_file: #pylint: disable=no-member
  253. if not _validate_file_index(index_file, resolved_tree, cache_index_files):
  254. raise KeyError('Domain substitution cache file index is corrupt or hashes mismatch '
  255. 'the source tree.')
  256. # Move original files over substituted ones
  257. get_logger().debug('Moving original files over substituted ones...')
  258. for relative_path in cache_index_files:
  259. with _update_timestamp(resolved_tree / relative_path, set_new=False):
  260. (extract_path / _ORIG_DIR / relative_path).replace(resolved_tree / relative_path)
  261. # Quick check for unused files in cache
  262. orig_has_unused = False
  263. for orig_path in (extract_path / _ORIG_DIR).rglob('*'): #pylint: disable=no-member
  264. if orig_path.is_file():
  265. get_logger().warning('Unused file from cache: %s', orig_path)
  266. orig_has_unused = True
  267. if orig_has_unused:
  268. get_logger().warning('Cache contains unused files. Not removing.')
  269. else:
  270. domainsub_cache.unlink()
  271. def _callback(args):
  272. """CLI Callback"""
  273. if args.reverting:
  274. revert_substitution(args.cache, args.directory)
  275. else:
  276. apply_substitution(args.regex, args.files, args.directory, args.cache)
  277. def main():
  278. """CLI Entrypoint"""
  279. parser = argparse.ArgumentParser()
  280. add_common_params(parser)
  281. parser.set_defaults(callback=_callback)
  282. subparsers = parser.add_subparsers(title='', dest='packaging')
  283. # apply
  284. apply_parser = subparsers.add_parser(
  285. 'apply',
  286. help='Apply domain substitution',
  287. description='Applies domain substitution and creates the domain substitution cache.')
  288. apply_parser.add_argument(
  289. '-r', '--regex', type=Path, required=True, help='Path to domain_regex.list')
  290. apply_parser.add_argument(
  291. '-f', '--files', type=Path, required=True, help='Path to domain_substitution.list')
  292. apply_parser.add_argument(
  293. '-c',
  294. '--cache',
  295. type=Path,
  296. help='The path to the domain substitution cache. The path must not already exist.')
  297. apply_parser.add_argument(
  298. 'directory', type=Path, help='The directory to apply domain substitution')
  299. apply_parser.set_defaults(reverting=False)
  300. # revert
  301. revert_parser = subparsers.add_parser(
  302. 'revert',
  303. help='Revert domain substitution',
  304. description='Reverts domain substitution based only on the domain substitution cache.')
  305. revert_parser.add_argument(
  306. 'directory', type=Path, help='The directory to reverse domain substitution')
  307. revert_parser.add_argument(
  308. '-c',
  309. '--cache',
  310. type=Path,
  311. required=True,
  312. help=('The path to the domain substitution cache. '
  313. 'The path must exist and will be removed if successful.'))
  314. revert_parser.set_defaults(reverting=True)
  315. args = parser.parse_args()
  316. args.callback(args)
  317. if __name__ == '__main__':
  318. main()