update_lists.py

#!/usr/bin/env python3
# Copyright (c) 2019 The ungoogled-chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
  5. """
  6. Update binary pruning and domain substitution lists automatically.
  7. It will download and unpack into the source tree as necessary.
  8. No binary pruning or domain substitution will be applied to the source tree after
  9. the process has finished.
  10. """

import argparse
import os
import sys
from itertools import repeat
from multiprocessing import Pool
from pathlib import Path, PurePosixPath

sys.path.insert(0, str(Path(__file__).resolve().parent.parent / 'utils'))
from _common import get_logger
from domain_substitution import DomainRegexList, TREE_ENCODINGS
from prune_binaries import CONTINGENT_PATHS
sys.path.pop(0)

# Encoding for output files
_ENCODING = 'UTF-8'

# NOTE: Include patterns have precedence over exclude patterns
# pathlib.Path.match() paths to include in binary pruning
PRUNING_INCLUDE_PATTERNS = [
    'components/domain_reliability/baked_in_configs/*',
    # Removals for patches/core/ungoogled-chromium/remove-unused-preferences-fields.patch
    'components/safe_browsing/core/common/safe_browsing_prefs.cc',
    'components/safe_browsing/core/common/safe_browsing_prefs.h',
    'components/signin/public/base/signin_pref_names.cc',
    'components/signin/public/base/signin_pref_names.h',
]

# pathlib.Path.match() paths to exclude from binary pruning
PRUNING_EXCLUDE_PATTERNS = [
    'chrome/common/win/eventlog_messages.mc',  # TODO: False positive textfile
    # Exclusions for DOM distiller (contains model data only)
    'components/dom_distiller/core/data/distillable_page_model_new.bin',
    'components/dom_distiller/core/data/long_page_model.bin',
    # Exclusions for GeoLanguage data
    # Details: https://docs.google.com/document/d/18WqVHz5F9vaUiE32E8Ge6QHmku2QSJKvlqB9JjnIM-g/edit
    # Introduced with: https://chromium.googlesource.com/chromium/src/+/6647da61
    'components/language/content/browser/ulp_language_code_locator/geolanguage-data_rank0.bin',
    'components/language/content/browser/ulp_language_code_locator/geolanguage-data_rank1.bin',
    'components/language/content/browser/ulp_language_code_locator/geolanguage-data_rank2.bin',
    # Exclusion for required prebuilt object for Windows arm64 builds
    'third_party/crashpad/crashpad/util/misc/capture_context_win_arm64.obj',
    'third_party/icu/common/icudtl.dat',  # Exclusion for ICU data
    # Exclusion for Android
    'build/android/chromium-debug.keystore',
    'third_party/icu/android/icudtl.dat',
    'third_party/icu/common/icudtb.dat',
    # Exclusion for rollup v4.0+
    'third_party/node/node_modules/@rollup/wasm-node/dist/wasm-node/bindings_wasm_bg.wasm',
    # Exclusion for performance tracing
    'third_party/perfetto/src/trace_processor/importers/proto/atoms.descriptor',
    # Exclusions for safe file extensions
    '*.avif',
    '*.ttf',
    '*.png',
    '*.jpg',
    '*.webp',
    '*.gif',
    '*.ico',
    '*.mp3',
    '*.wav',
    '*.flac',
    '*.icns',
    '*.woff',
    '*.woff2',
    '*makefile',
    '*.profdata',
    '*.xcf',
    '*.cur',
    '*.pdf',
    '*.ai',
    '*.h',
    '*.c',
    '*.cpp',
    '*.cc',
    '*.mk',
    '*.bmp',
    '*.py',
    '*.xml',
    '*.html',
    '*.js',
    '*.json',
    '*.txt',
    '*.xtb'
]
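# Note on matching (illustrative, hypothetical path): pathlib's match() compares from the
# right, so a bare extension pattern matches at any depth, and should_prune() below applies
# the exclude patterns to the lower-cased path:
#   Path('chrome/app/theme/default_100_percent/Logo.PNG'.lower()).match('*.png')  # -> True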

# NOTE: Domain substitution path prefix exclusion has precedence over inclusion patterns
# Paths to exclude by prefixes of the POSIX representation for domain substitution
DOMAIN_EXCLUDE_PREFIXES = [
    'components/test/',
    'net/http/transport_security_state_static.json',
    'net/http/transport_security_state_static_pins.json',
    # Exclusions for Visual Studio Project generation with GN (PR #445)
    'tools/gn/',
    # Exclusions for files covered with other patches/unnecessary
    'components/search_engines/prepopulated_engines.json',
    'third_party/blink/renderer/core/dom/document.cc',
]

# pathlib.Path.match() patterns to include in domain substitution
DOMAIN_INCLUDE_PATTERNS = [
    '*.h', '*.hh', '*.hpp', '*.hxx', '*.cc', '*.cpp', '*.cxx', '*.c', '*.json', '*.js',
    '*.html', '*.htm', '*.css', '*.py*', '*.grd*', '*.sql', '*.idl', '*.mk', '*.gyp*', 'makefile',
    '*.ts', '*.txt', '*.xml', '*.mm', '*.jinja*', '*.gn', '*.gni'
]
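# Illustrative checks (hypothetical paths) mirroring should_domain_substitute() below: a file
# must match an include pattern and must not start with an exclude prefix:
#   PurePosixPath('components/search/search.cc').match('*.cc')        # -> True (candidate)
#   'components/test/data/example.cc'.startswith('components/test/')  # -> True (excluded)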

# Binary-detection constant
_TEXTCHARS = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7f})


class UnusedPatterns:  #pylint: disable=too-few-public-methods
    """Tracks unused prefixes and patterns"""

    _all_names = ('pruning_include_patterns', 'pruning_exclude_patterns',
                  'domain_include_patterns', 'domain_exclude_prefixes')

    def __init__(self):
        # Initialize all tracked patterns and prefixes in sets
        # Users will discard elements that are used
        for name in self._all_names:
            setattr(self, name, set(globals()[name.upper()]))

    def log_unused(self, error=True):
        """
        Logs unused patterns and prefixes
        Returns True if there are unused patterns or prefixes; False otherwise
        """
        have_unused = False
        log = get_logger().error if error else get_logger().info
        for name in self._all_names:
            current_set = getattr(self, name, None)
            if current_set:
                log('Unused from %s: %s', name.upper(), current_set)
                have_unused = True
        return have_unused


def _is_binary(bytes_data):
    """
    Returns True if the data seems to be binary data (i.e. not human readable); False otherwise
    """
    # From: https://stackoverflow.com/a/7392391
    return bool(bytes_data.translate(None, _TEXTCHARS))
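# Illustrative behavior (not part of the original logic): any byte outside _TEXTCHARS marks
# the data as binary:
#   _is_binary(b'Hello, world!\n')  # -> False
#   _is_binary(b'\x00\x01\x02')     # -> True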


def _dir_empty(path):
    """
    Returns True if the directory is empty; False otherwise
    path is a pathlib.Path or string to a directory to test.
    """
    try:
        next(os.scandir(str(path)))
    except StopIteration:
        return True
    return False


def should_prune(path, relative_path, used_pep_set, used_pip_set):
    """
    Returns True if a path should be pruned from the source tree; False otherwise
    path is the pathlib.Path to the file from the current working directory.
    relative_path is the pathlib.Path to the file from the source tree
    used_pep_set is a set of PRUNING_EXCLUDE_PATTERNS that have been matched
    used_pip_set is a set of PRUNING_INCLUDE_PATTERNS that have been matched
    """
    # Match against include patterns
    for pattern in filter(relative_path.match, PRUNING_INCLUDE_PATTERNS):
        used_pip_set.add(pattern)
        return True
    # Match against exclude patterns
    for pattern in filter(Path(str(relative_path).lower()).match, PRUNING_EXCLUDE_PATTERNS):
        used_pep_set.add(pattern)
        return False
    # Do binary data detection
    with path.open('rb') as file_obj:
        if _is_binary(file_obj.read()):
            return True
    # Passed all filtering; do not prune
    return False
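# For instance (illustrative, hypothetical file): a baked-in config matches both an include
# pattern and the '*.json' exclude pattern, but is still pruned because include patterns are
# checked first:
#   should_prune(Path('build/src/components/domain_reliability/baked_in_configs/foo.json'),
#                Path('components/domain_reliability/baked_in_configs/foo.json'),
#                set(), set())  # -> True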


def _check_regex_match(file_path, search_regex):
    """
    Returns True if a regex pattern matches a file; False otherwise
    file_path is a pathlib.Path to the file to test
    search_regex is a compiled regex object to search for domain names
    """
    with file_path.open("rb") as file_obj:
        file_bytes = file_obj.read()
    content = None
    for encoding in TREE_ENCODINGS:
        try:
            content = file_bytes.decode(encoding)
            break
        except UnicodeDecodeError:
            continue
    if search_regex.search(content) is not None:
        return True
    return False


def should_domain_substitute(path, relative_path, search_regex, used_dep_set, used_dip_set):
    """
    Returns True if a path should be domain substituted in the source tree; False otherwise
    path is the pathlib.Path to the file from the current working directory.
    relative_path is the pathlib.Path to the file from the source tree.
    search_regex is a compiled regex object to search for domain names
    used_dep_set is a set of DOMAIN_EXCLUDE_PREFIXES that have been matched
    used_dip_set is a set of DOMAIN_INCLUDE_PATTERNS that have been matched
    """
    relative_path_posix = relative_path.as_posix().lower()
    for include_pattern in DOMAIN_INCLUDE_PATTERNS:
        if PurePosixPath(relative_path_posix).match(include_pattern):
            used_dip_set.add(include_pattern)
            for exclude_prefix in DOMAIN_EXCLUDE_PREFIXES:
                if relative_path_posix.startswith(exclude_prefix):
                    used_dep_set.add(exclude_prefix)
                    return False
            return _check_regex_match(path, search_regex)
    return False


def compute_lists_proc(path, source_tree, search_regex):
    """
    Adds the path to appropriate lists to be used by compute_lists.
    path is the pathlib.Path to the file from the current working directory.
    source_tree is a pathlib.Path to the source tree
    search_regex is a compiled regex object to search for domain names
    """
    used_pep_set = set()  # PRUNING_EXCLUDE_PATTERNS
    used_pip_set = set()  # PRUNING_INCLUDE_PATTERNS
    used_dep_set = set()  # DOMAIN_EXCLUDE_PREFIXES
    used_dip_set = set()  # DOMAIN_INCLUDE_PATTERNS
    pruning_set = set()
    domain_substitution_set = set()
    symlink_set = set()
    if path.is_file():
        relative_path = path.relative_to(source_tree)
        if not any(cpath in str(relative_path.as_posix()) for cpath in CONTINGENT_PATHS):
            if path.is_symlink():
                try:
                    resolved_relative_posix = path.resolve().relative_to(source_tree).as_posix()
                    symlink_set.add((resolved_relative_posix, relative_path.as_posix()))
                except ValueError:
                    # Symlink leads out of the source tree
                    pass
            elif not any(skip in ('.git', '__pycache__', 'uc_staging') for skip in path.parts):
                try:
                    if should_prune(path, relative_path, used_pep_set, used_pip_set):
                        pruning_set.add(relative_path.as_posix())
                    elif should_domain_substitute(path, relative_path, search_regex, used_dep_set,
                                                  used_dip_set):
                        domain_substitution_set.add(relative_path.as_posix())
                except:  #pylint: disable=bare-except
                    get_logger().exception('Unhandled exception while processing %s', relative_path)
    return (used_pep_set, used_pip_set, used_dep_set, used_dip_set, pruning_set,
            domain_substitution_set, symlink_set)


def compute_lists(source_tree, search_regex, processes):  # pylint: disable=too-many-locals
    """
    Compute the binary pruning and domain substitution lists of the source tree.
    Returns a tuple of three items in the following order:
    1. The sorted binary pruning list
    2. The sorted domain substitution list
    3. An UnusedPatterns object
    source_tree is a pathlib.Path to the source tree
    search_regex is a compiled regex object to search for domain names
    processes is the maximum number of worker processes to create
    """
    pruning_set = set()
    domain_substitution_set = set()
    symlink_set = set()  # Set of (resolved POSIX path, symlink POSIX path) tuples
    source_tree = source_tree.resolve()
    unused_patterns = UnusedPatterns()
    # Launch multiple processes iterating over the source tree
    with Pool(processes) as procpool:
        returned_data = procpool.starmap(
            compute_lists_proc,
            zip(source_tree.rglob('*'), repeat(source_tree), repeat(search_regex)))
    # Handle the returned data
    for (used_pep_set, used_pip_set, used_dep_set, used_dip_set, returned_pruning_set,
         returned_domain_sub_set, returned_symlink_set) in returned_data:
        # pragma pylint: disable=no-member
        unused_patterns.pruning_exclude_patterns.difference_update(used_pep_set)
        unused_patterns.pruning_include_patterns.difference_update(used_pip_set)
        unused_patterns.domain_exclude_prefixes.difference_update(used_dep_set)
        unused_patterns.domain_include_patterns.difference_update(used_dip_set)
        # pragma pylint: enable=no-member
        pruning_set.update(returned_pruning_set)
        domain_substitution_set.update(returned_domain_sub_set)
        symlink_set.update(returned_symlink_set)
    # Prune symlinks for pruned files
    for (resolved, symlink) in symlink_set:
        if resolved in pruning_set:
            pruning_set.add(symlink)
    return sorted(pruning_set), sorted(domain_substitution_set), unused_patterns
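# Illustrative standalone use of compute_lists() (hypothetical paths; main() below does the
# equivalent based on command-line arguments):
#   search_regex = DomainRegexList(Path('domain_regex.list')).search_regex
#   pruning, domain_sub, unused = compute_lists(Path('build/src'), search_regex, None)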


def main(args_list=None):
    """CLI entrypoint"""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--pruning',
                        metavar='PATH',
                        type=Path,
                        default='pruning.list',
                        help='The path to store pruning.list. Default: %(default)s')
    parser.add_argument('--domain-substitution',
                        metavar='PATH',
                        type=Path,
                        default='domain_substitution.list',
                        help='The path to store domain_substitution.list. Default: %(default)s')
    parser.add_argument('--domain-regex',
                        metavar='PATH',
                        type=Path,
                        default='domain_regex.list',
                        help='The path to domain_regex.list. Default: %(default)s')
    parser.add_argument('-t',
                        '--tree',
                        metavar='PATH',
                        type=Path,
                        required=True,
                        help='The path to the source tree to use.')
    parser.add_argument(
        '--processes',
        metavar='NUM',
        type=int,
        default=None,
        help=
        'The maximum number of worker processes to create. Defaults to the number of system CPUs.')
    parser.add_argument('--domain-exclude-prefix',
                        metavar='PREFIX',
                        type=str,
                        action='append',
                        help='Additional exclusion for domain_substitution.list.')
    parser.add_argument('--no-error-unused',
                        action='store_false',
                        dest='error_unused',
                        help='Do not treat unused patterns/prefixes as an error.')
    args = parser.parse_args(args_list)
    if args.domain_exclude_prefix is not None:
        DOMAIN_EXCLUDE_PREFIXES.extend(args.domain_exclude_prefix)
    if args.tree.exists() and not _dir_empty(args.tree):
        get_logger().info('Using existing source tree at %s', args.tree)
    else:
        get_logger().error('No source tree found. Aborting.')
        sys.exit(1)
    get_logger().info('Computing lists...')
    pruning_set, domain_substitution_set, unused_patterns = compute_lists(
        args.tree,
        DomainRegexList(args.domain_regex).search_regex, args.processes)
    with args.pruning.open('w', encoding=_ENCODING) as file_obj:
        file_obj.writelines('%s\n' % line for line in pruning_set)
    with args.domain_substitution.open('w', encoding=_ENCODING) as file_obj:
        file_obj.writelines('%s\n' % line for line in domain_substitution_set)
    if unused_patterns.log_unused(args.error_unused) and args.error_unused:
        get_logger().error('Please update or remove unused patterns and/or prefixes. '
                           'The lists have still been updated with the remaining valid entries.')
        sys.exit(1)


if __name__ == "__main__":
    main()