From 9719a9a5a9b01d1f74e01996ae3c326f1b55ced0 Mon Sep 17 00:00:00 2001 From: Ricky Stewart Date: Mon, 17 Aug 2020 15:19:34 +0000 Subject: [PATCH] Bug 1636797 - In `hash.py`, enumerate files from the VCS rather than searching the filesystem directly r=ahal This resolves a long-standing issue in development where `mach artifact` (and therefore `mach bootstrap`) would fail unpredictably if you had dirty, but ignored, files in your checkout. Resolving this problem often required unwieldy `hg purge`/`git clean` incantations that are easy to get wrong. This patch addresses the problem by doing what we "should" have been doing all along, and consulting the VCS to list tracked files rather than listing EVERY file on disk and applying heuristics to determine whether they should be included in the hash. Differential Revision: https://phabricator.services.mozilla.com/D86780 --- config/check_js_msg_encoding.py | 9 ++--- python/mozbuild/mozpack/files.py | 40 ++++++++++++++++++- .../mozversioncontrol/__init__.py | 23 +++++++---- taskcluster/taskgraph/util/hash.py | 9 ++++- 4 files changed, 66 insertions(+), 15 deletions(-) diff --git a/config/check_js_msg_encoding.py b/config/check_js_msg_encoding.py index 6a0ece99b739..197a51e1fb7f 100644 --- a/config/check_js_msg_encoding.py +++ b/config/check_js_msg_encoding.py @@ -54,11 +54,10 @@ def check_files(): with get_repository_from_env() as repo: root = repo.path - for filename in repo.get_files_in_working_directory(): - if filename.endswith('.msg'): - if filename not in ignore_files: - if not check_single_file(os.path.join(root, filename)): - result = False + for filename, _ in repo.get_tracked_files_finder().find('**/*.msg'): + if filename not in ignore_files: + if not check_single_file(os.path.join(root, filename)): + result = False return result diff --git a/python/mozbuild/mozpack/files.py b/python/mozbuild/mozpack/files.py index 215ec950d431..e7e13a081892 100644 --- a/python/mozbuild/mozpack/files.py +++ 
b/python/mozbuild/mozpack/files.py @@ -4,6 +4,7 @@ from __future__ import absolute_import, print_function, unicode_literals +import bisect import codecs import errno import inspect @@ -15,11 +16,12 @@ import stat import subprocess import uuid import mozbuild.makeutil as makeutil -from itertools import chain +from itertools import chain, takewhile from mozbuild.preprocessor import Preprocessor from mozbuild.util import ( FileAvoidWrite, ensure_unicode, + memoize ) from mozpack.executables import ( is_executable, @@ -1238,3 +1240,39 @@ class MercurialRevisionFinder(BaseFinder): self._files[path] = f return f + + +class FileListFinder(BaseFinder): + """Finder for a literal list of file names.""" + + def __init__(self, files): + """files must be a sorted list.""" + self._files = files + + @memoize + def _match(self, pattern): + """Return a sorted list of all files matching the given pattern.""" + # We don't use the utility _find_helper method because it's not tuned + # for performance in the way that we would like this class to be. That's + # a possible avenue for refactoring here. + ret = [] + # We do this as an optimization to figure out where in the sorted list + # to search and where to stop searching. + components = pattern.split('/') + prefix = '/'.join(takewhile(lambda s: '*' not in s, components)) + start = bisect.bisect_left(self._files, prefix) + for i in six.moves.range(start, len(self._files)): + f = self._files[i] + if not f.startswith(prefix): + break + # Skip hidden files while scanning. + if '/.' 
in f[len(prefix):]: + continue + if mozpath.match(f, pattern): + ret.append(f) + return ret + + def find(self, pattern): + pattern = pattern.strip('/') + for path in self._match(pattern): + yield path, File(path) diff --git a/python/mozversioncontrol/mozversioncontrol/__init__.py b/python/mozversioncontrol/mozversioncontrol/__init__.py index ba3296fc5ab7..6ce4b47408b0 100644 --- a/python/mozversioncontrol/mozversioncontrol/__init__.py +++ b/python/mozversioncontrol/mozversioncontrol/__init__.py @@ -14,6 +14,7 @@ import sys from mozbuild.util import ensure_subprocess_env from mozfile import which +from mozpack.files import FileListFinder class MissingVCSTool(Exception): @@ -216,8 +217,14 @@ class Repository(object): ''' @abc.abstractmethod - def get_files_in_working_directory(self): - """Obtain a list of managed files in the working directory.""" + def get_tracked_files_finder(self): + """Obtain a mozpack.files.BaseFinder of managed files in the working + directory. + + The Finder will have its list of all files in the repo cached for its + entire lifetime, so operations on the Finder will not track with, for + example, commits to the repo during the Finder's lifetime. + """ @abc.abstractmethod def working_directory_clean(self, untracked=False, ignored=False): @@ -419,10 +426,11 @@ class HgRepository(Repository): return self._run('forget', *paths) - def get_files_in_working_directory(self): + def get_tracked_files_finder(self): # Can return backslashes on Windows. Normalize to forward slashes. 
- return list(p.replace('\\', '/') for p in - self._run(b'files', b'-0').split('\0') if p) + files = list(p.replace('\\', '/') for p in + self._run(b'files', b'-0').split('\0') if p) + return FileListFinder(files) def working_directory_clean(self, untracked=False, ignored=False): args = ['status', '--modified', '--added', '--removed', @@ -549,8 +557,9 @@ class GitRepository(Repository): return self._run('reset', *paths) - def get_files_in_working_directory(self): - return [p for p in self._run('ls-files', '-z').split('\0') if p] + def get_tracked_files_finder(self): + files = [p for p in self._run('ls-files', '-z').split('\0') if p] + return FileListFinder(files) def working_directory_clean(self, untracked=False, ignored=False): args = ['status', '--porcelain'] diff --git a/taskcluster/taskgraph/util/hash.py b/taskcluster/taskgraph/util/hash.py index 2db28ee1c5e7..42692162f161 100644 --- a/taskcluster/taskgraph/util/hash.py +++ b/taskcluster/taskgraph/util/hash.py @@ -4,8 +4,8 @@ from __future__ import absolute_import, print_function, unicode_literals from mozbuild.util import memoize -from mozpack.files import FileFinder import mozpack.path as mozpath +from mozversioncontrol import get_repository_object import hashlib import io import six @@ -21,6 +21,11 @@ def hash_path(path): return hashlib.sha256(fh.read()).hexdigest() +@memoize +def get_file_finder(base_path): + return get_repository_object(base_path).get_tracked_files_finder() + + def hash_paths(base_path, patterns): """ Give a list of path patterns, return a digest of the contents of all @@ -30,7 +35,7 @@ def hash_paths(base_path, patterns): Each file is hashed. The list of all hashes and file paths is then itself hashed to produce the result. """ - finder = FileFinder(base_path) + finder = get_file_finder(base_path) h = hashlib.sha256() files = {} for pattern in patterns: