Bug 1497898 - Update the gitignore implementation to work as an iterator filter, r=ato

This updates the gitignore implemenation to take input like os.walk but with additional stat data for the files. It also makes several useful optimistaions: * Avoid using regex when just matching a literal * Identify patterns that can only match the final component of a path and run those against that component rather than the full path. * Add the possibility of providing a dictionary of paths to gitignore statuses as a cache. This dramatically reduces the amount of time we spend in gitignore processing when updating the manifest. Depends on D8223 Differential Revision: https://phabricator.services.mozilla.com/D8224 --HG-- extra : moz-landing-system : lando
2018-11-16 18:48:34 +00:00 · 2018-11-16 18:48:34 +00:00 · 62e4d2f4f2
--- a/testing/web-platform/tests/tools/gitignore/gitignore.py
+++ b/testing/web-platform/tests/tools/gitignore/gitignore.py
@ -1,26 +1,29 @@
 import re
 import os
+import itertools
+from six import itervalues, iteritems
+from collections import defaultdict
+

 end_space = re.compile(r"([^\\]\s)*$")


-def fnmatch_translate(pat, allow_component_only=True):
+def fnmatch_translate(pat):
    parts = []
-    seq = False
+    seq = None
    i = 0
-    component_pattern = False
+    any_char = "[^/]"
    if pat[0] == "/":
        parts.append("^")
-        any_char = "[^/]"
-        if pat[0] == "/":
-            pat = pat[1:]
+        pat = pat[1:]
    else:
-        any_char = "."
-        if allow_component_only and "/" not in pat:
-            component_pattern = True
-            parts.append("^")
-        else:
-            parts.append("^(?:.*/)?")
+        # By default match the entire path up to a /
+        # but if / doesn't appear in the pattern we will mark is as
+        # a name pattern and just produce a pattern that matches against
+        # the filename
+        parts.append("^(?:.*/)?")
+
+    name_pattern = True
    if pat[-1] == "/":
        # If the last character is / match this directory or any subdirectory
        pat = pat[:-1]
@ -36,11 +39,10 @@ def fnmatch_translate(pat, allow_component_only=True):
                parts.append(re.escape(c))
            else:
                raise ValueError
-        elif seq:
+        elif seq is not None:
+            # TODO: this doesn't really handle invalid sequences in the right way
            if c == "]":
-                seq = False
-                # First two cases are to deal with the case where / is the only character
-                # in the sequence but path_name is True so it shouldn't match anything
+                seq = None
                if parts[-1] == "[":
                    parts = parts[:-1]
                elif parts[-1] == "^" and parts[-2] == "[":
@ -56,28 +58,39 @@ def fnmatch_translate(pat, allow_component_only=True):
            if i < len(pat) - 1 and pat[i+1] in ("!", "^"):
                parts.append("^")
                i += 1
-            seq = True
+            seq = i
        elif c == "*":
            if i < len(pat) - 1 and pat[i+1] == "*":
-                parts.append(any_char + "*")
+                if i > 0 and pat[i-1] != "/":
+                    raise ValueError
+                parts.append(".*")
                i += 1
-                if i < len(pat) - 1 and pat[i+1] == "*":
+                if i < len(pat) - 1 and pat[i+1] != "/":
                    raise ValueError
            else:
                parts.append(any_char + "*")
        elif c == "?":
            parts.append(any_char)
+        elif c == "/" and not seq:
+            name_pattern = False
+            parts.append(c)
        else:
            parts.append(re.escape(c))
        i += 1

-    if seq:
+    if name_pattern:
+        parts[0] = "^"
+
+    if seq is not None:
        raise ValueError
    parts.append(suffix)
    try:
-        return component_pattern, re.compile("".join(parts))
+        return name_pattern, re.compile("".join(parts))
    except Exception:
-        raise
+        raise ValueError
+
+# Regexp matching rules that have to be converted to patterns
+pattern_re = re.compile(r".*[\*\[\?]")


 def parse_line(line):
@ -94,11 +107,19 @@ def parse_line(line):
    if dir_only:
        line = line[:-1]

-    return invert, dir_only, fnmatch_translate(line, dir_only)
+    # Could make a special case for **/foo, but we don't have any patterns like that
+    if not invert and not pattern_re.match(line):
+        literal = True
+        pattern = tuple(line.rsplit("/", 1))
+    else:
+        pattern = fnmatch_translate(line)
+        literal = False
+
+    return invert, dir_only, literal, pattern


 class PathFilter(object):
-    def __init__(self, root, extras=None):
+    def __init__(self, root, extras=None, cache=None):
        if root:
            ignore_path = os.path.join(root, ".gitignore")
        else:
@ -108,51 +129,119 @@ class PathFilter(object):
            return
        self.trivial = False

-        self.rules_file = []
-        self.rules_dir = []
+        self.literals_file = defaultdict(dict)
+        self.literals_dir = defaultdict(dict)
+        self.patterns_file = []
+        self.patterns_dir = []
+        self.cache = cache or {}

        if extras is None:
            extras = []

        if ignore_path and os.path.exists(ignore_path):
-            self._read_ignore(ignore_path)
+            args = ignore_path, extras
+        else:
+            args = None, extras
+        self._read_ignore(*args)

-        for item in extras:
-            self._read_line(item)
-
-    def _read_ignore(self, ignore_path):
-        with open(ignore_path) as f:
-            for line in f:
-                self._read_line(line)
+    def _read_ignore(self, ignore_path, extras):
+        if ignore_path is not None:
+            with open(ignore_path) as f:
+                for line in f:
+                    self._read_line(line)
+        for line in extras:
+            self._read_line(line)

    def _read_line(self, line):
        parsed = parse_line(line)
        if not parsed:
            return
-        invert, dir_only, regexp = parsed
-        if dir_only:
-            self.rules_dir.append((regexp, invert))
+        invert, dir_only, literal, rule = parsed
+
+        if invert:
+            # For exclude rules, we attach the rules to all preceeding patterns, so
+            # that we can match patterns out of order and check if they were later
+            # overriden by an exclude rule
+            assert not literal
+            if not dir_only:
+                rules_iter = itertools.chain(
+                    itertools.chain(*(iteritems(item) for item in itervalues(self.literals_dir))),
+                    itertools.chain(*(iteritems(item) for item in itervalues(self.literals_file))),
+                    self.patterns_dir,
+                    self.patterns_file)
+            else:
+                rules_iter = itertools.chain(
+                    itertools.chain(*(iteritems(item) for item in itervalues(self.literals_dir))),
+                    self.patterns_dir)
+
+            for rules in rules_iter:
+                rules[1].append(rule)
        else:
-            self.rules_file.append((regexp, invert))
+            if literal:
+                if len(rule) == 1:
+                    dir_name, pattern = None, rule[0]
+                else:
+                    dir_name, pattern = rule
+                self.literals_dir[dir_name][pattern] = []
+                if not dir_only:
+                    self.literals_file[dir_name][pattern] = []
+            else:
+                self.patterns_dir.append((rule, []))
+                if not dir_only:
+                    self.patterns_file.append((rule, []))

-    def __call__(self, path):
-        if os.path.sep != "/":
-            path = path.replace(os.path.sep, "/")
+    def filter(self, iterator):
+        empty = {}
+        for dirpath, dirnames, filenames in iterator:
+            orig_dirpath = dirpath
+            if os.path.sep != "/":
+                dirpath = dirpath.replace(os.path.sep, "/")

+            keep_dirs = []
+            keep_files = []
+
+            for iter_items, literals, patterns, target, suffix in [
+                    (dirnames, self.literals_dir, self.patterns_dir, keep_dirs, "/"),
+                    (filenames, self.literals_file, self.patterns_file, keep_files, "")]:
+                for item in iter_items:
+                    name = item[0]
+                    if dirpath:
+                        path = "%s/%s" % (dirpath, name) + suffix
+                    else:
+                        path = name + suffix
+                    if path in self.cache:
+                        if not self.cache[path]:
+                            target.append(item)
+                        continue
+                    for rule_dir in [None, dirpath]:
+                        if name in literals.get(rule_dir, empty):
+                            exclude = literals[rule_dir][name]
+                            if not any(rule.match(path) for rule in exclude):
+                                # Skip this item
+                                self.cache[path] = True
+                                break
+                    else:
+                        for (component_only, pattern), exclude in patterns:
+                            if component_only:
+                                match = pattern.match(name)
+                            else:
+                                match = pattern.match(path)
+                            if match:
+                                if not any(rule.match(name if name_only else path)
+                                           for name_only, rule in exclude):
+                                    # Skip this item
+                                    self.cache[path] = True
+                                    break
+                        else:
+                            self.cache[path] = False
+                            target.append(item)
+
+            dirnames[:] = keep_dirs
+            assert ".git" not in dirnames
+            yield orig_dirpath, dirnames, keep_files
+
+    def __call__(self, iterator):
        if self.trivial:
-            return True
+            return iterator

-        path_is_dir = path[-1] == "/"
-        if path_is_dir:
-            path = path[:-1]
-            rules = self.rules_dir
-        else:
-            rules = self.rules_file
-
-        include = True
-        for regexp, invert in rules:
-            if not include and invert and regexp.match(path):
-                include = True
-            elif include and not invert and regexp.match(path):
-                include = False
-        return include
+        return self.filter(iterator)
--- a/testing/web-platform/tests/tools/gitignore/tests/test_gitignore.py
+++ b/testing/web-platform/tests/tools/gitignore/tests/test_gitignore.py
@ -3,80 +3,98 @@ import pytest
 from ..gitignore import fnmatch_translate, PathFilter

 match_data = [
-    ("foo", False, ["a/foo", "foo"]),
-    ("*.a", False, ["foo.a", "a/foo.a", "a/b/foo.a", "a.a/foo.a"]),
-    ("*.py[co]", False, ["a.pyc", "a.pyo", "a/b/c.pyc"]),
-    ("\\#*", False, ["#a", "a/#b"]),
-    ("*#", False, ["a#", "a/b#", "#a#"]),
-    ("/*.c", False, ["a.c", ".c"]),
+    ("foo", True, ["a/foo", "foo"]),
+    ("*.a", True, ["foo.a", "a/foo.a", "a/b/foo.a", "a.a/foo.a"]),
+    ("*.py[co]", True, ["a.pyc", "a.pyo", "a/b/c.pyc"]),
+    ("\\#*", True, ["#a", "a/#b"]),
+    ("*#", True, ["a#", "a/b#", "#a#"]),
+    ("/*.c", True, ["a.c", ".c"]),
    ("**/b", False, ["a/b", "a/c/b"]),
    ("*b", True, ["ab"]),
-    ("**/b", True, ["a/b"]),
-    ("a/", True, ["a", "a/b", "a/b/c"])
+    ("*b", True, ["a/b"]),
+    ("**/b", False, ["a/b"]),
+    ("a/", True, ["a"]),
+    ("a[/]b", True, []),
+    ("**/b", False, ["a/c/b"]),
+    ("a?c", True, ["abc"]),
+    ("a[^b]c", True, ["acc"]),
+    ("a[b-c]c", True, ["abc", "acc"]),
+    ("a[^]c", True, ["ac"]),  # This is probably wrong
+    ("a[^]c", True, ["ac"]),  # This is probably wrong
 ]

 mismatch_data = [
-    ("foo", False, ["foob", "afoo"]),
-    ("*.a", False, ["a", "foo:a", "a.a/foo"]),
-    ("*.py[co]", False, ["a.pyd", "pyo"]),
-    ("/*.c", False, ["a/b.c"]),
-    ("*b", True, ["a/b"]),
-    ("**b", True, ["a/b"]),
-    ("a[/]b", True, ["a/b"]),
-    ("**/b", True, ["a/c/b"]),
-    ("a", True, ["ab"])
+    ("foo", True, ["foob", "afoo"]),
+    ("*.a", True, ["a", "foo:a", "a.a/foo"]),
+    ("*.py[co]", True, ["a.pyd", "pyo", "a.py"]),
+    ("a", True, ["ab"]),
+    ("a?c", True, ["ac", "abbc"]),
+    ("a[^b]c", True, ["abc"]),
+    ("a[b-c]c", True, ["adc"]),
 ]

 invalid_data = [
    "[a",
    "***/foo",
    "a\\",
+    "**b",
+    "b**/",
+    "[[]"
 ]

 filter_data = [
-    ("foo", True),
-    ("a", False),
-    ("a/b", False),
-    ("a/c", True),
-    ("a/c/", False),
-    ("c/b", True)
+    (["foo", "bar/", "/a", "*.py"],
+     [("", ["foo", "bar", "baz"], ["a"]),
+      ("baz", ["a"], ["foo", "bar"])],
+     [(["baz"], []),
+      (["a"], ["bar"])]),
+    (["#foo", "", "a*", "!a.py"],
+     [("", ["foo"], ["a", "a.foo", "a.py"])],
+     [(["foo"], ["a.py"])]),
 ]


 def expand_data(compact_data):
-    for pattern, path_name, inputs in compact_data:
+    for pattern, name_only, inputs in compact_data:
        for input in inputs:
-            yield pattern, input, path_name
+            yield pattern, name_only, input


-@pytest.mark.parametrize("pattern, input, path_name", expand_data(match_data))
-def tests_match(pattern, input, path_name):
-    regexp = fnmatch_translate(pattern, path_name)
+@pytest.mark.parametrize("pattern, name_only, input", expand_data(match_data))
+def tests_match(pattern, name_only, input):
+    name_only_result, regexp = fnmatch_translate(pattern)
+    assert name_only_result == name_only
+    if name_only:
+        input = input.rsplit("/", 1)[-1]
    assert regexp.match(input) is not None


-@pytest.mark.parametrize("pattern, input, path_name", expand_data(mismatch_data))
-def tests_no_match(pattern, input, path_name):
-    regexp = fnmatch_translate(pattern, path_name)
+@pytest.mark.parametrize("pattern, name_only, input", expand_data(mismatch_data))
+def tests_no_match(pattern, name_only, input):
+    name_only_result, regexp = fnmatch_translate(pattern)
+    assert name_only_result == name_only
+    if name_only:
+        input = input.rsplit("/", 1)[-1]
    assert regexp.match(input) is None


@pytest.mark.parametrize("pattern", invalid_data)
 def tests_invalid(pattern):
    with pytest.raises(ValueError):
-        fnmatch_translate(pattern, False)
-    with pytest.raises(ValueError):
-        fnmatch_translate(pattern, True)
+        fnmatch_translate(pattern)


-@pytest.mark.parametrize("path, expected", filter_data)
-def test_path_filter(path, expected):
-    extras = [
-        "#foo",
-        "a  ",
-        "**/b",
-        "a/c/",
-        "!c/b",
-    ]
-    f = PathFilter(None, extras)
-    assert f(path) == expected
+@pytest.mark.parametrize("rules, input, expected", filter_data)
+def test_path_filter(rules, input, expected):
+    f = PathFilter(None, rules)
+    # Add some fake stat data
+    for i, item in enumerate(input):
+        repl = [input[i][0]]
+        for j in [1, 2]:
+            repl.append([(name, None) for name in input[i][j]])
+        input[i] = tuple(repl)
+
+    for i, output in enumerate(f(input)):
+        assert output[0] == input[i][0]
+        for j in [1, 2]:
+            assert [item[0] for item in output[j]] == expected[i][j-1]