Bug 1497898 - Update the gitignore implementation to work as an iterator filter, r=ato

This updates the gitignore implementation to take input like os.walk
but with additional stat data for the files. It also makes several
useful optimisations:

 * Avoid using regex when just matching a literal
 * Identify patterns that can only match the final component of a path
   and run those against that component rather than the full path.
 * Add the possibility of providing a dictionary of paths to gitignore
   statuses as a cache.

This dramatically reduces the amount of time we spend in gitignore
processing when updating the manifest.

Depends on D8223

Differential Revision: https://phabricator.services.mozilla.com/D8224

--HG--
extra : moz-landing-system : lando
This commit is contained in:
James Graham 2018-11-02 17:13:59 +00:00
parent cf2776ad83
commit d273ec6723
2 changed files with 208 additions and 101 deletions

View File

@ -1,26 +1,29 @@
import re
import os
import itertools
from six import itervalues, iteritems
from collections import defaultdict
end_space = re.compile(r"([^\\]\s)*$")
def fnmatch_translate(pat, allow_component_only=True):
def fnmatch_translate(pat):
parts = []
seq = False
seq = None
i = 0
component_pattern = False
any_char = "[^/]"
if pat[0] == "/":
parts.append("^")
any_char = "[^/]"
if pat[0] == "/":
pat = pat[1:]
pat = pat[1:]
else:
any_char = "."
if allow_component_only and "/" not in pat:
component_pattern = True
parts.append("^")
else:
parts.append("^(?:.*/)?")
# By default match the entire path up to a /
# but if / doesn't appear in the pattern we will mark it as
# a name pattern and just produce a pattern that matches against
# the filename
parts.append("^(?:.*/)?")
name_pattern = True
if pat[-1] == "/":
# If the last character is / match this directory or any subdirectory
pat = pat[:-1]
@ -36,11 +39,10 @@ def fnmatch_translate(pat, allow_component_only=True):
parts.append(re.escape(c))
else:
raise ValueError
elif seq:
elif seq is not None:
# TODO: this doesn't really handle invalid sequences in the right way
if c == "]":
seq = False
# First two cases are to deal with the case where / is the only character
# in the sequence but path_name is True so it shouldn't match anything
seq = None
if parts[-1] == "[":
parts = parts[:-1]
elif parts[-1] == "^" and parts[-2] == "[":
@ -56,28 +58,39 @@ def fnmatch_translate(pat, allow_component_only=True):
if i < len(pat) - 1 and pat[i+1] in ("!", "^"):
parts.append("^")
i += 1
seq = True
seq = i
elif c == "*":
if i < len(pat) - 1 and pat[i+1] == "*":
parts.append(any_char + "*")
if i > 0 and pat[i-1] != "/":
raise ValueError
parts.append(".*")
i += 1
if i < len(pat) - 1 and pat[i+1] == "*":
if i < len(pat) - 1 and pat[i+1] != "/":
raise ValueError
else:
parts.append(any_char + "*")
elif c == "?":
parts.append(any_char)
elif c == "/" and not seq:
name_pattern = False
parts.append(c)
else:
parts.append(re.escape(c))
i += 1
if seq:
if name_pattern:
parts[0] = "^"
if seq is not None:
raise ValueError
parts.append(suffix)
try:
return component_pattern, re.compile("".join(parts))
return name_pattern, re.compile("".join(parts))
except Exception:
raise
raise ValueError
# Regexp matching rules that have to be converted to patterns
pattern_re = re.compile(r".*[\*\[\?]")
def parse_line(line):
@ -94,11 +107,19 @@ def parse_line(line):
if dir_only:
line = line[:-1]
return invert, dir_only, fnmatch_translate(line, dir_only)
# Could make a special case for **/foo, but we don't have any patterns like that
if not invert and not pattern_re.match(line):
literal = True
pattern = tuple(line.rsplit("/", 1))
else:
pattern = fnmatch_translate(line)
literal = False
return invert, dir_only, literal, pattern
class PathFilter(object):
def __init__(self, root, extras=None):
def __init__(self, root, extras=None, cache=None):
if root:
ignore_path = os.path.join(root, ".gitignore")
else:
@ -108,51 +129,119 @@ class PathFilter(object):
return
self.trivial = False
self.rules_file = []
self.rules_dir = []
self.literals_file = defaultdict(dict)
self.literals_dir = defaultdict(dict)
self.patterns_file = []
self.patterns_dir = []
self.cache = cache or {}
if extras is None:
extras = []
if ignore_path and os.path.exists(ignore_path):
self._read_ignore(ignore_path)
args = ignore_path, extras
else:
args = None, extras
self._read_ignore(*args)
for item in extras:
self._read_line(item)
def _read_ignore(self, ignore_path):
with open(ignore_path) as f:
for line in f:
self._read_line(line)
def _read_ignore(self, ignore_path, extras):
if ignore_path is not None:
with open(ignore_path) as f:
for line in f:
self._read_line(line)
for line in extras:
self._read_line(line)
def _read_line(self, line):
parsed = parse_line(line)
if not parsed:
return
invert, dir_only, regexp = parsed
if dir_only:
self.rules_dir.append((regexp, invert))
invert, dir_only, literal, rule = parsed
if invert:
# For exclude rules, we attach the rules to all preceding patterns, so
# that we can match patterns out of order and check if they were later
# overridden by an exclude rule
assert not literal
if not dir_only:
rules_iter = itertools.chain(
itertools.chain(*(iteritems(item) for item in itervalues(self.literals_dir))),
itertools.chain(*(iteritems(item) for item in itervalues(self.literals_file))),
self.patterns_dir,
self.patterns_file)
else:
rules_iter = itertools.chain(
itertools.chain(*(iteritems(item) for item in itervalues(self.literals_dir))),
self.patterns_dir)
for rules in rules_iter:
rules[1].append(rule)
else:
self.rules_file.append((regexp, invert))
if literal:
if len(rule) == 1:
dir_name, pattern = None, rule[0]
else:
dir_name, pattern = rule
self.literals_dir[dir_name][pattern] = []
if not dir_only:
self.literals_file[dir_name][pattern] = []
else:
self.patterns_dir.append((rule, []))
if not dir_only:
self.patterns_file.append((rule, []))
def __call__(self, path):
if os.path.sep != "/":
path = path.replace(os.path.sep, "/")
def filter(self, iterator):
empty = {}
for dirpath, dirnames, filenames in iterator:
orig_dirpath = dirpath
if os.path.sep != "/":
dirpath = dirpath.replace(os.path.sep, "/")
keep_dirs = []
keep_files = []
for iter_items, literals, patterns, target, suffix in [
(dirnames, self.literals_dir, self.patterns_dir, keep_dirs, "/"),
(filenames, self.literals_file, self.patterns_file, keep_files, "")]:
for item in iter_items:
name = item[0]
if dirpath:
path = "%s/%s" % (dirpath, name) + suffix
else:
path = name + suffix
if path in self.cache:
if not self.cache[path]:
target.append(item)
continue
for rule_dir in [None, dirpath]:
if name in literals.get(rule_dir, empty):
exclude = literals[rule_dir][name]
if not any(rule.match(path) for rule in exclude):
# Skip this item
self.cache[path] = True
break
else:
for (component_only, pattern), exclude in patterns:
if component_only:
match = pattern.match(name)
else:
match = pattern.match(path)
if match:
if not any(rule.match(name if name_only else path)
for name_only, rule in exclude):
# Skip this item
self.cache[path] = True
break
else:
self.cache[path] = False
target.append(item)
dirnames[:] = keep_dirs
assert ".git" not in dirnames
yield orig_dirpath, dirnames, keep_files
def __call__(self, iterator):
if self.trivial:
return True
return iterator
path_is_dir = path[-1] == "/"
if path_is_dir:
path = path[:-1]
rules = self.rules_dir
else:
rules = self.rules_file
include = True
for regexp, invert in rules:
if not include and invert and regexp.match(path):
include = True
elif include and not invert and regexp.match(path):
include = False
return include
return self.filter(iterator)

View File

@ -3,80 +3,98 @@ import pytest
from ..gitignore import fnmatch_translate, PathFilter
match_data = [
("foo", False, ["a/foo", "foo"]),
("*.a", False, ["foo.a", "a/foo.a", "a/b/foo.a", "a.a/foo.a"]),
("*.py[co]", False, ["a.pyc", "a.pyo", "a/b/c.pyc"]),
("\\#*", False, ["#a", "a/#b"]),
("*#", False, ["a#", "a/b#", "#a#"]),
("/*.c", False, ["a.c", ".c"]),
("foo", True, ["a/foo", "foo"]),
("*.a", True, ["foo.a", "a/foo.a", "a/b/foo.a", "a.a/foo.a"]),
("*.py[co]", True, ["a.pyc", "a.pyo", "a/b/c.pyc"]),
("\\#*", True, ["#a", "a/#b"]),
("*#", True, ["a#", "a/b#", "#a#"]),
("/*.c", True, ["a.c", ".c"]),
("**/b", False, ["a/b", "a/c/b"]),
("*b", True, ["ab"]),
("**/b", True, ["a/b"]),
("a/", True, ["a", "a/b", "a/b/c"])
("*b", True, ["a/b"]),
("**/b", False, ["a/b"]),
("a/", True, ["a"]),
("a[/]b", True, []),
("**/b", False, ["a/c/b"]),
("a?c", True, ["abc"]),
("a[^b]c", True, ["acc"]),
("a[b-c]c", True, ["abc", "acc"]),
("a[^]c", True, ["ac"]), # This is probably wrong
("a[^]c", True, ["ac"]), # This is probably wrong
]
mismatch_data = [
("foo", False, ["foob", "afoo"]),
("*.a", False, ["a", "foo:a", "a.a/foo"]),
("*.py[co]", False, ["a.pyd", "pyo"]),
("/*.c", False, ["a/b.c"]),
("*b", True, ["a/b"]),
("**b", True, ["a/b"]),
("a[/]b", True, ["a/b"]),
("**/b", True, ["a/c/b"]),
("a", True, ["ab"])
("foo", True, ["foob", "afoo"]),
("*.a", True, ["a", "foo:a", "a.a/foo"]),
("*.py[co]", True, ["a.pyd", "pyo", "a.py"]),
("a", True, ["ab"]),
("a?c", True, ["ac", "abbc"]),
("a[^b]c", True, ["abc"]),
("a[b-c]c", True, ["adc"]),
]
invalid_data = [
"[a",
"***/foo",
"a\\",
"**b",
"b**/",
"[[]"
]
filter_data = [
("foo", True),
("a", False),
("a/b", False),
("a/c", True),
("a/c/", False),
("c/b", True)
(["foo", "bar/", "/a", "*.py"],
[("", ["foo", "bar", "baz"], ["a"]),
("baz", ["a"], ["foo", "bar"])],
[(["baz"], []),
(["a"], ["bar"])]),
(["#foo", "", "a*", "!a.py"],
[("", ["foo"], ["a", "a.foo", "a.py"])],
[(["foo"], ["a.py"])]),
]
def expand_data(compact_data):
for pattern, path_name, inputs in compact_data:
for pattern, name_only, inputs in compact_data:
for input in inputs:
yield pattern, input, path_name
yield pattern, name_only, input
@pytest.mark.parametrize("pattern, input, path_name", expand_data(match_data))
def tests_match(pattern, input, path_name):
regexp = fnmatch_translate(pattern, path_name)
@pytest.mark.parametrize("pattern, name_only, input", expand_data(match_data))
def tests_match(pattern, name_only, input):
name_only_result, regexp = fnmatch_translate(pattern)
assert name_only_result == name_only
if name_only:
input = input.rsplit("/", 1)[-1]
assert regexp.match(input) is not None
@pytest.mark.parametrize("pattern, input, path_name", expand_data(mismatch_data))
def tests_no_match(pattern, input, path_name):
regexp = fnmatch_translate(pattern, path_name)
@pytest.mark.parametrize("pattern, name_only, input", expand_data(mismatch_data))
def tests_no_match(pattern, name_only, input):
name_only_result, regexp = fnmatch_translate(pattern)
assert name_only_result == name_only
if name_only:
input = input.rsplit("/", 1)[-1]
assert regexp.match(input) is None
@pytest.mark.parametrize("pattern", invalid_data)
def tests_invalid(pattern):
with pytest.raises(ValueError):
fnmatch_translate(pattern, False)
with pytest.raises(ValueError):
fnmatch_translate(pattern, True)
fnmatch_translate(pattern)
@pytest.mark.parametrize("path, expected", filter_data)
def test_path_filter(path, expected):
extras = [
"#foo",
"a ",
"**/b",
"a/c/",
"!c/b",
]
f = PathFilter(None, extras)
assert f(path) == expected
@pytest.mark.parametrize("rules, input, expected", filter_data)
def test_path_filter(rules, input, expected):
f = PathFilter(None, rules)
# Add some fake stat data
for i, item in enumerate(input):
repl = [input[i][0]]
for j in [1, 2]:
repl.append([(name, None) for name in input[i][j]])
input[i] = tuple(repl)
for i, output in enumerate(f(input)):
assert output[0] == input[i][0]
for j in [1, 2]:
assert [item[0] for item in output[j]] == expected[i][j-1]