mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-28 23:31:56 +00:00
Bug 1497898 - Update the gitignore implementation to work as an iterator filter, r=ato
This updates the gitignore implemenation to take input like os.walk but with additional stat data for the files. It also makes several useful optimistaions: * Avoid using regex when just matching a literal * Identify patterns that can only match the final component of a path and run those against that component rather than the full path. * Add the possibility of providing a dictionary of paths to gitignore statuses as a cache. This dramatically reduces the amount of time we spend in gitignore processing when updating the manifest. Depends on D8223 Differential Revision: https://phabricator.services.mozilla.com/D8224 --HG-- extra : moz-landing-system : lando
This commit is contained in:
parent
cf2776ad83
commit
d273ec6723
@ -1,26 +1,29 @@
|
||||
import re
|
||||
import os
|
||||
import itertools
|
||||
from six import itervalues, iteritems
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
end_space = re.compile(r"([^\\]\s)*$")
|
||||
|
||||
|
||||
def fnmatch_translate(pat, allow_component_only=True):
|
||||
def fnmatch_translate(pat):
|
||||
parts = []
|
||||
seq = False
|
||||
seq = None
|
||||
i = 0
|
||||
component_pattern = False
|
||||
any_char = "[^/]"
|
||||
if pat[0] == "/":
|
||||
parts.append("^")
|
||||
any_char = "[^/]"
|
||||
if pat[0] == "/":
|
||||
pat = pat[1:]
|
||||
pat = pat[1:]
|
||||
else:
|
||||
any_char = "."
|
||||
if allow_component_only and "/" not in pat:
|
||||
component_pattern = True
|
||||
parts.append("^")
|
||||
else:
|
||||
parts.append("^(?:.*/)?")
|
||||
# By default match the entire path up to a /
|
||||
# but if / doesn't appear in the pattern we will mark is as
|
||||
# a name pattern and just produce a pattern that matches against
|
||||
# the filename
|
||||
parts.append("^(?:.*/)?")
|
||||
|
||||
name_pattern = True
|
||||
if pat[-1] == "/":
|
||||
# If the last character is / match this directory or any subdirectory
|
||||
pat = pat[:-1]
|
||||
@ -36,11 +39,10 @@ def fnmatch_translate(pat, allow_component_only=True):
|
||||
parts.append(re.escape(c))
|
||||
else:
|
||||
raise ValueError
|
||||
elif seq:
|
||||
elif seq is not None:
|
||||
# TODO: this doesn't really handle invalid sequences in the right way
|
||||
if c == "]":
|
||||
seq = False
|
||||
# First two cases are to deal with the case where / is the only character
|
||||
# in the sequence but path_name is True so it shouldn't match anything
|
||||
seq = None
|
||||
if parts[-1] == "[":
|
||||
parts = parts[:-1]
|
||||
elif parts[-1] == "^" and parts[-2] == "[":
|
||||
@ -56,28 +58,39 @@ def fnmatch_translate(pat, allow_component_only=True):
|
||||
if i < len(pat) - 1 and pat[i+1] in ("!", "^"):
|
||||
parts.append("^")
|
||||
i += 1
|
||||
seq = True
|
||||
seq = i
|
||||
elif c == "*":
|
||||
if i < len(pat) - 1 and pat[i+1] == "*":
|
||||
parts.append(any_char + "*")
|
||||
if i > 0 and pat[i-1] != "/":
|
||||
raise ValueError
|
||||
parts.append(".*")
|
||||
i += 1
|
||||
if i < len(pat) - 1 and pat[i+1] == "*":
|
||||
if i < len(pat) - 1 and pat[i+1] != "/":
|
||||
raise ValueError
|
||||
else:
|
||||
parts.append(any_char + "*")
|
||||
elif c == "?":
|
||||
parts.append(any_char)
|
||||
elif c == "/" and not seq:
|
||||
name_pattern = False
|
||||
parts.append(c)
|
||||
else:
|
||||
parts.append(re.escape(c))
|
||||
i += 1
|
||||
|
||||
if seq:
|
||||
if name_pattern:
|
||||
parts[0] = "^"
|
||||
|
||||
if seq is not None:
|
||||
raise ValueError
|
||||
parts.append(suffix)
|
||||
try:
|
||||
return component_pattern, re.compile("".join(parts))
|
||||
return name_pattern, re.compile("".join(parts))
|
||||
except Exception:
|
||||
raise
|
||||
raise ValueError
|
||||
|
||||
# Regexp matching rules that have to be converted to patterns
|
||||
pattern_re = re.compile(r".*[\*\[\?]")
|
||||
|
||||
|
||||
def parse_line(line):
|
||||
@ -94,11 +107,19 @@ def parse_line(line):
|
||||
if dir_only:
|
||||
line = line[:-1]
|
||||
|
||||
return invert, dir_only, fnmatch_translate(line, dir_only)
|
||||
# Could make a special case for **/foo, but we don't have any patterns like that
|
||||
if not invert and not pattern_re.match(line):
|
||||
literal = True
|
||||
pattern = tuple(line.rsplit("/", 1))
|
||||
else:
|
||||
pattern = fnmatch_translate(line)
|
||||
literal = False
|
||||
|
||||
return invert, dir_only, literal, pattern
|
||||
|
||||
|
||||
class PathFilter(object):
|
||||
def __init__(self, root, extras=None):
|
||||
def __init__(self, root, extras=None, cache=None):
|
||||
if root:
|
||||
ignore_path = os.path.join(root, ".gitignore")
|
||||
else:
|
||||
@ -108,51 +129,119 @@ class PathFilter(object):
|
||||
return
|
||||
self.trivial = False
|
||||
|
||||
self.rules_file = []
|
||||
self.rules_dir = []
|
||||
self.literals_file = defaultdict(dict)
|
||||
self.literals_dir = defaultdict(dict)
|
||||
self.patterns_file = []
|
||||
self.patterns_dir = []
|
||||
self.cache = cache or {}
|
||||
|
||||
if extras is None:
|
||||
extras = []
|
||||
|
||||
if ignore_path and os.path.exists(ignore_path):
|
||||
self._read_ignore(ignore_path)
|
||||
args = ignore_path, extras
|
||||
else:
|
||||
args = None, extras
|
||||
self._read_ignore(*args)
|
||||
|
||||
for item in extras:
|
||||
self._read_line(item)
|
||||
|
||||
def _read_ignore(self, ignore_path):
|
||||
with open(ignore_path) as f:
|
||||
for line in f:
|
||||
self._read_line(line)
|
||||
def _read_ignore(self, ignore_path, extras):
|
||||
if ignore_path is not None:
|
||||
with open(ignore_path) as f:
|
||||
for line in f:
|
||||
self._read_line(line)
|
||||
for line in extras:
|
||||
self._read_line(line)
|
||||
|
||||
def _read_line(self, line):
|
||||
parsed = parse_line(line)
|
||||
if not parsed:
|
||||
return
|
||||
invert, dir_only, regexp = parsed
|
||||
if dir_only:
|
||||
self.rules_dir.append((regexp, invert))
|
||||
invert, dir_only, literal, rule = parsed
|
||||
|
||||
if invert:
|
||||
# For exclude rules, we attach the rules to all preceeding patterns, so
|
||||
# that we can match patterns out of order and check if they were later
|
||||
# overriden by an exclude rule
|
||||
assert not literal
|
||||
if not dir_only:
|
||||
rules_iter = itertools.chain(
|
||||
itertools.chain(*(iteritems(item) for item in itervalues(self.literals_dir))),
|
||||
itertools.chain(*(iteritems(item) for item in itervalues(self.literals_file))),
|
||||
self.patterns_dir,
|
||||
self.patterns_file)
|
||||
else:
|
||||
rules_iter = itertools.chain(
|
||||
itertools.chain(*(iteritems(item) for item in itervalues(self.literals_dir))),
|
||||
self.patterns_dir)
|
||||
|
||||
for rules in rules_iter:
|
||||
rules[1].append(rule)
|
||||
else:
|
||||
self.rules_file.append((regexp, invert))
|
||||
if literal:
|
||||
if len(rule) == 1:
|
||||
dir_name, pattern = None, rule[0]
|
||||
else:
|
||||
dir_name, pattern = rule
|
||||
self.literals_dir[dir_name][pattern] = []
|
||||
if not dir_only:
|
||||
self.literals_file[dir_name][pattern] = []
|
||||
else:
|
||||
self.patterns_dir.append((rule, []))
|
||||
if not dir_only:
|
||||
self.patterns_file.append((rule, []))
|
||||
|
||||
def __call__(self, path):
|
||||
if os.path.sep != "/":
|
||||
path = path.replace(os.path.sep, "/")
|
||||
def filter(self, iterator):
|
||||
empty = {}
|
||||
for dirpath, dirnames, filenames in iterator:
|
||||
orig_dirpath = dirpath
|
||||
if os.path.sep != "/":
|
||||
dirpath = dirpath.replace(os.path.sep, "/")
|
||||
|
||||
keep_dirs = []
|
||||
keep_files = []
|
||||
|
||||
for iter_items, literals, patterns, target, suffix in [
|
||||
(dirnames, self.literals_dir, self.patterns_dir, keep_dirs, "/"),
|
||||
(filenames, self.literals_file, self.patterns_file, keep_files, "")]:
|
||||
for item in iter_items:
|
||||
name = item[0]
|
||||
if dirpath:
|
||||
path = "%s/%s" % (dirpath, name) + suffix
|
||||
else:
|
||||
path = name + suffix
|
||||
if path in self.cache:
|
||||
if not self.cache[path]:
|
||||
target.append(item)
|
||||
continue
|
||||
for rule_dir in [None, dirpath]:
|
||||
if name in literals.get(rule_dir, empty):
|
||||
exclude = literals[rule_dir][name]
|
||||
if not any(rule.match(path) for rule in exclude):
|
||||
# Skip this item
|
||||
self.cache[path] = True
|
||||
break
|
||||
else:
|
||||
for (component_only, pattern), exclude in patterns:
|
||||
if component_only:
|
||||
match = pattern.match(name)
|
||||
else:
|
||||
match = pattern.match(path)
|
||||
if match:
|
||||
if not any(rule.match(name if name_only else path)
|
||||
for name_only, rule in exclude):
|
||||
# Skip this item
|
||||
self.cache[path] = True
|
||||
break
|
||||
else:
|
||||
self.cache[path] = False
|
||||
target.append(item)
|
||||
|
||||
dirnames[:] = keep_dirs
|
||||
assert ".git" not in dirnames
|
||||
yield orig_dirpath, dirnames, keep_files
|
||||
|
||||
def __call__(self, iterator):
|
||||
if self.trivial:
|
||||
return True
|
||||
return iterator
|
||||
|
||||
path_is_dir = path[-1] == "/"
|
||||
if path_is_dir:
|
||||
path = path[:-1]
|
||||
rules = self.rules_dir
|
||||
else:
|
||||
rules = self.rules_file
|
||||
|
||||
include = True
|
||||
for regexp, invert in rules:
|
||||
if not include and invert and regexp.match(path):
|
||||
include = True
|
||||
elif include and not invert and regexp.match(path):
|
||||
include = False
|
||||
return include
|
||||
return self.filter(iterator)
|
||||
|
@ -3,80 +3,98 @@ import pytest
|
||||
from ..gitignore import fnmatch_translate, PathFilter
|
||||
|
||||
match_data = [
|
||||
("foo", False, ["a/foo", "foo"]),
|
||||
("*.a", False, ["foo.a", "a/foo.a", "a/b/foo.a", "a.a/foo.a"]),
|
||||
("*.py[co]", False, ["a.pyc", "a.pyo", "a/b/c.pyc"]),
|
||||
("\\#*", False, ["#a", "a/#b"]),
|
||||
("*#", False, ["a#", "a/b#", "#a#"]),
|
||||
("/*.c", False, ["a.c", ".c"]),
|
||||
("foo", True, ["a/foo", "foo"]),
|
||||
("*.a", True, ["foo.a", "a/foo.a", "a/b/foo.a", "a.a/foo.a"]),
|
||||
("*.py[co]", True, ["a.pyc", "a.pyo", "a/b/c.pyc"]),
|
||||
("\\#*", True, ["#a", "a/#b"]),
|
||||
("*#", True, ["a#", "a/b#", "#a#"]),
|
||||
("/*.c", True, ["a.c", ".c"]),
|
||||
("**/b", False, ["a/b", "a/c/b"]),
|
||||
("*b", True, ["ab"]),
|
||||
("**/b", True, ["a/b"]),
|
||||
("a/", True, ["a", "a/b", "a/b/c"])
|
||||
("*b", True, ["a/b"]),
|
||||
("**/b", False, ["a/b"]),
|
||||
("a/", True, ["a"]),
|
||||
("a[/]b", True, []),
|
||||
("**/b", False, ["a/c/b"]),
|
||||
("a?c", True, ["abc"]),
|
||||
("a[^b]c", True, ["acc"]),
|
||||
("a[b-c]c", True, ["abc", "acc"]),
|
||||
("a[^]c", True, ["ac"]), # This is probably wrong
|
||||
("a[^]c", True, ["ac"]), # This is probably wrong
|
||||
]
|
||||
|
||||
mismatch_data = [
|
||||
("foo", False, ["foob", "afoo"]),
|
||||
("*.a", False, ["a", "foo:a", "a.a/foo"]),
|
||||
("*.py[co]", False, ["a.pyd", "pyo"]),
|
||||
("/*.c", False, ["a/b.c"]),
|
||||
("*b", True, ["a/b"]),
|
||||
("**b", True, ["a/b"]),
|
||||
("a[/]b", True, ["a/b"]),
|
||||
("**/b", True, ["a/c/b"]),
|
||||
("a", True, ["ab"])
|
||||
("foo", True, ["foob", "afoo"]),
|
||||
("*.a", True, ["a", "foo:a", "a.a/foo"]),
|
||||
("*.py[co]", True, ["a.pyd", "pyo", "a.py"]),
|
||||
("a", True, ["ab"]),
|
||||
("a?c", True, ["ac", "abbc"]),
|
||||
("a[^b]c", True, ["abc"]),
|
||||
("a[b-c]c", True, ["adc"]),
|
||||
]
|
||||
|
||||
invalid_data = [
|
||||
"[a",
|
||||
"***/foo",
|
||||
"a\\",
|
||||
"**b",
|
||||
"b**/",
|
||||
"[[]"
|
||||
]
|
||||
|
||||
filter_data = [
|
||||
("foo", True),
|
||||
("a", False),
|
||||
("a/b", False),
|
||||
("a/c", True),
|
||||
("a/c/", False),
|
||||
("c/b", True)
|
||||
(["foo", "bar/", "/a", "*.py"],
|
||||
[("", ["foo", "bar", "baz"], ["a"]),
|
||||
("baz", ["a"], ["foo", "bar"])],
|
||||
[(["baz"], []),
|
||||
(["a"], ["bar"])]),
|
||||
(["#foo", "", "a*", "!a.py"],
|
||||
[("", ["foo"], ["a", "a.foo", "a.py"])],
|
||||
[(["foo"], ["a.py"])]),
|
||||
]
|
||||
|
||||
|
||||
def expand_data(compact_data):
|
||||
for pattern, path_name, inputs in compact_data:
|
||||
for pattern, name_only, inputs in compact_data:
|
||||
for input in inputs:
|
||||
yield pattern, input, path_name
|
||||
yield pattern, name_only, input
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pattern, input, path_name", expand_data(match_data))
|
||||
def tests_match(pattern, input, path_name):
|
||||
regexp = fnmatch_translate(pattern, path_name)
|
||||
@pytest.mark.parametrize("pattern, name_only, input", expand_data(match_data))
|
||||
def tests_match(pattern, name_only, input):
|
||||
name_only_result, regexp = fnmatch_translate(pattern)
|
||||
assert name_only_result == name_only
|
||||
if name_only:
|
||||
input = input.rsplit("/", 1)[-1]
|
||||
assert regexp.match(input) is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pattern, input, path_name", expand_data(mismatch_data))
|
||||
def tests_no_match(pattern, input, path_name):
|
||||
regexp = fnmatch_translate(pattern, path_name)
|
||||
@pytest.mark.parametrize("pattern, name_only, input", expand_data(mismatch_data))
|
||||
def tests_no_match(pattern, name_only, input):
|
||||
name_only_result, regexp = fnmatch_translate(pattern)
|
||||
assert name_only_result == name_only
|
||||
if name_only:
|
||||
input = input.rsplit("/", 1)[-1]
|
||||
assert regexp.match(input) is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pattern", invalid_data)
|
||||
def tests_invalid(pattern):
|
||||
with pytest.raises(ValueError):
|
||||
fnmatch_translate(pattern, False)
|
||||
with pytest.raises(ValueError):
|
||||
fnmatch_translate(pattern, True)
|
||||
fnmatch_translate(pattern)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("path, expected", filter_data)
|
||||
def test_path_filter(path, expected):
|
||||
extras = [
|
||||
"#foo",
|
||||
"a ",
|
||||
"**/b",
|
||||
"a/c/",
|
||||
"!c/b",
|
||||
]
|
||||
f = PathFilter(None, extras)
|
||||
assert f(path) == expected
|
||||
@pytest.mark.parametrize("rules, input, expected", filter_data)
|
||||
def test_path_filter(rules, input, expected):
|
||||
f = PathFilter(None, rules)
|
||||
# Add some fake stat data
|
||||
for i, item in enumerate(input):
|
||||
repl = [input[i][0]]
|
||||
for j in [1, 2]:
|
||||
repl.append([(name, None) for name in input[i][j]])
|
||||
input[i] = tuple(repl)
|
||||
|
||||
for i, output in enumerate(f(input)):
|
||||
assert output[0] == input[i][0]
|
||||
for j in [1, 2]:
|
||||
assert [item[0] for item in output[j]] == expected[i][j-1]
|
||||
|
Loading…
Reference in New Issue
Block a user