gecko-dev/tools/github-sync/converter.py
Kartikaya Gupta 447162a66b Bug 1635809 - Use |file| instead of |modifies| in the hg revset query to catch file deletions. r=kvark
Closer look at the man pages shows that the file() query will catch any file
changes, whereas the modifies() query will only capture files that were
modified, which excludes strict deletions.

Depends on D74065

Differential Revision: https://phabricator.services.mozilla.com/D74066
2020-05-06 16:09:23 +00:00

436 lines
16 KiB
Python
Executable File

#!/usr/bin/env python3
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import os
import re
import subprocess
import sys
import pygit2
import hglib
# Set to True to enable debugprint() output and the extra DEBUG-only dumps
# (hg graph, list of uninteresting changesets) printed by the main script.
DEBUG = False
def eprint(*args, **kwargs):
    """Write a ``print``-style message to standard error.

    Accepts the same positional and keyword arguments as ``print`` except
    ``file``, which is always ``sys.stderr``.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)
def debugprint(*args, **kwargs):
    """Forward the arguments to ``eprint`` only when ``DEBUG`` is truthy."""
    if not DEBUG:
        return
    eprint(*args, **kwargs)
class HgCommit:
    """Graph node for one Mercurial changeset.

    Only the information the converter needs is kept: the parent hashes,
    whether the changeset touches the synced directory, and a list of child
    revisions.
    """

    def __init__(self, parent1, parent2):
        """Record the parents of a changeset.

        parent1 and parent2 are compared against NULL_PARENT_REV; a null
        second parent is simply omitted, while a null first parent is
        rejected with an Exception.
        """
        if parent1 == NULL_PARENT_REV:
            raise Exception("Encountered a hg changeset with no parents! We don't handle this....")
        # One entry for a regular changeset, two for a merge.
        if parent2 != NULL_PARENT_REV:
            self.parents = [parent1, parent2]
        else:
            self.parents = [parent1]
        # Flipped to True by the main script for changesets matched by the
        # file() revset query on the synced path.
        self.touches_sync_code = False
        self.children = []

    def add_child(self, rev):
        """Append ``rev`` to this changeset's list of children."""
        self.children.append(rev)
class GitCommit:
    """Pairs a Mercurial revision hash with its equivalent git commit object."""

    def __init__(self, hg_rev, commit_obj):
        """Store the hg revision hash and the corresponding git commit object."""
        self.commit_obj = commit_obj
        self.hg_rev = hg_rev
def load_git_repository():
    """Collect the hg -> git commit pairs already present in the git repo.

    Two sources are scanned in ``downstream_git_repo``:

    1. Tags named ``refs/tags/mozilla-<hg rev>``, which keep track of
       manually synchronized changes.
    2. Commits reachable from HEAD whose message contains a line of the form
       ``[ghsync] From https://hg.mozilla.org/mozilla-central/rev/<hash>``
       (``[wrupdater]`` is accepted as well).

    Returns a dict mapping hg revision hash to ``GitCommit``. Commits are
    scanned after tags, so a commit-message entry replaces a tag entry for
    the same hg revision.
    """
    tag_prefix = 'refs/tags/mozilla-'
    commit_map = dict()

    # First, scan the tags for "mozilla-xxx" that keep track of manually
    # synchronized changes. The reference names are snapshotted into a list
    # before any lookups are made.
    for ref_name in list(downstream_git_repo.references):
        if not ref_name.startswith(tag_prefix):
            continue
        commit = downstream_git_repo.lookup_reference(ref_name).peel()
        # Everything after the prefix is the hg revision hash.
        hg_rev = ref_name[len(tag_prefix):]
        commit_map[hg_rev] = GitCommit(hg_rev, commit)
        # The message arguments are evaluated even when DEBUG is off, so use
        # `.id` rather than pygit2's deprecated `.oid` alias.
        debugprint("Loaded pre-existing tag hg %s -> git %s" % (hg_rev, commit.id))

    # Next, scan the commits for a specific message format
    re_commitmsg = re.compile(
        r"^\[(ghsync|wrupdater)\] From https://hg.mozilla.org/mozilla-central/rev/([0-9a-fA-F]+)$",
        re.MULTILINE)
    for commit in downstream_git_repo.walk(downstream_git_repo.head.target):
        match = re_commitmsg.search(commit.message)
        if not match:
            continue
        hg_rev = match.group(2)
        commit_map[hg_rev] = GitCommit(hg_rev, commit)
        debugprint("Loaded pre-existing commit hg %s -> git %s" % (hg_rev, commit.id))
    return commit_map
def timeof(git_commit):
    """Return the timestamp used to order ``GitCommit`` objects by age.

    ``commit_time`` is already an absolute timestamp (seconds since the
    epoch), so it is directly comparable between commits.
    ``commit_time_offset`` is the committer's timezone offset in minutes; it
    only matters for displaying local time and must not be added in, or
    commits made close together in different timezones get mis-ordered.
    """
    return git_commit.commit_obj.commit_time
def find_newest_commit(commit_map):
    """Return the hg revision whose git commit has the largest ``timeof``.

    ``commit_map`` maps hg revision hashes to ``GitCommit`` objects. Returns
    None for an empty map. On ties the entry encountered first while
    iterating the map wins.
    """
    if not commit_map:
        return None
    newest_hg_rev, _ = max(commit_map.items(), key=lambda entry: timeof(entry[1]))
    return newest_hg_rev
def get_single_rev(revset):
    """Run ``hg log -r <revset>`` with a ``{node}`` template and return its output.

    The output is decoded as ASCII and returned unmodified; the command is
    run in the current working directory.
    """
    command = ['hg', 'log', '-r', revset, '--template', '{node}']
    return subprocess.check_output(command).decode("ascii")
def get_multiple_revs(revset, template):
    """Yield one ASCII-decoded line of ``hg log`` output per matching changeset.

    ``template`` is the hg log template for a single changeset; a literal
    backslash-n sequence is appended so that each changeset ends up on its
    own output line. This is a generator, so the hg command only runs once
    iteration starts.
    """
    command = ['hg', 'log', '-r', revset, '--template', template + '\\n']
    raw_output = subprocess.check_output(command)
    yield from (raw_line.decode("ascii") for raw_line in raw_output.splitlines())
def get_base_hg_rev(commit_map):
    """Pick the newest already-synced hg revision as the initial base.

    Logs the choice to stderr and returns it; the result is None when
    ``find_newest_commit`` finds nothing in ``commit_map``.
    """
    newest = find_newest_commit(commit_map)
    eprint("Using %s as base hg revision" % newest)
    return newest
def load_hg_commits(commits, query):
    """Add an ``HgCommit`` to ``commits`` for every changeset matching ``query``.

    ``commits`` is a dict keyed by hg node hash; it is updated in place
    (existing entries for matching nodes are replaced) and also returned.
    """
    for record in get_multiple_revs(query, '{node} {p1node} {p2node}'):
        node, first_parent, second_parent = record.split()[:3]
        commits[node] = HgCommit(first_parent, second_parent)
    return commits
def get_real_base_hg_rev(hg_data, commit_map):
    """Find an hg revision, old enough and already in git, to build upon.

    Some of the hg commits we want to port may have landed on codelines that
    branched off central prior to the initial base revision, so their git
    equivalents need parents that are older than the git HEAD. To cope with
    that:

    1. Identify the "tail" revisions of ``hg_data`` (the opposite of heads):
       revisions with a parent that is not itself in ``hg_data``. A revision
       is listed once per missing parent.
    2. Compute the common ancestor of the tails (or use ``.`` when there are
       none).
    3. Walk the known git commits from newest to oldest and stop at the first
       one whose hg revision is a strict ancestor of that common ancestor, so
       that we start from a known hg/git commit pair.

    Returns the hg revision found in step 3, or the common ancestor from
    step 2 if no git commit qualifies.
    """
    tails = [
        rev
        for rev, cset in hg_data.items()
        for parent in cset.parents
        if parent not in hg_data
    ]
    eprint("Found hg tail revisions %s" % tails)

    # The common ancestor of the tails is the point from which all of those
    # codelines branched.
    if tails:
        tail_revset = 'ancestor(' + ','.join(tails) + ')'
    else:
        tail_revset = '.'
    common_ancestor = get_single_rev(tail_revset)
    eprint("Found common ancestor of tail revisions: %s" % common_ancestor)

    newest_first = sorted(commit_map.values(), key=timeof, reverse=True)
    for candidate in newest_first:
        merge_base = get_single_rev(
            'ancestor(' + common_ancestor + ',' + candidate.hg_rev + ')')
        if merge_base == common_ancestor:
            # The candidate sits at or after the common ancestor: too new.
            eprint(
                "Pre-existing git commit %s from hg rev %s is descendant of common ancestor; %s" %
                (candidate.commit_obj.id, candidate.hg_rev, "walking back further..."))
        elif merge_base != candidate.hg_rev:
            # The candidate is neither ancestor nor descendant.
            eprint(
                "Pre-existing git commit %s from hg rev %s is on sibling branch"
                " of common ancestor; %s" %
                (candidate.commit_obj.id, candidate.hg_rev, "walking back further..."))
        else:
            eprint(
                "Pre-existing git commit %s from hg rev %s is sufficiently old; stopping walk" %
                (candidate.commit_obj.id, candidate.hg_rev))
            return merge_base
    return common_ancestor
def prune_boring(rev):
    """Splice uninteresting changesets out of the ``hg_commits`` graph.

    A changeset is uninteresting ("boring") when it does not touch the
    target code, is not a merge, and is not already mapped to a git commit
    in ``hg_to_git_commit_map``. Boring changesets are not deleted; instead
    the parent links of their descendants are rewritten in place to point at
    the nearest "interesting" ancestor.

    Starting at ``rev``, linear history is walked iteratively; at a merge
    the function recurses into every parent.
    """
    while rev in hg_commits:
        cset = hg_commits[rev]
        pruned_any = False
        for index, parent_rev in enumerate(cset.parents):
            if parent_rev not in hg_commits:
                continue
            parent = hg_commits[parent_rev]
            is_interesting = (
                parent.touches_sync_code
                or len(parent.parents) > 1
                or parent_rev in hg_to_git_commit_map
            )
            if is_interesting:
                continue
            # `parent_rev` is boring: connect `rev` straight to its
            # grandparent, skipping over the parent.
            cset.parents[index] = parent.parents[0]
            pruned_any = True

        if pruned_any:
            # The replacement parent may be boring too, so look at `rev`
            # again before moving on.
            continue

        # Every parent of `rev` is interesting. Step up to the parent, or
        # recurse into each parent if this is a merge.
        if len(cset.parents) == 1:
            rev = cset.parents[0]
            continue
        for parent_rev in cset.parents:
            prune_boring(parent_rev)
        return
class FakeCommit:
    """Minimal commit stand-in returned by ``fake_commit``; carries only ``oid``."""

    def __init__(self, oid):
        """Remember the made-up object id."""
        self.oid = oid
def fake_commit(hg_rev, parent1, parent2):
    """Produce a ``FakeCommit`` instead of writing anything to the git repo.

    The fake object id is derived from ``hash(parent1)``; ``hg_rev`` and
    ``parent2`` are accepted for signature parity with ``real_commit`` but
    are not used. Exits the process with status 1 if ``parent1`` is None.
    """
    if parent1 is None:
        eprint("ERROR: Trying to build on None")
        exit(1)
    placeholder = FakeCommit("githash_%s" % hash(parent1))
    eprint("Fake-built %s" % placeholder.oid)
    return placeholder
def build_tree(builder, treedata):
    """Populate a git tree builder from a nested dict describing a directory.

    ``treedata`` maps entry names either to another dict (a subdirectory,
    built recursively with a fresh TreeBuilder) or to a
    ``(filemode, contents)`` tuple (a file, stored as a new blob in
    ``downstream_git_repo``).
    """
    for name, entry in treedata.items():
        if not isinstance(entry, dict):
            filemode, contents = entry
            builder.insert(name, downstream_git_repo.create_blob(contents), filemode)
            continue
        child_builder = downstream_git_repo.TreeBuilder()
        build_tree(child_builder, entry)
        builder.insert(name, child_builder.write(), pygit2.GIT_FILEMODE_TREE)
def author_to_signature(author):
    """Convert an hg author string into a ``pygit2.Signature``.

    An author of the form ``Name <email>`` is split into its name and email
    parts (each stripped of surrounding whitespace). Anything else, including
    strings with no ``<``, more than one ``<``, or no closing ``>``, falls
    back to using the whole string as the name with an empty email.
    """
    pieces = author.strip().split('<')
    # endswith() rather than indexing [-1]: the part after '<' can be empty
    # (e.g. "Name <"), which must take the fallback path instead of raising
    # IndexError.
    if len(pieces) != 2 or not pieces[1].endswith('>'):
        # We could probably handle this better
        return pygit2.Signature(author, '')
    name = pieces[0].strip()
    email = pieces[1][:-1].strip()
    return pygit2.Signature(name, email)
def real_commit(hg_rev, parent1, parent2):
    """Create the git commit equivalent to ``hg_rev`` and return it.

    The git tree is rebuilt from scratch from the hg manifest at ``hg_rev``,
    keeping only files under the ``relative_path`` directory and storing them
    relative to that directory.

    ``parent1`` is the git id of the first parent; ``parent2`` is the git id
    of the second parent, or None for a non-merge.

    If the resulting tree is identical to that of one of the parents, no new
    commit is created and that parent's commit object is returned instead.
    Otherwise a new commit is written (without updating any reference) whose
    message is the hg description followed by a ``[ghsync] From ...`` line
    naming ``hg_rev``.
    """
    # Match on "<relative_path>/" so that sibling directories which merely
    # share the prefix (e.g. "gfx/wr_x" for "gfx/wr") are not pulled in, and
    # strip exactly that prefix so relative_path may have any depth.
    prefix = relative_path.rstrip('/').encode('utf-8') + b'/'

    filetree = dict()
    manifest = mozilla_hg_repo.manifest(rev=hg_rev)
    for (nodeid, permission, executable, symlink, filename) in manifest:
        if not filename.startswith(prefix):
            continue
        if symlink:
            filemode = pygit2.GIT_FILEMODE_LINK
        elif executable:
            filemode = pygit2.GIT_FILEMODE_BLOB_EXECUTABLE
        else:
            filemode = pygit2.GIT_FILEMODE_BLOB
        filecontent = mozilla_hg_repo.cat([filename], rev=hg_rev)

        # Walk/create the nested dicts for the directories below the prefix,
        # then store the file in the innermost one.
        *dir_components, basename = filename[len(prefix):].split(b'/')
        subtree = filetree
        for component in dir_components:
            subtree = subtree.setdefault(component.decode("latin-1"), dict())
        subtree[basename.decode("latin-1")] = (filemode, filecontent)

    builder = downstream_git_repo.TreeBuilder()
    build_tree(builder, filetree)
    tree_oid = builder.write()

    # If the synced directory is unchanged relative to a parent, reuse that
    # parent rather than creating an empty commit.
    parent1_obj = downstream_git_repo.get(parent1)
    if parent1_obj.tree_id == tree_oid:
        eprint("Early-exit; tree matched that of parent git commit %s" % parent1)
        return parent1_obj

    if parent2 is not None:
        parent2_obj = downstream_git_repo.get(parent2)
        if parent2_obj.tree_id == tree_oid:
            eprint("Early-exit; tree matched that of parent git commit %s" % parent2)
            return parent2_obj

    hg_rev_obj = mozilla_hg_repo.log(revrange=hg_rev, limit=1)[0]
    # Fields 4 and 5 of the log entry are used as author and description.
    commit_author = hg_rev_obj[4].decode("latin-1")
    commit_message = hg_rev_obj[5].decode("latin-1")
    # This trailer is what load_git_repository() later searches for.
    commit_message += (
        '\n\n[ghsync] From https://hg.mozilla.org/mozilla-central/rev/%s' % hg_rev + '\n')

    parents = [parent1]
    if parent2 is not None:
        parents.append(parent2)
    commit_oid = downstream_git_repo.create_commit(
        None,  # do not update any reference
        author_to_signature(commit_author),
        # TODO(kats): use a more appropriate email
        pygit2.Signature('github-sync', 'graphics-team@mozilla.staktrace.com'),
        commit_message,
        tree_oid,
        parents,
    )
    eprint("Built git commit %s" % commit_oid)
    return downstream_git_repo.get(commit_oid)
def try_commit(hg_rev, parent1, parent2=None):
    """Build the git commit for ``hg_rev`` on top of the given git parent(s).

    Always dispatches to ``real_commit``; the ``fake_commit`` branch is
    hard-wired off and is only reachable by editing the flag below.
    """
    use_fake = False
    build = fake_commit if use_fake else real_commit
    return build(hg_rev, parent1, parent2)
def _commit_and_record(rev, *git_parents):
    """Build the git commit for ``rev``, remember it in the map, return its oid."""
    commit_obj = try_commit(rev, *git_parents)
    hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj)
    debugprint(" built %s as %s" % (rev, commit_obj.oid))
    return commit_obj.oid


def build_git_commits(rev):
    """Recursively create git commits for ``rev`` and its unsynced ancestors.

    Returns the git oid equivalent to ``rev``: the already-known one if
    ``rev`` is in ``hg_to_git_commit_map``, None if ``rev`` is not in
    ``hg_commits``, the parent's oid when ``rev`` itself contributes nothing,
    or the oid of a newly built commit. Newly built commits are recorded in
    ``hg_to_git_commit_map``.
    """
    debugprint("build_git_commit(%s)..." % rev)
    if rev in hg_to_git_commit_map:
        known_oid = hg_to_git_commit_map[rev].commit_obj.oid
        debugprint(" maps to %s" % known_oid)
        return known_oid

    if rev not in hg_commits:
        debugprint(" not in hg_commits")
        return None

    cset = hg_commits[rev]

    # Regular (single-parent) changeset.
    if len(cset.parents) == 1:
        git_parent = build_git_commits(cset.parents[0])
        if not cset.touches_sync_code:
            eprint("WARNING: Found rev %s that is non-merge and not related to the target" % rev)
            return git_parent
        eprint("Building git equivalent for %s on top of %s" % (rev, git_parent))
        return _commit_and_record(rev, git_parent)

    # Merge changeset: resolve both sides first.
    git_parent_1 = build_git_commits(cset.parents[0])
    git_parent_2 = build_git_commits(cset.parents[1])

    if git_parent_1 is None or git_parent_2 is None or git_parent_1 == git_parent_2:
        # Both sides collapse to (at most) one git commit, so there is
        # nothing to merge on the git side.
        if git_parent_2 is None:
            git_parent = git_parent_1
        else:
            git_parent = git_parent_2
        if not cset.touches_sync_code:
            debugprint(
                " %s is merge with no parents or doesn't touch WR, returning %s"
                % (rev, git_parent))
            return git_parent

        eprint(
            "WARNING: Found merge rev %s whose parents have identical target code"
            ", but modifies the target" % rev)
        eprint("Building git equivalent for %s on top of %s" % (rev, git_parent))
        return _commit_and_record(rev, git_parent)

    # An actual merge
    eprint("Building git equivalent for %s on top of %s, %s" % (rev, git_parent_1, git_parent_2))
    return _commit_and_record(rev, git_parent_1, git_parent_2)
def pretty_print(rev, cset):
    """Return a one-line description of a changeset for the log output.

    The line contains the revision, its (possibly rewritten) parents, the
    mapped git oid if ``rev`` is in ``hg_to_git_commit_map``, and a ``(tip)``
    marker if ``rev`` equals ``hg_tip``.
    """
    parts = [" %s" % rev, " parents: %s" % cset.parents]
    if rev in hg_to_git_commit_map:
        parts.append(" git: %s" % hg_to_git_commit_map[rev].commit_obj.oid)
    if rev == hg_tip:
        parts.append(" (tip)")
    return "".join(parts)
# ---------------------------------------------------------------------------
# Main script. Must be run with the mozilla hg repo as the current directory.
# The names assigned here (NULL_PARENT_REV, relative_path,
# downstream_git_repo, mozilla_hg_repo, hg_to_git_commit_map, hg_commits,
# hg_tip) are module globals that the functions above read.
# ---------------------------------------------------------------------------
if len(sys.argv) < 3:
    eprint("Usage: %s <local-checkout-path> <repo-relative-path>" % sys.argv[0])
    eprint("Current dir must be the mozilla hg repo")
    # sys.exit() rather than the site-provided exit(), which is intended for
    # interactive use and is not guaranteed to be defined.
    sys.exit(1)

local_checkout_path = sys.argv[1]
relative_path = sys.argv[2]
mozilla_hg_path = os.getcwd()
NULL_PARENT_REV = '0000000000000000000000000000000000000000'

downstream_git_repo = pygit2.Repository(pygit2.discover_repository(local_checkout_path))
mozilla_hg_repo = hglib.open(mozilla_hg_path)
hg_to_git_commit_map = load_git_repository()
base_hg_rev = get_base_hg_rev(hg_to_git_commit_map)
if base_hg_rev is None:
    eprint("Found no sync commits or 'mozilla-xxx' tags")
    sys.exit(1)

# Start with the changesets that are ancestors of the working-directory
# parent but not of the newest already-synced revision.
hg_commits = load_hg_commits(dict(), 'only(.,' + base_hg_rev + ')')
eprint("Initial set has %s changesets" % len(hg_commits))
base_hg_rev = get_real_base_hg_rev(hg_commits, hg_to_git_commit_map)
eprint("Using hg rev %s as common ancestor of all interesting changesets" % base_hg_rev)

# Refresh hg_commits with our wider dataset
hg_tip = get_single_rev('.')
wider_range = "%s::%s" % (base_hg_rev, hg_tip)
hg_commits = load_hg_commits(hg_commits, wider_range)
eprint("Updated set has %s changesets" % len(hg_commits))

if DEBUG:
    eprint("Graph of descendants of %s" % base_hg_rev)
    output = subprocess.check_output(
        ['hg', 'log', '--graph',
         '-r', 'descendants(' + base_hg_rev + ')',
         '--template', '{node} {desc|firstline}\\n'])
    for line in output.splitlines():
        eprint(line.decode('utf-8', 'ignore'))

# Also flag any changes that touch the project. file() is used so that
# changesets which only delete files are matched as well.
query = '(' + wider_range + ') & file("glob:' + relative_path + '/**")'
for cset in get_multiple_revs(query, '{node}'):
    debugprint("Changeset %s modifies %s" % (cset, relative_path))
    hg_commits[cset].touches_sync_code = True
eprint(
    "Identified %s changesets that touch the target code" %
    sum(1 for commit in hg_commits.values() if commit.touches_sync_code))

prune_boring(hg_tip)

# hg_tip itself might be boring
if not hg_commits[hg_tip].touches_sync_code and len(hg_commits[hg_tip].parents) == 1:
    new_tip = hg_commits[hg_tip].parents[0]
    eprint("Pruned tip %s as boring, using %s now" % (hg_tip, new_tip))
    hg_tip = new_tip

eprint("--- Interesting changesets ---")
for (rev, cset) in hg_commits.items():
    if cset.touches_sync_code or len(cset.parents) > 1 or rev in hg_to_git_commit_map:
        eprint(pretty_print(rev, cset))
if DEBUG:
    eprint("--- Other changesets (not really interesting) ---")
    for (rev, cset) in hg_commits.items():
        if not (cset.touches_sync_code or len(cset.parents) > 1 or rev in hg_to_git_commit_map):
            eprint(pretty_print(rev, cset))

git_tip = build_git_commits(hg_tip)
if git_tip is None:
    eprint("No new changesets generated, exiting.")
else:
    # Point (or re-point) the github-sync branch at the newest converted commit.
    downstream_git_repo.create_reference('refs/heads/github-sync', git_tip, force=True)
    eprint("Updated github-sync branch to %s, done!" % git_tip)