#!/usr/bin/env python3 # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. import os import re import subprocess import sys import pygit2 import hglib DEBUG = False def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) def debugprint(*args, **kwargs): if DEBUG: eprint(*args, **kwargs) class HgCommit: def __init__(self, parent1, parent2): self.parents = [] if parent1 == NULL_PARENT_REV: raise Exception("Encountered a hg changeset with no parents! We don't handle this....") self.parents.append(parent1) if parent2 != NULL_PARENT_REV: self.parents.append(parent2) self.touches_sync_code = False self.children = [] def add_child(self, rev): self.children.append(rev) class GitCommit: def __init__(self, hg_rev, commit_obj): self.hg_rev = hg_rev self.commit_obj = commit_obj def load_git_repository(): commit_map = dict() # First, scan the tags for "mozilla-xxx" that keep track of manually synchronized changes sync_tags = filter( lambda ref: ref.startswith('refs/tags/mozilla-'), list(downstream_git_repo.references)) for desc in sync_tags: commit = downstream_git_repo.lookup_reference(desc).peel() # cut out the revision hash from the output hg_rev = desc[18:] commit_map[hg_rev] = GitCommit(hg_rev, commit) debugprint("Loaded pre-existing tag hg %s -> git %s" % (hg_rev, commit.oid)) # Next, scan the commits for a specific message format re_commitmsg = re.compile( r"^\[(ghsync|wrupdater)\] From https://hg.mozilla.org/mozilla-central/rev/([0-9a-fA-F]+)$", re.MULTILINE) for commit in downstream_git_repo.walk(downstream_git_repo.head.target): m = re_commitmsg.search(commit.message) if not m: continue hg_rev = m.group(2) commit_map[hg_rev] = GitCommit(hg_rev, commit) debugprint("Loaded pre-existing commit hg %s -> git %s" % (hg_rev, commit.oid)) return commit_map def timeof(git_commit): return git_commit.commit_obj.commit_time + git_commit.commit_obj.commit_time_offset def find_newest_commit(commit_map): newest_hg_rev = None newest_commit_time = None for hg_rev, git_commit in commit_map.items(): if newest_hg_rev is None or timeof(git_commit) > newest_commit_time: newest_hg_rev = hg_rev newest_commit_time = timeof(git_commit) return newest_hg_rev def get_single_rev(revset): output = subprocess.check_output(['hg', 'log', '-r', revset, '--template', '{node}']) output = str(output, "ascii") return output def get_multiple_revs(revset, template): output = subprocess.check_output(['hg', 'log', '-r', revset, '--template', template + '\\n']) for line in output.splitlines(): yield str(line, "ascii") def get_base_hg_rev(commit_map): base_hg_rev = find_newest_commit(commit_map) eprint("Using %s as base hg revision" % base_hg_rev) return base_hg_rev def load_hg_commits(commits, query): for cset in get_multiple_revs(query, '{node} {p1node} {p2node}'): tokens = cset.split() commits[tokens[0]] = HgCommit(tokens[1], tokens[2]) return commits def get_real_base_hg_rev(hg_data, commit_map): # Some of the HG commits we want to port to github may have landed on codelines # that branched off central prior to base_hg_rev. So when we create the git # equivalents, they will have parents that are not the HEAD of the git repo, # but instead will be descendants of older commits in the git repo. In order # to do this correctly, we need to find the hg-equivalents of all of those # possible git parents. So first we identify all the "tail" hg revisions in # our hg_data set (think "tail" as in opposite of "head" which is the tipmost # commit). The "tail" hg revisions are the ones for which we don't have their # ancestors in hg_data. tails = [] for (rev, cset) in hg_data.items(): for parent in cset.parents: if parent not in hg_data: tails.append(rev) eprint("Found hg tail revisions %s" % tails) # Then we find their common ancestor, which will be some ancestor of base_hg_rev # from which those codelines. if len(tails) == 0: common_ancestor = get_single_rev('.') else: common_ancestor = get_single_rev('ancestor(' + ','.join(tails) + ')') eprint("Found common ancestor of tail revisions: %s" % common_ancestor) # And then we find the newest git commit whose hg-equivalent is an ancestor of # that common ancestor, to make sure we are starting from a known hg/git # commit pair. for git_commit in sorted(commit_map.values(), key=timeof, reverse=True): new_base = get_single_rev('ancestor(' + common_ancestor + ',' + git_commit.hg_rev + ')') if new_base == common_ancestor: eprint( "Pre-existing git commit %s from hg rev %s is descendant of common ancestor; %s" % (git_commit.commit_obj.id, git_commit.hg_rev, "walking back further...")) continue if new_base != git_commit.hg_rev: eprint( "Pre-existing git commit %s from hg rev %s is on sibling branch" " of common ancestor; %s" % (git_commit.commit_obj.id, git_commit.hg_rev, "walking back further...")) continue eprint( "Pre-existing git commit %s from hg rev %s is sufficiently old; stopping walk" % (git_commit.commit_obj.id, git_commit.hg_rev)) common_ancestor = new_base break return common_ancestor # Now we prune out all the uninteresting changesets from hg_commits. The # uninteresting ones are ones that don't touch the target code, are not merges, # and are not referenced by mozilla tags in the git repo. # We do this by rewriting the parents to the "interesting" ancestor. def prune_boring(rev): while rev in hg_commits: parent_pruned = False for i in range(len(hg_commits[rev].parents)): parent_rev = hg_commits[rev].parents[i] if parent_rev not in hg_commits: continue if hg_commits[parent_rev].touches_sync_code: continue if len(hg_commits[parent_rev].parents) > 1: continue if parent_rev in hg_to_git_commit_map: continue # If we get here, then `parent_rev` is a boring revision and we can # prune it. Connect `rev` to its grandparent, and prune the parent grandparent_rev = hg_commits[parent_rev].parents[0] hg_commits[rev].parents[i] = grandparent_rev # eprint("Pruned %s as boring parent of %s, using %s now" % # (parent_rev, rev, grandparent_rev)) parent_pruned = True if parent_pruned: # If we pruned a parent, process `rev` again as we might want to # prune more parents continue # If we get here, all of `rev`s parents are interesting, so we can't # prune them. Move up to the parent rev and start processing that, or # if we have multiple parents then recurse on those nodes. if len(hg_commits[rev].parents) == 1: rev = hg_commits[rev].parents[0] continue for parent_rev in hg_commits[rev].parents: prune_boring(parent_rev) return class FakeCommit: def __init__(self, oid): self.oid = oid def fake_commit(hg_rev, parent1, parent2): if parent1 is None: eprint("ERROR: Trying to build on None") exit(1) oid = "githash_%s" % hash(parent1) eprint("Fake-built %s" % oid) return FakeCommit(oid) def build_tree(builder, treedata): for (name, value) in treedata.items(): if isinstance(value, dict): subbuilder = downstream_git_repo.TreeBuilder() build_tree(subbuilder, value) builder.insert(name, subbuilder.write(), pygit2.GIT_FILEMODE_TREE) else: (filemode, contents) = value blob_oid = downstream_git_repo.create_blob(contents) builder.insert(name, blob_oid, filemode) def author_to_signature(author): pieces = author.strip().split('<') if len(pieces) != 2 or pieces[1][-1] != '>': # We could probably handle this better return pygit2.Signature(author, '') name = pieces[0].strip() email = pieces[1][:-1].strip() return pygit2.Signature(name, email) def real_commit(hg_rev, parent1, parent2): filetree = dict() manifest = mozilla_hg_repo.manifest(rev=hg_rev) for (nodeid, permission, executable, symlink, filename) in manifest: if not filename.startswith(relative_path.encode('utf-8')): continue if symlink: filemode = pygit2.GIT_FILEMODE_LINK elif executable: filemode = pygit2.GIT_FILEMODE_BLOB_EXECUTABLE else: filemode = pygit2.GIT_FILEMODE_BLOB filecontent = mozilla_hg_repo.cat([filename], rev=hg_rev) subtree = filetree for component in filename.split(b'/')[2:-1]: subtree = subtree.setdefault(component.decode("latin-1"), dict()) filename = filename.split(b'/')[-1] subtree[filename.decode("latin-1")] = (filemode, filecontent) builder = downstream_git_repo.TreeBuilder() build_tree(builder, filetree) tree_oid = builder.write() parent1_obj = downstream_git_repo.get(parent1) if parent1_obj.tree_id == tree_oid: eprint("Early-exit; tree matched that of parent git commit %s" % parent1) return parent1_obj if parent2 is not None: parent2_obj = downstream_git_repo.get(parent2) if parent2_obj.tree_id == tree_oid: eprint("Early-exit; tree matched that of parent git commit %s" % parent2) return parent2_obj hg_rev_obj = mozilla_hg_repo.log(revrange=hg_rev, limit=1)[0] commit_author = hg_rev_obj[4].decode("latin-1") commit_message = hg_rev_obj[5].decode("latin-1") commit_message += '\n\n[ghsync] From https://hg.mozilla.org/mozilla-central/rev/%s'\ % hg_rev + '\n' parents = [parent1] if parent2 is not None: parents.append(parent2) commit_oid = downstream_git_repo.create_commit( None, author_to_signature(commit_author), # TODO(kats): use a more appropriate email pygit2.Signature('github-sync', 'graphics-team@mozilla.staktrace.com'), commit_message, tree_oid, parents, ) eprint("Built git commit %s" % commit_oid) return downstream_git_repo.get(commit_oid) def try_commit(hg_rev, parent1, parent2=None): if False: return fake_commit(hg_rev, parent1, parent2) else: return real_commit(hg_rev, parent1, parent2) def build_git_commits(rev): debugprint("build_git_commit(%s)..." % rev) if rev in hg_to_git_commit_map: debugprint(" maps to %s" % hg_to_git_commit_map[rev].commit_obj.oid) return hg_to_git_commit_map[rev].commit_obj.oid if rev not in hg_commits: debugprint(" not in hg_commits") return None if len(hg_commits[rev].parents) == 1: git_parent = build_git_commits(hg_commits[rev].parents[0]) if not hg_commits[rev].touches_sync_code: eprint("WARNING: Found rev %s that is non-merge and not related to the target" % rev) return git_parent eprint("Building git equivalent for %s on top of %s" % (rev, git_parent)) commit_obj = try_commit(rev, git_parent) hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj) debugprint(" built %s as %s" % (rev, commit_obj.oid)) return commit_obj.oid git_parent_1 = build_git_commits(hg_commits[rev].parents[0]) git_parent_2 = build_git_commits(hg_commits[rev].parents[1]) if git_parent_1 is None or git_parent_2 is None or git_parent_1 == git_parent_2: git_parent = git_parent_1 if git_parent_2 is None else git_parent_2 if not hg_commits[rev].touches_sync_code: debugprint( " %s is merge with no parents or doesn't touch WR, returning %s" % (rev, git_parent)) return git_parent eprint( "WARNING: Found merge rev %s whose parents have identical target code" ", but modifies the target" % rev) eprint("Building git equivalent for %s on top of %s" % (rev, git_parent)) commit_obj = try_commit(rev, git_parent) hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj) debugprint(" built %s as %s" % (rev, commit_obj.oid)) return commit_obj.oid # An actual merge eprint("Building git equivalent for %s on top of %s, %s" % (rev, git_parent_1, git_parent_2)) commit_obj = try_commit(rev, git_parent_1, git_parent_2) hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj) debugprint(" built %s as %s" % (rev, commit_obj.oid)) return commit_obj.oid def pretty_print(rev, cset): desc = " %s" % rev desc += " parents: %s" % cset.parents if rev in hg_to_git_commit_map: desc += " git: %s" % hg_to_git_commit_map[rev].commit_obj.oid if rev == hg_tip: desc += " (tip)" return desc if len(sys.argv) < 3: eprint("Usage: %s " % sys.argv[0]) eprint("Current dir must be the mozilla hg repo") exit(1) local_checkout_path = sys.argv[1] relative_path = sys.argv[2] mozilla_hg_path = os.getcwd() NULL_PARENT_REV = '0000000000000000000000000000000000000000' downstream_git_repo = pygit2.Repository(pygit2.discover_repository(local_checkout_path)) mozilla_hg_repo = hglib.open(mozilla_hg_path) hg_to_git_commit_map = load_git_repository() base_hg_rev = get_base_hg_rev(hg_to_git_commit_map) if base_hg_rev is None: eprint("Found no sync commits or 'mozilla-xxx' tags") exit(1) hg_commits = load_hg_commits(dict(), 'only(.,' + base_hg_rev + ')') eprint("Initial set has %s changesets" % len(hg_commits)) base_hg_rev = get_real_base_hg_rev(hg_commits, hg_to_git_commit_map) eprint("Using hg rev %s as common ancestor of all interesting changesets" % base_hg_rev) # Refresh hg_commits with our wider dataset hg_tip = get_single_rev('.') wider_range = "%s::%s" % (base_hg_rev, hg_tip) hg_commits = load_hg_commits(hg_commits, wider_range) eprint("Updated set has %s changesets" % len(hg_commits)) if DEBUG: eprint("Graph of descendants of %s" % base_hg_rev) output = subprocess.check_output( ['hg', 'log', '--graph', '-r', 'descendants(' + base_hg_rev + ')', '--template', '{node} {desc|firstline}\\n']) for line in output.splitlines(): eprint(line.decode('utf-8', 'ignore')) # Also flag any changes that touch the project query = '(' + wider_range + ') & file("glob:' + relative_path + '/**")' for cset in get_multiple_revs(query, '{node}'): debugprint("Changeset %s modifies %s" % (cset, relative_path)) hg_commits[cset].touches_sync_code = True eprint( "Identified %s changesets that touch the target code" % sum([1 if v.touches_sync_code else 0 for (k, v) in hg_commits.items()])) prune_boring(hg_tip) # hg_tip itself might be boring if not hg_commits[hg_tip].touches_sync_code and len(hg_commits[hg_tip].parents) == 1: new_tip = hg_commits[hg_tip].parents[0] eprint("Pruned tip %s as boring, using %s now" % (hg_tip, new_tip)) hg_tip = new_tip eprint("--- Interesting changesets ---") for (rev, cset) in hg_commits.items(): if cset.touches_sync_code or len(cset.parents) > 1 or rev in hg_to_git_commit_map: eprint(pretty_print(rev, cset)) if DEBUG: eprint("--- Other changesets (not really interesting) ---") for (rev, cset) in hg_commits.items(): if not (cset.touches_sync_code or len(cset.parents) > 1 or rev in hg_to_git_commit_map): eprint(pretty_print(rev, cset)) git_tip = build_git_commits(hg_tip) if git_tip is None: eprint("No new changesets generated, exiting.") else: downstream_git_repo.create_reference('refs/heads/github-sync', git_tip, force=True) eprint("Updated github-sync branch to %s, done!" % git_tip)