#! /usr/bin/env python3.3 """ Given a list of Git/Perforce branch associations and a list of Git commits, identify which branch of Perforce file hierarchy should receive each Git commit. UML diagram at doc/p4gf_branch_id.uml.pdf 1. Order pushed branches into a sequence: 1. master-ish 2. named fully populated 3. named lightweight 2. For each named branch in the push, in the above sequence, O(m) pushed branches assign commits to that branch: x 2.1. If named branch existed pre-push Perform a tree walk, rooted at old ref location, O(m) x O(n) commits calculating which children can reach the old ref (set a bool per commit) Later, when calculating this branch's path from old to new ref, we'll consider only those commits that can reach the old ref location. If named branch did not exist pre-push, then it doesn't matter what path we choose. Don't bother calculating reachability, don't check for it later when calculating path. 2.2. Starting at new branch ref head and working back O(m) x O(n) commits in branch path through parent links to the start of this branch, which is either * old ref location if one exists, or * any commit already copied to Perforce if this is a new branch ref Choose which parent to follow as first of: 1. unassigned first-parent *reachable 2. unassigned any parent *reachable 3. assigned first-parent *reachable 4. assigned any parent *reachable If we had an old ref location, then consider only parent commits that can read the old ref location (calculated in 2.1 above) If this commit is not yet assigned, assign to this branch. If commit already assigned, honor the preivous assignment. At this point, all *pushed* branches now have complete paths to their roots. Paths will often overlap. Yes this results in a named branch likely picking up completely unrelated and different commits than what the original author intended. We're just trying to reuse existing branches here, not recreate unknowable intentions. 3. At this point, there are usually still many commits with no branch assignments. Assign them to anonymous branches. Iterate through the 'git rev-list --date-order' list, O(n) commits from newest/childmost to oldest/parentmost order. For each commit with no branch assignment O(p) new anonymous branches 3.1. Reuse existing, or create new anonymous lightweight branch for this commit 3.2. Repeat 2.2: assign this anonymous branch to O(p) x O(n) commits in branch path this commit and a single chain of parents back to the start of pushed history. Yes this results in a single anonymous branch that strangely re-appears over and over again for distant and unrelated islands of previously unassigned commits. That's okay. Better one branch, re-appearing 5 times, than 5 branches, each with only one changelist. Either way requires the same number of file branch/merge/integ records, but this way creates fewer branches of Perforce depot hiearchy: less to render in P4V's Revision Graph, less to track in //P4GF_DEPOT/branches/... . Now every commit has one branch assignment. 4. For each pushed branch head O(m) pushed branches If that head's commit is not assigned to this branch add this branch as a second-or-later assignment. So that when Git Fusion receives a push with multiple heads on the same commit, later code will know to repeat that commit on each of the branches, leaving each Perforce depot branch's #head state matching Git's head commit state. 5. Thin the memory footprint. O(n) commits Discard anything we no longer need. Replace our larger Assign instances with much smaller AssignFrozen instances """ import configparser import logging import os import re import p4gf_env_config # pylint:disable=unused-import import p4gf_branch import p4gf_const import p4gf_histogram from p4gf_l10n import _, NTR import p4gf_log from p4gf_object_type import ObjectType import p4gf_proc from p4gf_profiler import Timer import p4gf_progress_reporter as ProgressReporter import p4gf_util # cannot use __name__ since it will often be "__main__" LOG = logging.getLogger("p4gf_branch_id") LOG_GRAPH = LOG.getChild('graph') LOG_TIME = LOG.getChild('time') # Set to None to disable very noisy dump of log. _DUMP_LOG = (NTR('git'), NTR('log'), NTR('--graph'), NTR('--no-color'), NTR('--format=%H %s')) TUNNEL_UNLIMITED = -1 class Assigner: """Group all the related data in a single object we can pass around and call. New for first-push use, set all before calling assign(): connect_to_previous_branch_sha1 tunnel_max_ct tunnel_assign """ def __init__(self , branch_dict , pre_receive_list , ctx = None): self.ctx = ctx # Original input: list of PreReceiveTuple elements. self.pre_receive_list = pre_receive_list # branch_id to Branch object of all branches. Starts out with just the # branches from config, but grows as we add new Branch instances for # newly pushed branches, or newly discovered anonymous branches. self.branch_dict = branch_dict # List of sha1, one for each Git commit being pushed. Order MUST # be in topological order with the newest/child commits before # all of their older/parent commits. self.rev_list = [] # All these behavior controls tend to be set either all to True or all # to False, True for normal operation, False for Fast Push. We could # probably offer a less noisy API. Zig has a mild preference for # precise APIs that don't conflate. # Behavior control: check incoming commits to see if they're # already assigned to some existing branch? self.assign_previous = True # Behavior control: set to False to ignore current branch references in # the repo. When using the Assigner outside of 'git push' pre-receive- # hook time, you usually want to drag each new branch head all the way # back to the beginning of time, not back to its current location # (which is almost always the same as prt.new_sha1). self.connect_to_previous_branch_sha1 = True # Behavior control: flatten our memory-expensive Assign elements down # to much cheaper AssignFrozen elements? Skip if you plan on discarding # our data anyway, and save yourself 2% of our assign() time. self.flatten_memory = True # Behavior control: how many commits in a row is a branch allowed to # share with other branches before this branch should give up and # terminate? Leave unlimited for maximal branch re-use, minimal branch # count. self.tunnel_max_ct = TUNNEL_UNLIMITED # Tracking any current tunnelling through shared commits. # Tracked only when tunnel_max_ct is not TUNNEL_UNLIMITED. self.tunnel_list = list() # Behavior control: actually assign while tunnelling? Causes more # commits to be submitted to multiple branches, which is exactly what # we want for first-push. self.tunnel_assign = False # sha1 to Assign object, one for every pushed commit. self.assign_dict = {} # Instrumentation: How long are our branches? self.branch_len = {} # Record whether anonymous branches exist self.have_anonymous_branches = False if LOG.isEnabledFor(logging.DEBUG2): for i, prt in enumerate(pre_receive_list): LOG.debug2('PRT[{}] {}'.format(i, prt)) def assign(self): """Main entry point. Assign a branch ID to every rev in our rev_list.""" with Timer(TIMER_OVERALL): self._load_commit_dag() self._assign_to_loaded_dag() self._dump_instrumentation() def _assign_to_loaded_dag(self): """Run the assigner over our commit DAG.""" # Zig thinks this _add_assign_for_ref_heads() no longer necessary. with Timer(TIMER_BRANCH_HEAD): self._add_assign_for_ref_heads() with Timer(TIMER_ASSIGN_PREVIOUS): self._assign_previous() with Timer(TIMER_ASSIGN_BRANCH_NAMED): self._assign_branches_named() with Timer(TIMER_ASSIGN_BRANCH_ANON): self._assign_branches_anon() with Timer(TIMER_BRANCH_HEAD): self._force_assign_pushed_ref_heads() with Timer(TIMER_FREE_MEMORY): self._free_memory() def is_assigned(self, sha1): """Do we have at least one branch assigned for this commit?""" return sha1 in self.assign_dict def branch_id_list(self, sha1): """Return a list of branch_ids assigned to a commit.""" return self.assign_dict[sha1].branch_id_list() def _assign_previous(self): """Many commits in assign_dict were already assigned branches in a previous push or pull. Remember and honor those. """ if not self.assign_previous: return # Test-only mode skips this step and that's okay. if not self.ctx: return for assign in self.assign_dict.values(): otl = ObjectType.commits_for_sha1(self.ctx, assign.sha1) for ot in otl: self._assign_branch(assign, ot.branch_id) def _free_memory(self): """Dump everything we no longer need once assignment is complete. Replace large Assign instances with tiny AssignFrozen instances. """ if not self.flatten_memory: return #self.ctx = None #self.branch_dict = None #self.rev_list = None self.assign_dict = {sha1: AssignFrozen(assign.branch_id) for sha1, assign in self.assign_dict.items()} def annotate_lines(self, lines): """Append "(branch_id1)" string to any line that contains a sha1 that has a branch assignment. """ re_sha1 = re.compile(NTR('([0-9a-f]{40})')) r = [] for l in lines: m = re_sha1.search(l) if m: sha1 = m.group(1) if sha1 in self.assign_dict: annotation = ' ({})'.format(', ' .join(self.assign_dict[sha1].branch_id_list())) l += annotation r.append(l) return r def _dump_instrumentation(self): """Debugging dump of timing and other info.""" if _DUMP_LOG and LOG_GRAPH.isEnabledFor(logging.DEBUG3): cmd = list(_DUMP_LOG) + [prt.new_sha1 for prt in self.pre_receive_list] p = p4gf_proc.popen_no_throw(cmd) l = self.annotate_lines(p['out'].splitlines()) LOG_GRAPH.debug3('Log: {}\n{}'.format(' '.join(cmd), '\n'.join(l))) total_seconds = Timer(TIMER_OVERALL).time total_rev_ct = len(self.assign_dict) LOG_TIME.debug("branches : {}".format(len(self.branch_dict))) LOG_TIME.debug("commits : {}".format(total_rev_ct)) LOG_TIME.debug("seconds : {}".format(int(total_seconds + 0.5))) if 1.0 <= total_seconds: # Commits per second math becomes unreliable for short runs. rev_per_second = total_rev_ct / total_seconds LOG_TIME.debug("commits/second: {}".format(int(rev_per_second + 0.5))) if self.branch_len: histo = p4gf_histogram.to_histogram(self.branch_len.values()) histo_lines = p4gf_histogram.to_lines(histo) LOG_TIME.debug('Branch length histogram: how many branches have N commits?\n' + '\n'.join(histo_lines)) def _load_commit_dag(self): """Load the Git commit tree into memory. We just need the parent/child relationships. """ # pylint:disable=too-many-branches # A single call to git-rev-list produces both the commit sha1 list # that we need AND the child->parent associations that we need. It's # screaming fast: 32,000 commit lines in <1 second. with Timer(TIMER_RUN_REV_LIST): range_list = sorted(set([prt.to_range() for prt in self.pre_receive_list])) cmd = [ 'git', 'rev-list' , '--date-order', '--parents'] + range_list LOG.debug2("DAG: {}".format(' '.join(cmd))) d = p4gf_proc.popen(cmd) seen_parents = set() # Pass 1: Build up a dict of sha1->Assign objects, one per commit. with Timer(TIMER_CONSUME_REV_LIST): lines = d['out'].splitlines() with ProgressReporter.Determinate(len(lines)): for line in lines: ProgressReporter.increment(_('Loading commit tree into memory...')) sha1s = line.split() curr_sha1 = sha1s.pop(0) self.rev_list.append(curr_sha1) if LOG.isEnabledFor(logging.DEBUG3): LOG.debug3('DAG: rev_list {} {}' .format( p4gf_util.abbrev(curr_sha1) , ' '.join(p4gf_util.abbrev(sha1s)))) self.assign_dict[curr_sha1] = Assign(curr_sha1, sha1s) seen_parents.update(sha1s) # git-rev-list is awesome in that it gives us only as much as we need # for self.rev_list, but unawesome in that this optimization tends to # omit paths to branch refs' OLD heads if the old heads are 2+ commits # back in time, and that time is ALREADY covered by some OTHER branch. # Re-run each pushed branch separately to add enough Assign() nodes # to form a full path to its old ref. if 2 <= len(self.pre_receive_list): for prt in self.pre_receive_list: # Skip NEW branch refs: those don't have # to connect up to anything. if prt.old_sha1 == p4gf_const.NULL_COMMIT_SHA1: continue with Timer(TIMER_RUN_REV_LIST): cmd = ['git', 'rev-list' , '--date-order', '--parents', '--reverse', prt.to_range()] LOG.debug2("DAG: {}".format(' '.join(cmd))) d = p4gf_proc.popen(cmd) with Timer(TIMER_CONSUME_REV_LIST): for line in d['out'].splitlines(): sha1s = line.split() curr_sha1 = sha1s.pop(0) if curr_sha1 in self.assign_dict: break LOG.debug3('DAG: path {} {}' .format( p4gf_util.abbrev(curr_sha1) , ' '.join(p4gf_util.abbrev(sha1s)))) self.assign_dict[curr_sha1] = Assign(curr_sha1, sha1s) seen_parents.update(sha1s) # Create acting-as-parent-only nodes in dict, too. We don't process # these as part of iterating over revs, but we need them when # tree walking. with Timer(TIMER_CONSUME_REV_LIST): parent_only = seen_parents - set(self.assign_dict.keys()) for curr_sha1 in parent_only: if curr_sha1 in self.assign_dict: break LOG.debug3('DAG: par only {}'.format(p4gf_util.abbrev(curr_sha1))) self.assign_dict[curr_sha1] = Assign(curr_sha1, []) # Pass 2: Fill in Assign.children list with Timer(TIMER_ASSIGN_CHILDREN): with ProgressReporter.Determinate(len(self.assign_dict)): for assign in self.assign_dict.values(): ProgressReporter.increment(_('Finding child commits...')) for par_sha1 in assign.parents: par_assign = self.assign_dict.get(par_sha1) if par_assign: par_assign.children.add(assign.sha1) else: # Expected and okay: some parents already exist and # are not part of our push/fast-export list. LOG.debug2( "DAG: child {child} -> parent {parent}: parent not part of push" .format(child=assign.sha1[:7], parent=par_sha1[:7])) def _branch_id_to_sha1(self): """Return a dict of branch_id to sha1 of what the world should look like AFTER this push completes. Creates new Branch instances and adds them to branch_dict if we encounter branch refs not yet in our branch_dict. Creates new Assign instances and adds them to assign_dict if the referenced sha1 is not part of this push. Happens often: any reference that does not move, and sometimes new references too, point to commits that we received in some previous push. """ known = [p4gf_branch.BranchRef('refs/heads/' + bm.git_branch_name) for bm in self.undeleted_branches() if bm.git_branch_name] pushed = [prt.ref for prt in self.pre_receive_list] LOG.debug2('_assign_branch_id_to_ref_heads() known={} pushed={}' .format(known, pushed)) ref_list = set(known + pushed) ref_to_sha1 = p4gf_util.git_ref_list_to_sha1(ref_list) # Use the pre-receive tuples to help with commit-to-branch assignment. for prt in self.pre_receive_list: ref_to_sha1[prt.ref] = prt.new_sha1 result = {} for ref in sorted(ref_list): sha1 = ref_to_sha1[ref] if not sha1: LOG.debug2("abid() p4gf_util.git_ref_list_to_sha1()" " returned no sha1 for ref={}".format(ref)) continue branch_id = self.ref_to_branch_id(ref) if not branch_id: # This is a new branch. Create a new Branch instance # for it and record its Git name. branch = self._create_anon_branch() if ref.startswith('refs/heads/'): branch.git_branch_name = ref[len('refs/heads/'):] branch_id = branch.branch_id a = self.assign_dict.get(sha1) if not a: # Push assigns branch ref to a commit we already had from # some previous push/pull. Must store the assignment so that # p4gf_copy_to_p4 will know where to put the branch ref. # # This Assign object lacks parent info, but that's okay # since we're just using it to store a single branch ref # assignment, not in deeper branch id calculations. a = Assign(sha1) self.assign_dict[sha1] = a result[branch_id] = sha1 return result def _add_assign_for_ref_heads(self): """Make sure that each known or newly pushed branch reference has an Assign instance to (eventually) receive that branch assignment. _load_commit_dag() creates Assign instances only for pushed refs that point to a newly pushed commits. It does not see any old, unpushed refs, nor any pushed refs that point to commits that we received in an earlier push. Those are is not yet in rev_list or assign_dict. This Assign object lacks parent info, but that's okay since we're just using it to store a single branch ref assignment, not in deeper branch id calculations. The created Assign objects are NOT assigned to any branch yet. That's _assign_branches_named()'s job. """ for sha1 in self._branch_id_to_sha1().values(): a = self.assign_dict.get(sha1) if not a: a = Assign(sha1) self.assign_dict[sha1] = a def _force_assign_pushed_ref_heads(self): """Force all pushed references to be one of the (possibly multiple) branch assignments for their commits. This causes each branch's final head to create one final Perforce changelist on that branch, even if this commit appears on multiple branches. """ for branch in self._pushed_branch_sequence(): new_head_sha1 = self._branch_to_pushed_new_head_sha1(branch) assign = self.assign_dict.get(new_head_sha1) self._assign_branch(assign, branch.branch_id) def _assign_branches_named(self): """For each pushed branch reference, find a path from its new head location to its old head location (if any). Assign commits to the branch as we find the path. """ LOG.debug('_assign_branches_named') for branch in self._pushed_branch_sequence(): LOG.debug2('_assign_branches_named branch={}'.format(p4gf_branch.abbrev(branch))) if self.connect_to_previous_branch_sha1: old_head_sha1 = _branch_to_old_head_sha1(branch) else: old_head_sha1 = None new_head_sha1 = self._branch_to_pushed_new_head_sha1(branch) if new_head_sha1 is None: raise RuntimeError(_("BUG: _pushed_branch_sequence() returned a branch" " '{branch}' with no corresponding PreReceiveTuple") .format(branch=branch.to_log(LOG))) if new_head_sha1 == p4gf_const.NULL_COMMIT_SHA1: LOG.debug('_assign_branches_named(): skipping branch={}:' ' new sha1 is 0000000'.format(branch.to_log(LOG))) return if old_head_sha1: self._assign_branch_named_old_to_new( branch , old_head_sha1=old_head_sha1 , new_head_sha1=new_head_sha1) else: self._assign_branch_named_any_to_new( branch , new_head_sha1=new_head_sha1) def _assign_branch_named_old_to_new(self, branch, old_head_sha1, new_head_sha1): """Find a path from this branch's new head location back to its old head location. Assign this branch to all commits along the path, unless those commits already have a branch assignment. """ if LOG.isEnabledFor(logging.DEBUG2): LOG.debug2('_assign_branch_named_old_to_new() branch={} {} from {}..{}' .format( p4gf_branch.abbrev(branch.branch_id) , branch.git_branch_name , p4gf_util.abbrev(old_head_sha1) , p4gf_util.abbrev(new_head_sha1))) # Note which commits are descendants of the old head. # Only such commits are possible choices when creating a path # from new head to old. self._set_reachable_by(old_head_sha1, branch) # Choose only reachable parents to create the path. self._assign_path( assign_branch=branch , new_head_sha1=new_head_sha1 , reachable_by=branch) def _assign_branch_named_any_to_new(self, branch, new_head_sha1): """Starting at new head and working back through parent links to any root-most newly pushed commit, assign branch to commits along the path unless such commits already have a branch assignment. """ if LOG.isEnabledFor(logging.DEBUG2): LOG.debug2('_assign_branch_named_any_to_new() branch={} {} from ???????..{}' .format( p4gf_branch.abbrev(branch.branch_id) , branch.git_branch_name , p4gf_util.abbrev(new_head_sha1))) # Choose any parents to create the path. self._assign_path( assign_branch=branch , new_head_sha1=new_head_sha1 , reachable_by=None) def _branch_to_pushed_new_head_sha1(self, branch): """Return the new head sha1 in branch's corresponding PreReceiveTuple.""" prt_ref = p4gf_branch.BranchRef('refs/heads/' + branch.git_branch_name) for prt in self.pre_receive_list: if prt.ref == prt_ref: return prt.new_sha1 return None def _assign_path(self, assign_branch, new_head_sha1, reachable_by): """Starting at new head and working back through parent links to old head, assign branch to commits along the path unless such commits already have a branch assignment. """ if LOG.isEnabledFor(logging.DEBUG2): LOG.debug2('_assign_path() new_head={new_head_sha1}' ' assign={assign_branch} reachable={reachable_by}' .format(new_head_sha1 = p4gf_util.abbrev(new_head_sha1) , assign_branch = p4gf_branch.abbrev(assign_branch) , reachable_by = p4gf_branch.abbrev(reachable_by) )) curr_assign = self.assign_dict.get(new_head_sha1) self._tunnel_reset() while True: if not curr_assign.branch_id: self._assign_branch(curr_assign, assign_branch.branch_id) if LOG.isEnabledFor(logging.DEBUG3): LOG.debug3('_assign_path curr={} assigned {}' .format( p4gf_util.abbrev(curr_assign.sha1) , p4gf_util.abbrev(assign_branch.branch_id))) else: # Normally don't assign if already assigned, # but in tunnel mode, yeah, assign. if ( self.tunnel_assign and assign_branch.branch_id not in curr_assign.branch_id): self._assign_branch(curr_assign, assign_branch.branch_id) LOG.debug3('_assign_path curr={} tunnelling through ({})' .format( p4gf_util.abbrev(curr_assign.sha1) , p4gf_util.abbrev(curr_assign.branch_id_str()))) else: LOG.debug3('_assign_path curr={} already assigned ({})' .format( p4gf_util.abbrev(curr_assign.sha1) , p4gf_util.abbrev(curr_assign.branch_id_str()))) chosen_par_assign = self._best_parent_assign( curr_assign, assign_branch.branch_id , reachable_by) if not chosen_par_assign: if LOG.isEnabledFor(logging.DEBUG3): LOG.debug3('_assign_path curr={} no usable parent. Done.' .format(p4gf_util.abbrev(curr_assign.sha1))) self._tunnel_back_out(assign_branch.branch_id) break if not self._tunnel_check_passes( chosen_par_assign , assign_branch.branch_id): if LOG.isEnabledFor(logging.DEBUG3): LOG.debug3('_assign_path curr={} exceeds tunnel length. Done.' .format(p4gf_util.abbrev(curr_assign.sha1))) break curr_assign = chosen_par_assign def _best_parent_assign(self, child_assign, branch_id, required_reachable_by=None): """Return one of child_assign's parent Assign instances. First available match in this order: Choose which parent to follow as first of: 1. assigned parent already assigned to this branch 2. unassigned first-parent *reachable 3. unassigned any parent *reachable 4. assigned first-parent *reachable 5. assigned any parent *reachable If required_reachable_by passed in as non-None, then considers only parents with reachable_by==required_reachable_by """ # pylint: disable=too-many-branches if not child_assign.parents: return None # 1. Do we already have one parent assigned to this branch? # Never assign more than one parent to the same branch. par_assign_list = None for par_sha1 in child_assign.parents: par_assign = self.assign_dict.get(par_sha1) if not par_assign: continue if par_assign.branch_id and branch_id in par_assign.branch_id: return par_assign # 2. unassigned first-parent *reachable first_par_assign = self.assign_dict.get(child_assign.parents[0]) # Require reachable (if caller requested) if ( first_par_assign and required_reachable_by and first_par_assign.reachable_by is not required_reachable_by): LOG.debug('# par.reachable={} != req={}' .format( first_par_assign.reachable_by , required_reachable_by)) first_par_assign = None # Unassigned? We've got a winner. if first_par_assign and not first_par_assign.branch_id: return first_par_assign # 3. unassigned any parent *reachable par_assign_list = None for par_sha1 in child_assign.parents: par_assign = self.assign_dict.get(par_sha1) if not par_assign: continue if ( required_reachable_by and par_assign.reachable_by is not required_reachable_by): continue # Unassigned? We've got a winner. if not par_assign.branch_id: return par_assign # No unassigned winner yet? # Remember our list of parent Assign for later if not par_assign_list: par_assign_list = [par_assign] else: par_assign_list.append(par_assign) # 4. assigned first-parent *reachable if first_par_assign: return first_par_assign # 5. assigned any parent *reachable if par_assign_list: return par_assign_list[0] return None def _set_reachable_by(self, old_head_sha1, reachable_by): """Tree-walk a commit and all of its descendants, setting their reachable_by pointer to the given branch. O(n) commits worst-case (all commits child of old_head_sha1) """ LOG.debug2('_set_reachable_by() old_head_sha1={} reachable_by={}' .format( p4gf_util.abbrev(old_head_sha1) , p4gf_util.abbrev(reachable_by.branch_id))) old_head_assign = self.assign_dict.get(old_head_sha1) if not old_head_assign: LOG.debug3('_set_reachable_by() old_head not in assign_dict. Done.') return work_queue = [old_head_assign] while work_queue: curr_assign = work_queue.pop() if not curr_assign: continue curr_assign.reachable_by = reachable_by LOG.debug2('curr={} set {}' .format( p4gf_util.abbrev(curr_assign.sha1) , p4gf_util.abbrev(reachable_by.branch_id))) for child_sha1 in curr_assign.children: # Visit children, but skip ones we've already seen due to some # other path (merge commits) child_assign = self.assign_dict.get(child_sha1) if not child_assign: LOG.debug3('curr={} child={} not found' .format( p4gf_util.abbrev(curr_assign.sha1) , p4gf_util.abbrev(child_sha1))) continue elif child_assign.reachable_by is reachable_by: LOG.debug3('curr={} child={} already set' .format( p4gf_util.abbrev(curr_assign.sha1) , p4gf_util.abbrev(child_sha1))) continue else: LOG.debug3('curr={} child={} enqueued' .format( p4gf_util.abbrev(curr_assign.sha1) , p4gf_util.abbrev(child_sha1))) work_queue.append(child_assign) def _pushed_branch_sequence(self): """Return a list of Branch instances, one for each pushed branch reference. Order the branches by priority: which branches get first dibs on unassigned commits? 1. master-ish 2. named fully populated 3. named lightweight """ masterish = None fully_populated = [] lightweight = [] pushed_git_name_list = [p4gf_branch.BranchRef(prt.ref[len('refs/heads/'):]) for prt in self.pre_receive_list if prt.ref.startswith('refs/heads/')] for branch in self.undeleted_branches(): if ( (not branch.git_branch_name) or (not branch.git_branch_name in pushed_git_name_list)): continue elif branch.more_equal: masterish = branch elif branch.is_lightweight: lightweight.append(branch) else: fully_populated.append(branch) if masterish: result = [masterish] else: result = [] # Why sorted()? Order within a group doesn't _officially_ matter, but # it is a lot easier to write reproducible tests if we have a # reproducible order. git_branch_name is easily visible and controllable # by test scripts so favor that when available. Fall back to branch_id # for anonymous branches that lack a git_branch_name. result.extend(sorted(fully_populated, key=lambda br: br.git_branch_name)) result.extend(sorted(lightweight, key=lambda br: br.branch_id)) # LOG.debug('### _pushed_branch_sequence returning={}'.format(result)) return result def _assign_branches_anon(self): """Assign each remaining unassigned commit to an anonymous branch. Reuse existing anonymous branches before creating a new one. """ # Sort by branch_id solely to force reproducible results. anon_branch_list = sorted( (b for b in self.undeleted_branches() if not b.git_branch_name) , key=lambda x: x.branch_id , reverse=True ) # For each commit with no branch assignment for sha1 in self.rev_list: assign = self.assign_dict.get(sha1) if assign.branch_id: continue # Record if exist at least one anonymous branch # This setting used to support enable-git-branch-creation self.have_anonymous_branches = True # 3.1 Reuse existing, or create new anonymous # lightweight branch for this commit if anon_branch_list: branch = anon_branch_list.pop() else: branch = self._create_anon_branch() # 3.2. Repeat 2.2: assign this anonymous branch to # this commit and a single chain of parents back # to the start of pushed history. self._assign_path( assign_branch=branch , new_head_sha1=sha1 , reachable_by =None) def _create_anon_branch(self): """Create a new Branch object to hold an anonymous branch. View left empty. We haven't completed our branch assignment pass, and view depends on parent commit(s)'s assigned branch and its view. Store new Branch object in our branch_dict. """ # Test-only mode omits a context if self.ctx: p4gf = self.ctx.p4gf else: p4gf = None branch = p4gf_branch.Branch(p4=p4gf) branch.is_lightweight = True self.branch_dict[branch.branch_id] = branch return branch def ref_to_branch_id(self, ref): """Which Git Fusion branch ID contains ref as a Git branch name? O(n) scan. """ if ref.startswith('refs/heads/'): ref = ref[11:] ref = p4gf_branch.BranchRef(ref) for bm in self.undeleted_branches(): # ignore any deleted branch if bm.git_branch_name == ref and not bm.deleted: return bm.branch_id return None def _assign_branch(self, assign, branch_id): """Add branch_id to assign's list of branches.""" if assign.add_branch_id(branch_id): _increment_bucket(branch_id, self.branch_len) def _unassign_branch(self, assign, branch_id): """Remove branch_id from assign's list of branches.""" if assign.remove_branch_id(branch_id): _decrement_bucket(branch_id, self.branch_len) def undeleted_branches(self): """An iterator/generator of all Branch values in branch_dict that are not deleted. Intentionally duplicating Context.undeleted_branches() here rather than requiring a non-None self.ctx """ for branch in self.branch_dict.values(): if branch.deleted: continue yield branch def to_dict(self): """Convert this object to a map that can be serialized via JSON.""" result = dict() result['rev_list'] = self.rev_list assign_dict = {k: v.to_dict() for k, v in self.assign_dict.items()} result['assign_dict'] = assign_dict return result @staticmethod def from_dict(d, branch_dict, pre_receive_list, ctx): """Create an instance based on the given map, as from to_dict.""" result = Assigner(branch_dict, pre_receive_list, ctx) result.rev_list = d['rev_list'] ad = d['assign_dict'] result.assign_dict = {k: AssignFrozen.from_dict(ad[k]) for k in ad} return result def _tunnel_check_passes(self, chosen_par_assign, branch_id): """Does this assignment fit within set limits about sharing commits with other branches? In addition to testing for violation, update internal tracking list. """ # Not tracking tunnel length? # Then don't...er...track tunnel length. if self.tunnel_max_ct == TUNNEL_UNLIMITED: return True is_tunnel = 0 < len(chosen_par_assign.branch_id) # Not tunnelling? Then clear any current tunnel # tracking and permit it. if not is_tunnel: self._tunnel_reset() return True # Tunnel can accept one more? # Then do so and we're done. if len(self.tunnel_list) < self.tunnel_max_ct: LOG.debug("tunneling through {} for branch_id={} already assigned={}" .format(p4gf_util.abbrev(chosen_par_assign.sha1) , branch_id , chosen_par_assign.branch_id_str())) self.tunnel_list.append(chosen_par_assign) return True # Tunnel cannot accept any more assignments. # Current tunnel attempt has failed. Un-assign # everything in the tunnel, then reject. LOG.debug("backing out, len={} sha1={} branch_id={} tunnel={}" .format(len(self.tunnel_list) , p4gf_util.abbrev(chosen_par_assign.sha1) , branch_id , ", ".join( [p4gf_util.abbrev(a.sha1) for a in self.tunnel_list]) )) self._tunnel_back_out(branch_id) return False def _tunnel_reset(self): """Clear any tunnel tracking list: we're starting a new branch.""" if self.tunnel_list: LOG.debug("tunnel reset") self.tunnel_list = list() def _tunnel_back_out(self, branch_id): """Tunnel too long. Remove assignment.""" for assign in self.tunnel_list: self._unassign_branch(assign, branch_id) # -- class Assign ------------------------------------------------------------- class Assign: """A single commit's branch assignment and the work leading up to that assignment. """ def __init__(self, sha1=None, parents=None): # This commit's sha1 self.sha1 = sha1 # Which branches of Perforce file hierarchy receive this commit. Can # be multiple if multiple branch heads point to the same commit # (only happens for heads right now, we lack the smarts to know when # to drag multiple branches back through shared history.) # # Likely to convert to single string, with heads' multi-branch-id # stored in a field that we can leave None for most of the time. self.branch_id = set() # List of child commit sha1s that use this commit as a parent. self.children = set() # List of parent sha1s, first-parent is [0]. self.parents = parents # Is this commit a descendant of the current branch's old head commit? # If so, then reachable_by points to the current branch instance. If # not, then reachable_by is left pointing to either None or some other # previous branch instance. self.reachable_by = None def add_branch_id(self, branch_id): """Assign this commit to the given branch_id. Return True if actually added, False if already had this branch_id assigned. """ assert isinstance(branch_id, p4gf_branch.BranchId) if branch_id in self.branch_id: return False self.branch_id.add(branch_id) return True def remove_branch_id(self, branch_id): """Un-assign this commit from the given branch_id. Return True if actually removed, False if already lacked this branch_id. """ assert isinstance(branch_id, p4gf_branch.BranchId) if branch_id not in self.branch_id: return False self.branch_id.remove(branch_id) return True def branch_id_str(self): """Return our branch ID(s) as a single string.""" if not self.branch_id: return None return ' '.join(self.branch_id) def branch_id_list(self): """Return our branch ID(s) as a list. Return empty list if no assignments. """ return list(self.branch_id) def release_links(self): """Sever all ties to parents and children, but retain sha1. Called by FastPush as it iterates through our assignments, freeing memory by destructively removing Assign elements as it process the assigner dict. Keep the sha1: before FastPush severs a child's link to its parent(s), it (very rarely) needs to its first-parent sha1 to use that for a ghost changelist. """ self.branch_id = None self.children = None self.parents = None self.reachable_by = None def __repr__(self): return '{} {}'.format(p4gf_util.abbrev(self.sha1), self.branch_id_str()) # -- class AssignFrozen ------------------------------------------------------- class AssignFrozen: """A single commit's branch assignment. A smaller subset of what Assign stores. We don't need to keep around all that work-in-progress data once we're done assigning branches. Switching to this smaller class frees up memory for later use. """ def __init__(self, branch_id=None): # _branch_id is usually a single branch_id str # But for the rare case where multiple pushed branch refs point # to the same commit, _branch_id is a list of branch_id str. # self._branch_id = [p4gf_branch.BranchId(b) for b in self._flatten(branch_id)] def branch_id_list(self): """Return our branch ID(s) as a list.""" return list(self._branch_id) def branch_id_str(self): """Return our branch ID(s) as a single string.""" if not self._branch_id: return None return ' '.join(self.branch_id_list()) @staticmethod def _flatten(branch_id): """Prefer scalar, then list.""" if not branch_id or isinstance(branch_id, str): return branch_id # Sorting solely to allow reproducible test results. # Order should not matter. return sorted(branch_id) def to_dict(self): """Convert this object to a map that can be serialized via JSON.""" return {'branch_id': self._branch_id} @staticmethod def from_dict(d): """Create an instance based on the given map, as from to_dict.""" return AssignFrozen(d['branch_id']) # -- class PreReceiveTuple ---------------------------------------------------- class PreReceiveTuple: """One line from a pre-receive-hook stdin.""" def __init__(self, old_sha1, new_sha1, ref): self.old_sha1 = old_sha1 self.new_sha1 = new_sha1 self._ref = None self.ref = ref @property def ref(self): """Get ref.""" return self._ref @ref.setter def ref(self, value): """Set ref.""" self._ref = p4gf_branch.BranchRef(value) def to_range(self): """Return old..new if we have an old, or just new if we don't.""" if (not self.old_sha1) or (self.old_sha1 == p4gf_const.NULL_COMMIT_SHA1): return self.new_sha1 return '{}..{}'.format(self.old_sha1, self.new_sha1) def __str__(self): """Return string representation of pre-receive info.""" return 'old={0}, new={1}, ref={2}'.format( p4gf_util.abbrev(self.old_sha1) , p4gf_util.abbrev(self.new_sha1) , self.ref ) @staticmethod def from_line(line): """Inflate from a space-delimited line such as what Git feeds us on STDIN or what to_line() returns. """ words = line.strip().split() return PreReceiveTuple(words[0], words[1], words[2]) def to_line(self): """Format a tuple into something that from_line() can read.""" return '{} {} {}'.format(self.old_sha1, self.new_sha1, self.ref) def git_branch_name(self): """Return the "mybranch" portion of "refs/heads/mybranch".""" prefix = 'refs/heads/' if self.ref.startswith(prefix): return p4gf_branch.BranchRef(self.ref[len(prefix):]) else: return None def to_dict(self): """For easier JSON formatting.""" return { 'old_sha1' : self.old_sha1 , 'new_sha1' : self.new_sha1 , 'ref' : self.ref } @staticmethod def from_dict(d): """For easier JSON parsing.""" return PreReceiveTuple( old_sha1 = d['old_sha1'] , new_sha1 = d['new_sha1'] , ref = d['ref' ] ) # p4gf_profiler timer names TIMER_OVERALL = NTR('p4gf_branch_id total') TIMER_RUN_REV_LIST = NTR('run_rev_list') TIMER_CONSUME_REV_LIST = NTR('consume_rev_list') TIMER_ASSIGN_CHILDREN = NTR('assign_children') TIMER_BRANCH_HEAD = NTR('branch_head') TIMER_ASSIGN_PREVIOUS = NTR('assign_previous') TIMER_ASSIGN_BRANCH_ANON_HEADS = NTR('assign_branch_anon_heads') TIMER_ASSIGN_BRANCH_NAMED = NTR('assign_branch_named') TIMER_ASSIGN_BRANCH_ANON = NTR('assign_branch_anon') TIMER_FREE_MEMORY = NTR('free_memory') def _increment_bucket(key, coll): """Add 1 to a bucket, creating bucket if necessary.""" if key in coll: coll[key] += 1 else: coll[key] = 1 def _decrement_bucket(key, coll): """Subtract 1 from a bucket.""" if key in coll and 0 < coll[key]: coll[key] -= 1 def _branch_to_old_head_sha1(branch): """Test-replacable hook for looking up a branch's pre-push sha1.""" return branch.sha1_for_branch() # -- test scaffolding --------------------------------------------------------- class _TestRefHead: """A Git reference's symbolic name and sha1 value.""" def __init__(self): self.name = None self.head_sha1 = None def __str__(self): return "{} {}".format(self.head_sha1, self.name) def __repr__(self): return self.__str__() @staticmethod def to_sha1_list(refhead_list): """Return the sha1 of each _TestRefHead in a list.""" return [x.head_sha1 for x in refhead_list] def _test_return_none(_branch): """Test replacement for _branch_to_old_head_sha1() to look more like a pre-push repo. """ return None def _test_dump_result_to_stdout(assigner): """Dump all assignments to stdout in a format that a test script would enjoy.""" # print("Commit count: {}".format(len(assigner.rev_list))) fmt = NTR("{sha1:<7.7}\t{branch_id}\t{subject}") for rev in assigner.rev_list: p = p4gf_proc.popen(['git', 'log', '-1', '--pretty=format:%s', rev]) subject = p['out'].splitlines()[0] branch_id = assigner.assign_dict[rev].branch_id_str() print(fmt.format(sha1=rev, branch_id=branch_id, subject=subject)) def _test_dump_all_commits(branch_dict, git_dir, pushed_ref_list): """Dump a 'git rev-list' of all commits in git_dir, annotated with the name of which branch of Perforce file hierarchy that this commit should go into. """ os.environ['GIT_DIR'] = git_dir LOG.debug2('pushed_ref_list={}'.format(pushed_ref_list)) # ### Need a better way to test various "what if I push X and not Y?" ref_to_sha1 = p4gf_util.git_ref_list_to_sha1(pushed_ref_list) LOG.debug2('ref_to_sha1={}'.format(ref_to_sha1)) prl = [PreReceiveTuple( old_sha1=p4gf_const.NULL_COMMIT_SHA1 , new_sha1=ref_to_sha1[ref] , ref =ref) for ref in pushed_ref_list] assigner = Assigner(branch_dict, prl) assigner.assign() _test_dump_result_to_stdout(assigner) def main(): """Log information regarding branches in a repository.""" desc = _("""Runs branch-id calculation on the branches defined in repo-config-file within the given git directory See debug log for results.""") parser = p4gf_util.create_arg_parser(desc=desc) parser.add_argument('config_file', metavar='') parser.add_argument('git_dir', metavar='') parser.add_argument('ref', metavar='', nargs='*') args = parser.parse_args() with p4gf_log.ExceptionLogger(): p4gf_proc.init() p4gf_branch.init_case_handling() _refs = args.ref if args.ref else ['master'] LOG.debug('config={} dir={}\nrefs={}'.format(args.config_file, args.git_dir, _refs)) p4gf_branch.use_consecutive_branch_ids() _config = configparser.ConfigParser() _config.read(args.config_file) _branch_dict = p4gf_branch.dict_from_config(_config) global _branch_to_old_head_sha1 # pylint:disable=global-variable-undefined _branch_to_old_head_sha1 = _test_return_none _git_dir = args.git_dir _remote_refs = ['refs/heads/' + ref for ref in _refs] _test_dump_all_commits(_branch_dict, _git_dir, _remote_refs) if __name__ == "__main__": main()