From 38d30402f956cdce3c755f2616011301cba009e0 Mon Sep 17 00:00:00 2001 From: Stan Ke <156306548@qq.com> Date: Mon, 11 Mar 2024 16:21:49 +0800 Subject: [PATCH] New posts --- git-filter-repo | 4005 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 4005 insertions(+) create mode 100644 git-filter-repo diff --git a/git-filter-repo b/git-filter-repo new file mode 100644 index 0000000..e53b43a --- /dev/null +++ b/git-filter-repo @@ -0,0 +1,4005 @@ +#!/usr/bin/env python3 + +""" +git-filter-repo filters git repositories, similar to git filter-branch, BFG +repo cleaner, and others. The basic idea is that it works by running + git fast-export | filter | git fast-import +where this program not only launches the whole pipeline but also serves as +the 'filter' in the middle. It does a few additional things on top as well +in order to make it into a well-rounded filtering tool. + +git-filter-repo can also be used as a library for more involved filtering +operations; however: + ***** API BACKWARD COMPATIBILITY CAVEAT ***** + Programs using git-filter-repo as a library can reach pretty far into its + internals, but I am not prepared to guarantee backward compatibility of + all APIs. I suspect changes will be rare, but I reserve the right to + change any API. Since it is assumed that repository filtering is + something one would do very rarely, and in particular that it's a + one-shot operation, this should not be a problem in practice for anyone. + However, if you want to re-use a program you have written that uses + git-filter-repo as a library (or makes use of one of its --*-callback + arguments), you should either make sure you are using the same version of + git and git-filter-repo, or make sure to re-test it. + + If there are particular pieces of the API you are concerned about, and + there is not already a testcase for it in t9391-lib-usage.sh or + t9392-python-callback.sh, please contribute a testcase. That will not + prevent me from changing the API, but it will allow you to look at the + history of a testcase to see whether and how the API changed. + ***** END API BACKWARD COMPATIBILITY CAVEAT ***** +""" + +import argparse +import collections +import fnmatch +import gettext +import io +import os +import platform +import re +import shutil +import subprocess +import sys +import time +import textwrap + +from datetime import tzinfo, timedelta, datetime + +__all__ = ["Blob", "Reset", "FileChange", "Commit", "Tag", "Progress", + "Checkpoint", "FastExportParser", "ProgressWriter", + "string_to_date", "date_to_string", + "record_id_rename", "GitUtils", "FilteringOptions", "RepoFilter"] + +deleted_hash = b'0'*40 +write_marks = True +date_format_permissive = True + +def gettext_poison(msg): + if "GIT_TEST_GETTEXT_POISON" in os.environ: # pragma: no cover + return "# GETTEXT POISON #" + return gettext.gettext(msg) + +_ = gettext_poison + +def setup_gettext(): + TEXTDOMAIN="git-filter-repo" + podir = os.environ.get("GIT_TEXTDOMAINDIR") or "@@LOCALEDIR@@" + if not os.path.isdir(podir): # pragma: no cover + podir = None # Python has its own fallback; use that + + ## This looks like the most straightforward translation of the relevant + ## code in git.git:gettext.c and git.git:perl/Git/I18n.pm: + #import locale + #locale.setlocale(locale.LC_MESSAGES, ""); + #locale.setlocale(locale.LC_TIME, ""); + #locale.textdomain(TEXTDOMAIN); + #locale.bindtextdomain(TEXTDOMAIN, podir); + ## but the python docs suggest using the gettext module (which doesn't + ## have setlocale()) instead, so: + gettext.textdomain(TEXTDOMAIN); + gettext.bindtextdomain(TEXTDOMAIN, podir); + +def _timedelta_to_seconds(delta): + """ + Converts timedelta to seconds + """ + offset = delta.days*86400 + delta.seconds + (delta.microseconds+0.0)/1000000 + return round(offset) + +class FixedTimeZone(tzinfo): + """ + Fixed offset in minutes east from UTC. + """ + + tz_re = re.compile(br'^([-+]?)(\d\d)(\d\d)$') + + def __init__(self, offset_string): + tzinfo.__init__(self) + sign, hh, mm = FixedTimeZone.tz_re.match(offset_string).groups() + factor = -1 if (sign and sign == b'-') else 1 + self._offset = timedelta(minutes = factor*(60*int(hh) + int(mm))) + self._offset_string = offset_string + + def utcoffset(self, dt): + return self._offset + + def tzname(self, dt): + return self._offset_string + + def dst(self, dt): + return timedelta(0) + +def string_to_date(datestring): + (unix_timestamp, tz_offset) = datestring.split() + return datetime.fromtimestamp(int(unix_timestamp), + FixedTimeZone(tz_offset)) + +def date_to_string(dateobj): + epoch = datetime.fromtimestamp(0, dateobj.tzinfo) + return(b'%d %s' % (int(_timedelta_to_seconds(dateobj - epoch)), + dateobj.tzinfo.tzname(0))) + +def decode(bytestr): + 'Try to convert bytestr to utf-8 for outputting as an error message.' + return bytestr.decode('utf-8', 'backslashreplace') + +def glob_to_regex(glob_bytestr): + 'Translate glob_bytestr into a regex on bytestrings' + + # fnmatch.translate is idiotic and won't accept bytestrings + if (decode(glob_bytestr).encode() != glob_bytestr): # pragma: no cover + raise SystemExit(_("Error: Cannot handle glob %s").format(glob_bytestr)) + + # Create regex operating on string + regex = fnmatch.translate(decode(glob_bytestr)) + + # FIXME: This is an ugly hack... + # fnmatch.translate tries to do multi-line matching and wants the glob to + # match up to the end of the input, which isn't relevant for us, so we + # have to modify the regex. fnmatch.translate has used different regex + # constructs to achieve this with different python versions, so we have + # to check for each of them and then fix it up. It would be much better + # if fnmatch.translate could just take some flags to allow us to specify + # what we want rather than employing this hackery, but since it + # doesn't... + if regex.endswith(r'\Z(?ms)'): # pragma: no cover + regex = regex[0:-7] + elif regex.startswith(r'(?s:') and regex.endswith(r')\Z'): # pragma: no cover + regex = regex[4:-3] + + # Finally, convert back to regex operating on bytestr + return regex.encode() + +class PathQuoting: + _unescape = {b'a': b'\a', + b'b': b'\b', + b'f': b'\f', + b'n': b'\n', + b'r': b'\r', + b't': b'\t', + b'v': b'\v', + b'"': b'"', + b'\\':b'\\'} + _unescape_re = re.compile(br'\\([a-z"\\]|[0-9]{3})') + _escape = [bytes([x]) for x in range(127)]+[ + b'\\'+bytes(ord(c) for c in oct(x)[2:]) for x in range(127,256)] + _reverse = dict(map(reversed, _unescape.items())) + for x in _reverse: + _escape[ord(x)] = b'\\'+_reverse[x] + _special_chars = [len(x) > 1 for x in _escape] + + @staticmethod + def unescape_sequence(orig): + seq = orig.group(1) + return PathQuoting._unescape[seq] if len(seq) == 1 else bytes([int(seq, 8)]) + + @staticmethod + def dequote(quoted_string): + if quoted_string.startswith(b'"'): + assert quoted_string.endswith(b'"') + return PathQuoting._unescape_re.sub(PathQuoting.unescape_sequence, + quoted_string[1:-1]) + return quoted_string + + @staticmethod + def enquote(unquoted_string): + # Option 1: Quoting when fast-export would: + # pqsc = PathQuoting._special_chars + # if any(pqsc[x] for x in set(unquoted_string)): + # Option 2, perf hack: do minimal amount of quoting required by fast-import + if unquoted_string.startswith(b'"') or b'\n' in unquoted_string: + pqe = PathQuoting._escape + return b'"' + b''.join(pqe[x] for x in unquoted_string) + b'"' + return unquoted_string + +class AncestryGraph(object): + """ + A class that maintains a direct acycle graph of commits for the purpose of + determining if one commit is the ancestor of another. + """ + + def __init__(self): + self.cur_value = 0 + + # A mapping from the external identifers given to us to the simple integers + # we use in self.graph + self.value = {} + + # A tuple of (depth, list-of-ancestors). Values and keys in this graph are + # all integers from the self.value dict. The depth of a commit is one more + # than the max depth of any of its ancestors. + self.graph = {} + + # Cached results from previous calls to is_ancestor(). + self._cached_is_ancestor = {} + + def record_external_commits(self, external_commits): + """ + Record in graph that each commit in external_commits exists, and is + treated as a root commit with no parents. + """ + for c in external_commits: + if c not in self.value: + self.cur_value += 1 + self.value[c] = self.cur_value + self.graph[self.cur_value] = (1, []) + + def add_commit_and_parents(self, commit, parents): + """ + Record in graph that commit has the given parents. parents _MUST_ have + been first recorded. commit _MUST_ not have been recorded yet. + """ + assert all(p in self.value for p in parents) + assert commit not in self.value + + # Get values for commit and parents + self.cur_value += 1 + self.value[commit] = self.cur_value + graph_parents = [self.value[x] for x in parents] + + # Determine depth for commit, then insert the info into the graph + depth = 1 + if parents: + depth += max(self.graph[p][0] for p in graph_parents) + self.graph[self.cur_value] = (depth, graph_parents) + + def is_ancestor(self, possible_ancestor, check): + """ + Return whether possible_ancestor is an ancestor of check + """ + a, b = self.value[possible_ancestor], self.value[check] + original_pair = (a,b) + a_depth = self.graph[a][0] + ancestors = [b] + visited = set() + while ancestors: + ancestor = ancestors.pop() + prev_pair = (a, ancestor) + if prev_pair in self._cached_is_ancestor: + if not self._cached_is_ancestor[prev_pair]: + continue + self._cached_is_ancestor[original_pair] = True + return True + if ancestor in visited: + continue + visited.add(ancestor) + depth, more_ancestors = self.graph[ancestor] + if ancestor == a: + self._cached_is_ancestor[original_pair] = True + return True + elif depth <= a_depth: + continue + ancestors.extend(more_ancestors) + self._cached_is_ancestor[original_pair] = False + return False + +class MailmapInfo(object): + def __init__(self, filename): + self.changes = {} + self._parse_file(filename) + + def _parse_file(self, filename): + name_and_email_re = re.compile(br'(.*?)\s*<([^>]*)>\s*') + comment_re = re.compile(br'\s*#.*') + if not os.access(filename, os.R_OK): + raise SystemExit(_("Cannot read %s") % decode(filename)) + with open(filename, 'br') as f: + count = 0 + for line in f: + count += 1 + err = "Unparseable mailmap file: line #{} is bad: {}".format(count, line) + # Remove comments + line = comment_re.sub(b'', line) + # Remove leading and trailing whitespace + line = line.strip() + if not line: + continue + + m = name_and_email_re.match(line) + if not m: + raise SystemExit(err) + proper_name, proper_email = m.groups() + if len(line) == m.end(): + self.changes[(None, proper_email)] = (proper_name, proper_email) + continue + rest = line[m.end():] + m = name_and_email_re.match(rest) + if m: + commit_name, commit_email = m.groups() + if len(rest) != m.end(): + raise SystemExit(err) + else: + commit_name, commit_email = rest, None + self.changes[(commit_name, commit_email)] = (proper_name, proper_email) + + def translate(self, name, email): + ''' Given a name and email, return the expected new name and email from the + mailmap if there is a translation rule for it, otherwise just return + the given name and email.''' + for old, new in self.changes.items(): + old_name, old_email = old + new_name, new_email = new + if (old_email is None or email.lower() == old_email.lower()) and ( + name == old_name or not old_name): + return (new_name or name, new_email or email) + return (name, email) + +class ProgressWriter(object): + def __init__(self): + self._last_progress_update = time.time() + self._last_message = None + + def show(self, msg): + self._last_message = msg + now = time.time() + if now - self._last_progress_update > .1: + self._last_progress_update = now + sys.stdout.write("\r{}".format(msg)) + sys.stdout.flush() + + def finish(self): + self._last_progress_update = 0 + if self._last_message: + self.show(self._last_message) + sys.stdout.write("\n") + +class _IDs(object): + """ + A class that maintains the 'name domain' of all the 'marks' (short int + id for a blob/commit git object). The reason this mechanism is necessary + is because the text of fast-export may refer to an object using a different + mark than the mark that was assigned to that object using IDS.new(). This + class allows you to translate the fast-export marks (old) to the marks + assigned from IDS.new() (new). + + Note that there are two reasons why the marks may differ: (1) The + user manually creates Blob or Commit objects (for insertion into the + stream) (2) We're reading the data from two different repositories + and trying to combine the data (git fast-export will number ids from + 1...n, and having two 1's, two 2's, two 3's, causes issues). + """ + + def __init__(self): + """ + Init + """ + # The id for the next created blob/commit object + self._next_id = 1 + + # A map of old-ids to new-ids (1:1 map) + self._translation = {} + + # A map of new-ids to every old-id that points to the new-id (1:N map) + self._reverse_translation = {} + + def has_renames(self): + """ + Return whether there have been ids remapped to new values + """ + return bool(self._translation) + + def new(self): + """ + Should be called whenever a new blob or commit object is created. The + returned value should be used as the id/mark for that object. + """ + rv = self._next_id + self._next_id += 1 + return rv + + def record_rename(self, old_id, new_id, handle_transitivity = False): + """ + Record that old_id is being renamed to new_id. + """ + if old_id != new_id: + # old_id -> new_id + self._translation[old_id] = new_id + + # Transitivity will be needed if new commits are being inserted mid-way + # through a branch. + if handle_transitivity: + # Anything that points to old_id should point to new_id + if old_id in self._reverse_translation: + for id_ in self._reverse_translation[old_id]: + self._translation[id_] = new_id + + # Record that new_id is pointed to by old_id + if new_id not in self._reverse_translation: + self._reverse_translation[new_id] = [] + self._reverse_translation[new_id].append(old_id) + + def translate(self, old_id): + """ + If old_id has been mapped to an alternate id, return the alternate id. + """ + if old_id in self._translation: + return self._translation[old_id] + else: + return old_id + + def __str__(self): + """ + Convert IDs to string; used for debugging + """ + rv = "Current count: %d\nTranslation:\n" % self._next_id + for k in sorted(self._translation): + rv += " %d -> %s\n" % (k, self._translation[k]) + + rv += "Reverse translation:\n" + for k in sorted(self._reverse_translation): + rv += " " + str(k) + " -> " + str(self._reverse_translation[k]) + "\n" + + return rv + +class _GitElement(object): + """ + The base class for all git elements that we create. + """ + + def __init__(self): + # A string that describes what type of Git element this is + self.type = None + + # A flag telling us if this Git element has been dumped + # (i.e. printed) or skipped. Typically elements that have been + # dumped or skipped will not be dumped again. + self.dumped = 0 + + def dump(self, file_): + """ + This version should never be called. Derived classes need to + override! We should note that subclasses should implement this + method such that the output would match the format produced by + fast-export. + """ + raise SystemExit(_("Unimplemented function: %s") % type(self).__name__ + +".dump()") # pragma: no cover + + def __bytes__(self): + """ + Convert GitElement to bytestring; used for debugging + """ + old_dumped = self.dumped + writeme = io.BytesIO() + self.dump(writeme) + output_lines = writeme.getvalue().splitlines() + writeme.close() + self.dumped = old_dumped + return b"%s:\n %s" % (type(self).__name__.encode(), + b"\n ".join(output_lines)) + + def skip(self, new_id=None): + """ + Ensures this element will not be written to output + """ + self.dumped = 2 + +class _GitElementWithId(_GitElement): + """ + The base class for Git elements that have IDs (commits and blobs) + """ + + def __init__(self): + _GitElement.__init__(self) + + # The mark (short, portable id) for this element + self.id = _IDS.new() + + # The previous mark for this element + self.old_id = None + + def skip(self, new_id=None): + """ + This element will no longer be automatically written to output. When a + commit gets skipped, it's ID will need to be translated to that of its + parent. + """ + self.dumped = 2 + + _IDS.record_rename(self.old_id or self.id, new_id) + +class Blob(_GitElementWithId): + """ + This class defines our representation of git blob elements (i.e. our + way of representing file contents). + """ + + def __init__(self, data, original_id = None): + _GitElementWithId.__init__(self) + + # Denote that this is a blob + self.type = 'blob' + + # Record original id + self.original_id = original_id + + # Stores the blob's data + assert(type(data) == bytes) + self.data = data + + def dump(self, file_): + """ + Write this blob element to a file. + """ + self.dumped = 1 + HASH_TO_ID[self.original_id] = self.id + ID_TO_HASH[self.id] = self.original_id + + file_.write(b'blob\n') + file_.write(b'mark :%d\n' % self.id) + file_.write(b'data %d\n%s' % (len(self.data), self.data)) + file_.write(b'\n') + + +class Reset(_GitElement): + """ + This class defines our representation of git reset elements. A reset + event is the creation (or recreation) of a named branch, optionally + starting from a specific revision). + """ + + def __init__(self, ref, from_ref = None): + _GitElement.__init__(self) + + # Denote that this is a reset + self.type = 'reset' + + # The name of the branch being (re)created + self.ref = ref + + # Some reference to the branch/commit we are resetting from + self.from_ref = from_ref + + def dump(self, file_): + """ + Write this reset element to a file + """ + self.dumped = 1 + + file_.write(b'reset %s\n' % self.ref) + if self.from_ref: + if isinstance(self.from_ref, int): + file_.write(b'from :%d\n' % self.from_ref) + else: + file_.write(b'from %s\n' % self.from_ref) + file_.write(b'\n') + +class FileChange(_GitElement): + """ + This class defines our representation of file change elements. File change + elements are components within a Commit element. + """ + + def __init__(self, type_, filename = None, id_ = None, mode = None): + _GitElement.__init__(self) + + # Denote the type of file-change (b'M' for modify, b'D' for delete, etc) + # We could + # assert(type(type_) == bytes) + # here but I don't just due to worries about performance overhead... + self.type = type_ + + # Record the name of the file being changed + self.filename = filename + + # Record the mode (mode describes type of file entry (non-executable, + # executable, or symlink)). + self.mode = mode + + # blob_id is the id (mark) of the affected blob + self.blob_id = id_ + + if type_ == b'DELETEALL': + assert filename is None and id_ is None and mode is None + self.filename = b'' # Just so PathQuoting.enquote doesn't die + else: + assert filename is not None + + if type_ == b'M': + assert id_ is not None and mode is not None + elif type_ == b'D': + assert id_ is None and mode is None + elif type_ == b'R': # pragma: no cover (now avoid fast-export renames) + assert mode is None + if id_ is None: + raise SystemExit(_("new name needed for rename of %s") % filename) + self.filename = (self.filename, id_) + self.blob_id = None + + def dump(self, file_): + """ + Write this file-change element to a file + """ + skipped_blob = (self.type == b'M' and self.blob_id is None) + if skipped_blob: return + self.dumped = 1 + + quoted_filename = PathQuoting.enquote(self.filename) + if self.type == b'M' and isinstance(self.blob_id, int): + file_.write(b'M %s :%d %s\n' % (self.mode, self.blob_id, quoted_filename)) + elif self.type == b'M': + file_.write(b'M %s %s %s\n' % (self.mode, self.blob_id, quoted_filename)) + elif self.type == b'D': + file_.write(b'D %s\n' % quoted_filename) + elif self.type == b'DELETEALL': + file_.write(b'deleteall\n') + else: + raise SystemExit(_("Unhandled filechange type: %s") % self.type) # pragma: no cover + +class Commit(_GitElementWithId): + """ + This class defines our representation of commit elements. Commit elements + contain all the information associated with a commit. + """ + + def __init__(self, branch, + author_name, author_email, author_date, + committer_name, committer_email, committer_date, + message, + file_changes, + parents, + original_id = None, + encoding = None, # encoding for message; None implies UTF-8 + **kwargs): + _GitElementWithId.__init__(self) + self.old_id = self.id + + # Denote that this is a commit element + self.type = 'commit' + + # Record the affected branch + self.branch = branch + + # Record original id + self.original_id = original_id + + # Record author's name + self.author_name = author_name + + # Record author's email + self.author_email = author_email + + # Record date of authoring + self.author_date = author_date + + # Record committer's name + self.committer_name = committer_name + + # Record committer's email + self.committer_email = committer_email + + # Record date the commit was made + self.committer_date = committer_date + + # Record commit message and its encoding + self.encoding = encoding + self.message = message + + # List of file-changes associated with this commit. Note that file-changes + # are also represented as git elements + self.file_changes = file_changes + + self.parents = parents + + def dump(self, file_): + """ + Write this commit element to a file. + """ + self.dumped = 1 + HASH_TO_ID[self.original_id] = self.id + ID_TO_HASH[self.id] = self.original_id + + # Make output to fast-import slightly easier for humans to read if the + # message has no trailing newline of its own; cosmetic, but a nice touch... + extra_newline = b'\n' + if self.message.endswith(b'\n') or not (self.parents or self.file_changes): + extra_newline = b'' + + if not self.parents: + file_.write(b'reset %s\n' % self.branch) + file_.write((b'commit %s\n' + b'mark :%d\n' + b'author %s <%s> %s\n' + b'committer %s <%s> %s\n' + ) % ( + self.branch, self.id, + self.author_name, self.author_email, self.author_date, + self.committer_name, self.committer_email, self.committer_date + )) + if self.encoding: + file_.write(b'encoding %s\n' % self.encoding) + file_.write(b'data %d\n%s%s' % + (len(self.message), self.message, extra_newline)) + for i, parent in enumerate(self.parents): + file_.write(b'from ' if i==0 else b'merge ') + if isinstance(parent, int): + file_.write(b':%d\n' % parent) + else: + file_.write(b'%s\n' % parent) + for change in self.file_changes: + change.dump(file_) + if not self.parents and not self.file_changes: + # Workaround a bug in pre-git-2.22 versions of fast-import with + # the get-mark directive. + file_.write(b'\n') + file_.write(b'\n') + + def first_parent(self): + """ + Return first parent commit + """ + if self.parents: + return self.parents[0] + return None + + def skip(self, new_id=None): + _SKIPPED_COMMITS.add(self.old_id or self.id) + _GitElementWithId.skip(self, new_id) + +class Tag(_GitElementWithId): + """ + This class defines our representation of annotated tag elements. + """ + + def __init__(self, ref, from_ref, + tagger_name, tagger_email, tagger_date, tag_msg, + original_id = None): + _GitElementWithId.__init__(self) + self.old_id = self.id + + # Denote that this is a tag element + self.type = 'tag' + + # Store the name of the tag + self.ref = ref + + # Store the entity being tagged (this should be a commit) + self.from_ref = from_ref + + # Record original id + self.original_id = original_id + + # Store the name of the tagger + self.tagger_name = tagger_name + + # Store the email of the tagger + self.tagger_email = tagger_email + + # Store the date + self.tagger_date = tagger_date + + # Store the tag message + self.message = tag_msg + + def dump(self, file_): + """ + Write this tag element to a file + """ + + self.dumped = 1 + HASH_TO_ID[self.original_id] = self.id + ID_TO_HASH[self.id] = self.original_id + + file_.write(b'tag %s\n' % self.ref) + if (write_marks and self.id): + file_.write(b'mark :%d\n' % self.id) + markfmt = b'from :%d\n' if isinstance(self.from_ref, int) else b'from %s\n' + file_.write(markfmt % self.from_ref) + if self.tagger_name: + file_.write(b'tagger %s <%s> ' % (self.tagger_name, self.tagger_email)) + file_.write(self.tagger_date) + file_.write(b'\n') + file_.write(b'data %d\n%s' % (len(self.message), self.message)) + file_.write(b'\n') + +class Progress(_GitElement): + """ + This class defines our representation of progress elements. The progress + element only contains a progress message, which is printed by fast-import + when it processes the progress output. + """ + + def __init__(self, message): + _GitElement.__init__(self) + + # Denote that this is a progress element + self.type = 'progress' + + # Store the progress message + self.message = message + + def dump(self, file_): + """ + Write this progress element to a file + """ + self.dumped = 1 + + file_.write(b'progress %s\n' % self.message) + file_.write(b'\n') + +class Checkpoint(_GitElement): + """ + This class defines our representation of checkpoint elements. These + elements represent events which force fast-import to close the current + packfile, start a new one, and to save out all current branch refs, tags + and marks. + """ + + def __init__(self): + _GitElement.__init__(self) + + # Denote that this is a checkpoint element + self.type = 'checkpoint' + + def dump(self, file_): + """ + Write this checkpoint element to a file + """ + self.dumped = 1 + + file_.write(b'checkpoint\n') + file_.write(b'\n') + +class LiteralCommand(_GitElement): + """ + This class defines our representation of commands. The literal command + includes only a single line, and is not processed in any special way. + """ + + def __init__(self, line): + _GitElement.__init__(self) + + # Denote that this is a literal element + self.type = 'literal' + + # Store the command + self.line = line + + def dump(self, file_): + """ + Write this progress element to a file + """ + self.dumped = 1 + + file_.write(self.line) + +class Alias(_GitElement): + """ + This class defines our representation of fast-import alias elements. An + alias element is the setting of one mark to the same sha1sum as another, + usually because the newer mark corresponded to a pruned commit. + """ + + def __init__(self, ref, to_ref): + _GitElement.__init__(self) + # Denote that this is a reset + self.type = 'alias' + + self.ref = ref + self.to_ref = to_ref + + def dump(self, file_): + """ + Write this reset element to a file + """ + self.dumped = 1 + + file_.write(b'alias\nmark :%d\nto :%d\n\n' % (self.ref, self.to_ref)) + +class FastExportParser(object): + """ + A class for parsing and handling the output from fast-export. This + class allows the user to register callbacks when various types of + data are encountered in the fast-export output. The basic idea is that, + FastExportParser takes fast-export output, creates the various objects + as it encounters them, the user gets to use/modify these objects via + callbacks, and finally FastExportParser outputs the modified objects + in fast-import format (presumably so they can be used to create a new + repo). + """ + + def __init__(self, + tag_callback = None, commit_callback = None, + blob_callback = None, progress_callback = None, + reset_callback = None, checkpoint_callback = None, + done_callback = None): + # Members below simply store callback functions for the various git + # elements + self._tag_callback = tag_callback + self._blob_callback = blob_callback + self._reset_callback = reset_callback + self._commit_callback = commit_callback + self._progress_callback = progress_callback + self._checkpoint_callback = checkpoint_callback + self._done_callback = done_callback + + # Keep track of which refs appear from the export, and which make it to + # the import (pruning of empty commits, renaming of refs, and creating + # new manual objects and inserting them can cause these to differ). + self._exported_refs = set() + self._imported_refs = set() + + # A list of the branches we've seen, plus the last known commit they + # pointed to. An entry in latest_*commit will be deleted if we get a + # reset for that branch. These are used because of fast-import's weird + # decision to allow having an implicit parent via naming the branch + # instead of requiring branches to be specified via 'from' directives. + self._latest_commit = {} + self._latest_orig_commit = {} + + # A handle to the input source for the fast-export data + self._input = None + + # A handle to the output file for the output we generate (we call dump + # on many of the git elements we create). + self._output = None + + # Stores the contents of the current line of input being parsed + self._currentline = '' + + # Compile some regexes and cache those + self._mark_re = re.compile(br'mark :(\d+)\n$') + self._parent_regexes = {} + parent_regex_rules = (br' :(\d+)\n$', br' ([0-9a-f]{40})\n') + for parent_refname in (b'from', b'merge'): + ans = [re.compile(parent_refname+x) for x in parent_regex_rules] + self._parent_regexes[parent_refname] = ans + self._quoted_string_re = re.compile(br'"(?:[^"\\]|\\.)*"') + self._refline_regexes = {} + for refline_name in (b'reset', b'commit', b'tag', b'progress'): + self._refline_regexes[refline_name] = re.compile(refline_name+b' (.*)\n$') + self._user_regexes = {} + for user in (b'author', b'committer', b'tagger'): + self._user_regexes[user] = re.compile(user + b' (.*?) <(.*?)> (.*)\n$') + + def _advance_currentline(self): + """ + Grab the next line of input + """ + self._currentline = self._input.readline() + + def _parse_optional_mark(self): + """ + If the current line contains a mark, parse it and advance to the + next line; return None otherwise + """ + mark = None + matches = self._mark_re.match(self._currentline) + if matches: + mark = int(matches.group(1)) + self._advance_currentline() + return mark + + def _parse_optional_parent_ref(self, refname): + """ + If the current line contains a reference to a parent commit, then + parse it and advance the current line; otherwise return None. Note + that the name of the reference ('from', 'merge') must match the + refname arg. + """ + orig_baseref, baseref = None, None + rule, altrule = self._parent_regexes[refname] + matches = rule.match(self._currentline) + if matches: + orig_baseref = int(matches.group(1)) + # We translate the parent commit mark to what it needs to be in + # our mark namespace + baseref = _IDS.translate(orig_baseref) + self._advance_currentline() + else: + matches = altrule.match(self._currentline) + if matches: + orig_baseref = matches.group(1) + baseref = orig_baseref + self._advance_currentline() + return orig_baseref, baseref + + def _parse_optional_filechange(self): + """ + If the current line contains a file-change object, then parse it + and advance the current line; otherwise return None. We only care + about file changes of type b'M' and b'D' (these are the only types + of file-changes that fast-export will provide). + """ + filechange = None + changetype = self._currentline[0:1] + if changetype == b'M': + (changetype, mode, idnum, path) = self._currentline.split(None, 3) + if idnum[0:1] == b':': + idnum = idnum[1:] + path = path.rstrip(b'\n') + # We translate the idnum to our id system + if len(idnum) != 40: + idnum = _IDS.translate( int(idnum) ) + if idnum is not None: + if path.startswith(b'"'): + path = PathQuoting.dequote(path) + filechange = FileChange(b'M', path, idnum, mode) + else: + filechange = b'skipped' + self._advance_currentline() + elif changetype == b'D': + (changetype, path) = self._currentline.split(None, 1) + path = path.rstrip(b'\n') + if path.startswith(b'"'): + path = PathQuoting.dequote(path) + filechange = FileChange(b'D', path) + self._advance_currentline() + elif changetype == b'R': # pragma: no cover (now avoid fast-export renames) + rest = self._currentline[2:-1] + if rest.startswith(b'"'): + m = self._quoted_string_re.match(rest) + if not m: + raise SystemExit(_("Couldn't parse rename source")) + orig = PathQuoting.dequote(m.group(0)) + new = rest[m.end()+1:] + else: + orig, new = rest.split(b' ', 1) + if new.startswith(b'"'): + new = PathQuoting.dequote(new) + filechange = FileChange(b'R', orig, new) + self._advance_currentline() + return filechange + + def _parse_original_id(self): + original_id = self._currentline[len(b'original-oid '):].rstrip() + self._advance_currentline() + return original_id + + def _parse_encoding(self): + encoding = self._currentline[len(b'encoding '):].rstrip() + self._advance_currentline() + return encoding + + def _parse_ref_line(self, refname): + """ + Parses string data (often a branch name) from current-line. The name of + the string data must match the refname arg. The program will crash if + current-line does not match, so current-line will always be advanced if + this method returns. + """ + matches = self._refline_regexes[refname].match(self._currentline) + if not matches: + raise SystemExit(_("Malformed %(refname)s line: '%(line)s'") % + ({'refname': refname, 'line':self._currentline}) + ) # pragma: no cover + ref = matches.group(1) + self._advance_currentline() + return ref + + def _parse_user(self, usertype): + """ + Get user name, email, datestamp from current-line. Current-line will + be advanced. + """ + user_regex = self._user_regexes[usertype] + (name, email, when) = user_regex.match(self._currentline).groups() + + self._advance_currentline() + return (name, email, when) + + def _parse_data(self): + """ + Reads data from _input. Current-line will be advanced until it is beyond + the data. + """ + fields = self._currentline.split() + assert fields[0] == b'data' + size = int(fields[1]) + data = self._input.read(size) + self._advance_currentline() + if self._currentline == b'\n': + self._advance_currentline() + return data + + def _parse_blob(self): + """ + Parse input data into a Blob object. Once the Blob has been created, it + will be handed off to the appropriate callbacks. Current-line will be + advanced until it is beyond this blob's data. The Blob will be dumped + to _output once everything else is done (unless it has been skipped by + the callback). + """ + # Parse the Blob + self._advance_currentline() + id_ = self._parse_optional_mark() + + original_id = None + if self._currentline.startswith(b'original-oid'): + original_id = self._parse_original_id(); + + data = self._parse_data() + if self._currentline == b'\n': + self._advance_currentline() + + # Create the blob + blob = Blob(data, original_id) + + # If fast-export text had a mark for this blob, need to make sure this + # mark translates to the blob's true id. + if id_: + blob.old_id = id_ + _IDS.record_rename(id_, blob.id) + + # Call any user callback to allow them to use/modify the blob + if self._blob_callback: + self._blob_callback(blob) + + # Now print the resulting blob + if not blob.dumped: + blob.dump(self._output) + + def _parse_reset(self): + """ + Parse input data into a Reset object. Once the Reset has been created, + it will be handed off to the appropriate callbacks. Current-line will + be advanced until it is beyond the reset data. The Reset will be dumped + to _output once everything else is done (unless it has been skipped by + the callback). + """ + # Parse the Reset + ref = self._parse_ref_line(b'reset') + self._exported_refs.add(ref) + ignoreme, from_ref = self._parse_optional_parent_ref(b'from') + if self._currentline == b'\n': + self._advance_currentline() + + # fast-export likes to print extraneous resets that serve no purpose. + # While we could continue processing such resets, that is a waste of + # resources. Also, we want to avoid recording that this ref was + # seen in such cases, since this ref could be rewritten to nothing. + if not from_ref: + self._latest_commit.pop(ref, None) + self._latest_orig_commit.pop(ref, None) + return + + # Create the reset + reset = Reset(ref, from_ref) + + # Call any user callback to allow them to modify the reset + if self._reset_callback: + self._reset_callback(reset) + + # Update metadata + self._latest_commit[reset.ref] = reset.from_ref + self._latest_orig_commit[reset.ref] = reset.from_ref + + # Now print the resulting reset + if not reset.dumped: + self._imported_refs.add(reset.ref) + reset.dump(self._output) + + def _parse_commit(self): + """ + Parse input data into a Commit object. Once the Commit has been created, + it will be handed off to the appropriate callbacks. Current-line will + be advanced until it is beyond the commit data. The Commit will be dumped + to _output once everything else is done (unless it has been skipped by + the callback OR the callback has removed all file-changes from the commit). + """ + # Parse the Commit. This may look involved, but it's pretty simple; it only + # looks bad because a commit object contains many pieces of data. + branch = self._parse_ref_line(b'commit') + self._exported_refs.add(branch) + id_ = self._parse_optional_mark() + + original_id = None + if self._currentline.startswith(b'original-oid'): + original_id = self._parse_original_id(); + + author_name = None + author_email = None + if self._currentline.startswith(b'author'): + (author_name, author_email, author_date) = self._parse_user(b'author') + + (committer_name, committer_email, committer_date) = \ + self._parse_user(b'committer') + + if not author_name and not author_email: + (author_name, author_email, author_date) = \ + (committer_name, committer_email, committer_date) + + encoding = None + if self._currentline.startswith(b'encoding '): + encoding = self._parse_encoding() + + commit_msg = self._parse_data() + + pinfo = [self._parse_optional_parent_ref(b'from')] + # Due to empty pruning, we can have real 'from' and 'merge' lines that + # due to commit rewriting map to a parent of None. We need to record + # 'from' if its non-None, and we need to parse all 'merge' lines. + while self._currentline.startswith(b'merge '): + pinfo.append(self._parse_optional_parent_ref(b'merge')) + orig_parents, parents = [list(tmp) for tmp in zip(*pinfo)] + + # No parents is oddly represented as [None] instead of [], due to the + # special 'from' handling. Convert it here to a more canonical form. + if parents == [None]: + parents = [] + if orig_parents == [None]: + orig_parents = [] + + # fast-import format is kinda stupid in that it allows implicit parents + # based on the branch name instead of requiring them to be specified by + # 'from' directives. The only way to get no parent is by using a reset + # directive first, which clears the latest_commit_for_this_branch tracking. + if not orig_parents and self._latest_commit.get(branch): + parents = [self._latest_commit[branch]] + if not orig_parents and self._latest_orig_commit.get(branch): + orig_parents = [self._latest_orig_commit[branch]] + + # Get the list of file changes + file_changes = [] + file_change = self._parse_optional_filechange() + had_file_changes = file_change is not None + while file_change: + if not (type(file_change) == bytes and file_change == b'skipped'): + file_changes.append(file_change) + file_change = self._parse_optional_filechange() + if self._currentline == b'\n': + self._advance_currentline() + + # Okay, now we can finally create the Commit object + commit = Commit(branch, + author_name, author_email, author_date, + committer_name, committer_email, committer_date, + commit_msg, file_changes, parents, original_id, encoding) + + # If fast-export text had a mark for this commit, need to make sure this + # mark translates to the commit's true id. + if id_: + commit.old_id = id_ + _IDS.record_rename(id_, commit.id) + + # Call any user callback to allow them to modify the commit + aux_info = {'orig_parents': orig_parents, + 'had_file_changes': had_file_changes} + if self._commit_callback: + self._commit_callback(commit, aux_info) + + # Now print the resulting commit, or if prunable skip it + self._latest_orig_commit[branch] = commit.id + if not (commit.old_id or commit.id) in _SKIPPED_COMMITS: + self._latest_commit[branch] = commit.id + if not commit.dumped: + self._imported_refs.add(commit.branch) + commit.dump(self._output) + + def _parse_tag(self): + """ + Parse input data into a Tag object. Once the Tag has been created, + it will be handed off to the appropriate callbacks. Current-line will + be advanced until it is beyond the tag data. The Tag will be dumped + to _output once everything else is done (unless it has been skipped by + the callback). + """ + # Parse the Tag + tag = self._parse_ref_line(b'tag') + self._exported_refs.add(b'refs/tags/'+tag) + id_ = self._parse_optional_mark() + ignoreme, from_ref = self._parse_optional_parent_ref(b'from') + + original_id = None + if self._currentline.startswith(b'original-oid'): + original_id = self._parse_original_id(); + + tagger_name, tagger_email, tagger_date = None, None, None + if self._currentline.startswith(b'tagger'): + (tagger_name, tagger_email, tagger_date) = self._parse_user(b'tagger') + tag_msg = self._parse_data() + if self._currentline == b'\n': + self._advance_currentline() + + # Create the tag + tag = Tag(tag, from_ref, + tagger_name, tagger_email, tagger_date, tag_msg, + original_id) + + # If fast-export text had a mark for this tag, need to make sure this + # mark translates to the tag's true id. + if id_: + tag.old_id = id_ + _IDS.record_rename(id_, tag.id) + + # Call any user callback to allow them to modify the tag + if self._tag_callback: + self._tag_callback(tag) + + # The tag might not point at anything that still exists (self.from_ref + # will be None if the commit it pointed to and all its ancestors were + # pruned due to being empty) + if tag.from_ref: + # Print out this tag's information + if not tag.dumped: + self._imported_refs.add(b'refs/tags/'+tag.ref) + tag.dump(self._output) + else: + tag.skip() + + def _parse_progress(self): + """ + Parse input data into a Progress object. Once the Progress has + been created, it will be handed off to the appropriate + callbacks. Current-line will be advanced until it is beyond the + progress data. The Progress will be dumped to _output once + everything else is done (unless it has been skipped by the callback). + """ + # Parse the Progress + message = self._parse_ref_line(b'progress') + if self._currentline == b'\n': + self._advance_currentline() + + # Create the progress message + progress = Progress(message) + + # Call any user callback to allow them to modify the progress messsage + if self._progress_callback: + self._progress_callback(progress) + + # NOTE: By default, we do NOT print the progress message; git + # fast-import would write it to fast_import_pipes which could mess with + # our parsing of output from the 'ls' and 'get-mark' directives we send + # to fast-import. If users want these messages, they need to process + # and handle them in the appropriate callback above. + + def _parse_checkpoint(self): + """ + Parse input data into a Checkpoint object. Once the Checkpoint has + been created, it will be handed off to the appropriate + callbacks. Current-line will be advanced until it is beyond the + checkpoint data. The Checkpoint will be dumped to _output once + everything else is done (unless it has been skipped by the callback). + """ + # Parse the Checkpoint + self._advance_currentline() + if self._currentline == b'\n': + self._advance_currentline() + + # Create the checkpoint + checkpoint = Checkpoint() + + # Call any user callback to allow them to drop the checkpoint + if self._checkpoint_callback: + self._checkpoint_callback(checkpoint) + + # NOTE: By default, we do NOT print the checkpoint message; although it + # we would only realistically get them with --stdin, the fact that we + # are filtering makes me think the checkpointing is less likely to be + # reasonable. In fact, I don't think it's necessary in general. If + # users do want it, they should process it in the checkpoint_callback. + + def _parse_literal_command(self): + """ + Parse literal command. Then just dump the line as is. + """ + # Create the literal command object + command = LiteralCommand(self._currentline) + self._advance_currentline() + + # Now print the resulting literal command + if not command.dumped: + command.dump(self._output) + + def insert(self, obj): + assert not obj.dumped + obj.dump(self._output) + if type(obj) == Commit: + self._imported_refs.add(obj.branch) + elif type(obj) in (Reset, Tag): + self._imported_refs.add(obj.ref) + + def run(self, input, output): + """ + This method filters fast export output. + """ + # Set input. If no args provided, use stdin. + self._input = input + self._output = output + + # Run over the input and do the filtering + self._advance_currentline() + while self._currentline: + if self._currentline.startswith(b'blob'): + self._parse_blob() + elif self._currentline.startswith(b'reset'): + self._parse_reset() + elif self._currentline.startswith(b'commit'): + self._parse_commit() + elif self._currentline.startswith(b'tag'): + self._parse_tag() + elif self._currentline.startswith(b'progress'): + self._parse_progress() + elif self._currentline.startswith(b'checkpoint'): + self._parse_checkpoint() + elif self._currentline.startswith(b'feature'): + self._parse_literal_command() + elif self._currentline.startswith(b'option'): + self._parse_literal_command() + elif self._currentline.startswith(b'done'): + if self._done_callback: + self._done_callback() + self._parse_literal_command() + # Prevent confusion from others writing additional stuff that'll just + # be ignored + self._output.close() + elif self._currentline.startswith(b'#'): + self._parse_literal_command() + elif self._currentline.startswith(b'get-mark') or \ + self._currentline.startswith(b'cat-blob') or \ + self._currentline.startswith(b'ls'): + raise SystemExit(_("Unsupported command: '%s'") % self._currentline) + else: + raise SystemExit(_("Could not parse line: '%s'") % self._currentline) + + def get_exported_and_imported_refs(self): + return self._exported_refs, self._imported_refs + +def record_id_rename(old_id, new_id): + """ + Register a new translation + """ + handle_transitivity = True + _IDS.record_rename(old_id, new_id, handle_transitivity) + +# Internal globals +_IDS = _IDs() +_SKIPPED_COMMITS = set() +HASH_TO_ID = {} +ID_TO_HASH = {} + +class SubprocessWrapper(object): + @staticmethod + def decodify(args): + if type(args) == str: + return args + else: + assert type(args) == list + return [decode(x) if type(x)==bytes else x for x in args] + + @staticmethod + def call(*args, **kwargs): + if 'cwd' in kwargs: + kwargs['cwd'] = decode(kwargs['cwd']) + return subprocess.call(SubprocessWrapper.decodify(*args), **kwargs) + + @staticmethod + def check_output(*args, **kwargs): + if 'cwd' in kwargs: + kwargs['cwd'] = decode(kwargs['cwd']) + return subprocess.check_output(SubprocessWrapper.decodify(*args), **kwargs) + + @staticmethod + def check_call(*args, **kwargs): # pragma: no cover # used by filter-lamely + if 'cwd' in kwargs: + kwargs['cwd'] = decode(kwargs['cwd']) + return subprocess.check_call(SubprocessWrapper.decodify(*args), **kwargs) + + @staticmethod + def Popen(*args, **kwargs): + if 'cwd' in kwargs: + kwargs['cwd'] = decode(kwargs['cwd']) + return subprocess.Popen(SubprocessWrapper.decodify(*args), **kwargs) + +subproc = subprocess +if platform.system() == 'Windows' or 'PRETEND_UNICODE_ARGS' in os.environ: + subproc = SubprocessWrapper + +class GitUtils(object): + @staticmethod + def get_commit_count(repo, *args): + """ + Return the number of commits that have been made on repo. + """ + if not args: + args = ['--all'] + if len(args) == 1 and isinstance(args[0], list): + args = args[0] + p = subproc.Popen(["git", "rev-list", "--count"] + args, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=repo) + if p.wait() != 0: + raise SystemExit(_("%s does not appear to be a valid git repository") + % decode(repo)) + return int(p.stdout.read()) + + @staticmethod + def get_total_objects(repo): + """ + Return the number of objects (both packed and unpacked) + """ + p1 = subproc.Popen(["git", "count-objects", "-v"], + stdout=subprocess.PIPE, cwd=repo) + lines = p1.stdout.read().splitlines() + # Return unpacked objects + packed-objects + return int(lines[0].split()[1]) + int(lines[2].split()[1]) + + @staticmethod + def is_repository_bare(repo_working_dir): + out = subproc.check_output('git rev-parse --is-bare-repository'.split(), + cwd=repo_working_dir) + return (out.strip() == b'true') + + @staticmethod + def determine_git_dir(repo_working_dir): + d = subproc.check_output('git rev-parse --git-dir'.split(), + cwd=repo_working_dir).strip() + if repo_working_dir==b'.' or d.startswith(b'/'): + return d + return os.path.join(repo_working_dir, d) + + @staticmethod + def get_refs(repo_working_dir): + try: + output = subproc.check_output('git show-ref'.split(), + cwd=repo_working_dir) + except subprocess.CalledProcessError as e: + # If error code is 1, there just aren't any refs; i.e. new repo. + # If error code is other than 1, some other error (e.g. not a git repo) + if e.returncode != 1: + raise SystemExit('fatal: {}'.format(e)) + output = '' + return dict(reversed(x.split()) for x in output.splitlines()) + + @staticmethod + def get_blob_sizes(quiet = False): + blob_size_progress = ProgressWriter() + num_blobs = 0 + processed_blobs_msg = _("Processed %d blob sizes") + + # Get sizes of blobs by sha1 + cmd = '--batch-check=%(objectname) %(objecttype) ' + \ + '%(objectsize) %(objectsize:disk)' + cf = subproc.Popen(['git', 'cat-file', '--batch-all-objects', cmd], + bufsize = -1, + stdout = subprocess.PIPE) + unpacked_size = {} + packed_size = {} + for line in cf.stdout: + sha, objtype, objsize, objdisksize = line.split() + objsize, objdisksize = int(objsize), int(objdisksize) + if objtype == b'blob': + unpacked_size[sha] = objsize + packed_size[sha] = objdisksize + num_blobs += 1 + if not quiet: + blob_size_progress.show(processed_blobs_msg % num_blobs) + cf.wait() + if not quiet: + blob_size_progress.finish() + return unpacked_size, packed_size + + @staticmethod + def get_file_changes(repo, parent_hash, commit_hash): + """ + Return a FileChanges list with the differences between parent_hash + and commit_hash + """ + file_changes = [] + + cmd = ["git", "diff-tree", "-r", parent_hash, commit_hash] + output = subproc.check_output(cmd, cwd=repo) + for line in output.splitlines(): + fileinfo, path = line.split(b'\t', 1) + if path.startswith(b'"'): + path = PathQuoting.dequote(path) + oldmode, mode, oldhash, newhash, changetype = fileinfo.split() + if changetype == b'D': + file_changes.append(FileChange(b'D', path)) + elif changetype in (b'A', b'M', b'T'): + identifier = HASH_TO_ID.get(newhash, newhash) + file_changes.append(FileChange(b'M', path, identifier, mode)) + else: # pragma: no cover + raise SystemExit("Unknown change type for line {}".format(line)) + + return file_changes + + @staticmethod + def print_my_version(): + with open(__file__, 'br') as f: + contents = f.read() + # If people replaced @@LOCALEDIR@@ string to point at their local + # directory, undo it so we can get original source version. + contents = re.sub(br'\A#\!.*', + br'#!/usr/bin/env python3', contents) + contents = re.sub(br'(\("GIT_TEXTDOMAINDIR"\) or ").*"', + br'\1@@LOCALEDIR@@"', contents) + + cmd = 'git hash-object --stdin'.split() + version = subproc.check_output(cmd, input=contents).strip() + print(decode(version[0:12])) + +class FilteringOptions(object): + default_replace_text = b'***REMOVED***' + class AppendFilter(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + user_path = values + suffix = option_string[len('--path-'):] or 'match' + if suffix.startswith('rename'): + mod_type = 'rename' + match_type = option_string[len('--path-rename-'):] or 'match' + values = values.split(b':') + if len(values) != 2: + raise SystemExit(_("Error: --path-rename expects one colon in its" + " argument: .")) + if values[0] and values[1] and not ( + values[0].endswith(b'/') == values[1].endswith(b'/')): + raise SystemExit(_("Error: With --path-rename, if OLD_NAME and " + "NEW_NAME are both non-empty and either ends " + "with a slash then both must.")) + if any(v.startswith(b'/') for v in values): + raise SystemExit(_("Error: Pathnames cannot begin with a '/'")) + components = values[0].split(b'/') + values[1].split(b'/') + else: + mod_type = 'filter' + match_type = suffix + components = values.split(b'/') + if values.startswith(b'/'): + raise SystemExit(_("Error: Pathnames cannot begin with a '/'")) + for illegal_path in [b'.', b'..']: + if illegal_path in components: + raise SystemExit(_("Error: Invalid path component '%s' found in '%s'") + % (decode(illegal_path), decode(user_path))) + if match_type == 'regex': + values = re.compile(values) + items = getattr(namespace, self.dest, []) or [] + items.append((mod_type, match_type, values)) + if (match_type, mod_type) == ('glob', 'filter'): + if not values.endswith(b'*'): + extension = b'*' if values.endswith(b'/') else b'/*' + items.append((mod_type, match_type, values+extension)) + setattr(namespace, self.dest, items) + + class HelperFilter(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + af = FilteringOptions.AppendFilter(dest='path_changes', + option_strings=None) + dirname = values if values[-1:] == b'/' else values+b'/' + if option_string == '--subdirectory-filter': + af(parser, namespace, dirname, '--path-match') + af(parser, namespace, dirname+b':', '--path-rename') + elif option_string == '--to-subdirectory-filter': + af(parser, namespace, b':'+dirname, '--path-rename') + else: + raise SystemExit(_("Error: HelperFilter given invalid option_string: %s") + % option_string) # pragma: no cover + + class FileWithPathsFilter(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + if not namespace.path_changes: + namespace.path_changes = [] + namespace.path_changes += FilteringOptions.get_paths_from_file(values) + + @staticmethod + def create_arg_parser(): + # Include usage in the summary, so we can put the description first + summary = _('''Rewrite (or analyze) repository history + + git-filter-repo destructively rewrites history (unless --analyze or + --dry-run are given) according to specified rules. It refuses to do any + rewriting unless either run from a clean fresh clone, or --force was + given. + + Basic Usage: + git-filter-repo --analyze + git-filter-repo [FILTER/RENAME/CONTROL OPTIONS] + + See EXAMPLES section for details. + ''').rstrip() + + # Provide a long helpful examples section + example_text = _('''CALLBACKS + + All callback functions are of the same general format. For a command line + argument like + --foo-callback 'BODY' + + the following code will be compiled and called: + def foo_callback(foo): + BODY + + Thus, to replace 'Jon' with 'John' in author/committer/tagger names: + git filter-repo --name-callback 'return name.replace(b"Jon", b"John")' + + To remove all 'Tested-by' tags in commit (or tag) messages: + git filter-repo --message-callback 'return re.sub(br"\\nTested-by:.*", "", message)' + + To remove all .DS_Store files: + git filter-repo --filename-callback 'return None if os.path.basename(filename) == b".DS_Store" else filename' + + Note that if BODY resolves to a filename, then the contents of that file + will be used as the BODY in the callback function. + + For more detailed examples and explanations AND caveats, see + https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#CALLBACKS + +EXAMPLES + + To get a bunch of reports mentioning renames that have occurred in + your repo and listing sizes of objects aggregated by any of path, + directory, extension, or blob-id: + git filter-repo --analyze + + (These reports can help you choose how to filter your repo; it can + be useful to re-run this command after filtering to regenerate the + report and verify the changes look correct.) + + To extract the history that touched just 'guides' and 'tools/releases': + git filter-repo --path guides/ --path tools/releases + + To remove foo.zip and bar/baz/zips from every revision in history: + git filter-repo --path foo.zip --path bar/baz/zips/ --invert-paths + + To replace the text 'password' with 'p455w0rd': + git filter-repo --replace-text <(echo "password==>p455w0rd") + + To use the current version of the .mailmap file to update authors, + committers, and taggers throughout history and make it permanent: + git filter-repo --use-mailmap + + To extract the history of 'src/', rename all files to have a new leading + directory 'my-module' (e.g. src/foo.java -> my-module/src/foo.java), and + add a 'my-module-' prefix to all tags: + git filter-repo --path src/ --to-subdirectory-filter my-module --tag-rename '':'my-module-' + + For more detailed examples and explanations, see + https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#EXAMPLES''') + + # Create the basic parser + parser = argparse.ArgumentParser(description=summary, + usage = argparse.SUPPRESS, + add_help = False, + epilog = example_text, + formatter_class=argparse.RawDescriptionHelpFormatter) + + analyze = parser.add_argument_group(title=_("Analysis")) + analyze.add_argument('--analyze', action='store_true', + help=_("Analyze repository history and create a report that may be " + "useful in determining what to filter in a subsequent run. " + "Will not modify your repo.")) + analyze.add_argument('--report-dir', + metavar='DIR_OR_FILE', + type=os.fsencode, + dest='report_dir', + help=_("Directory to write report, defaults to GIT_DIR/filter_repo/analysis," + "refuses to run if exists, --force delete existing dir first.")) + + path = parser.add_argument_group(title=_("Filtering based on paths " + "(see also --filename-callback)"), + description=textwrap.dedent(_(""" + These options specify the paths to select. Note that much like git + itself, renames are NOT followed so you may need to specify multiple + paths, e.g. `--path olddir/ --path newdir/` + """[1:]))) + + path.add_argument('--invert-paths', action='store_false', dest='inclusive', + help=_("Invert the selection of files from the specified " + "--path-{match,glob,regex} options below, i.e. only select " + "files matching none of those options.")) + + path.add_argument('--path-match', '--path', metavar='DIR_OR_FILE', + type=os.fsencode, + action=FilteringOptions.AppendFilter, dest='path_changes', + help=_("Exact paths (files or directories) to include in filtered " + "history. Multiple --path options can be specified to get " + "a union of paths.")) + path.add_argument('--path-glob', metavar='GLOB', type=os.fsencode, + action=FilteringOptions.AppendFilter, dest='path_changes', + help=_("Glob of paths to include in filtered history. Multiple " + "--path-glob options can be specified to get a union of " + "paths.")) + path.add_argument('--path-regex', metavar='REGEX', type=os.fsencode, + action=FilteringOptions.AppendFilter, dest='path_changes', + help=_("Regex of paths to include in filtered history. Multiple " + "--path-regex options can be specified to get a union of " + "paths")) + path.add_argument('--use-base-name', action='store_true', + help=_("Match on file base name instead of full path from the top " + "of the repo. Incompatible with --path-rename, and " + "incompatible with matching against directory names.")) + + rename = parser.add_argument_group(title=_("Renaming based on paths " + "(see also --filename-callback)")) + rename.add_argument('--path-rename', '--path-rename-match', + metavar='OLD_NAME:NEW_NAME', dest='path_changes', type=os.fsencode, + action=FilteringOptions.AppendFilter, + help=_("Path to rename; if filename or directory matches OLD_NAME " + "rename to NEW_NAME. Multiple --path-rename options can be " + "specified. NOTE: If you combine filtering options with " + "renaming ones, do not rely on a rename argument to select " + "paths; you also need a filter to select them.")) + + helpers = parser.add_argument_group(title=_("Path shortcuts")) + helpers.add_argument('--paths-from-file', metavar='FILENAME', + type=os.fsencode, + action=FilteringOptions.FileWithPathsFilter, dest='path_changes', + help=_("Specify several path filtering and renaming directives, one " + "per line. Lines with '==>' in them specify path renames, " + "and lines can begin with 'literal:' (the default), 'glob:', " + "or 'regex:' to specify different matching styles. Blank " + "lines and lines starting with a '#' are ignored.")) + helpers.add_argument('--subdirectory-filter', metavar='DIRECTORY', + action=FilteringOptions.HelperFilter, type=os.fsencode, + help=_("Only look at history that touches the given subdirectory " + "and treat that directory as the project root. Equivalent " + "to using '--path DIRECTORY/ --path-rename DIRECTORY/:'")) + helpers.add_argument('--to-subdirectory-filter', metavar='DIRECTORY', + action=FilteringOptions.HelperFilter, type=os.fsencode, + help=_("Treat the project root as instead being under DIRECTORY. " + "Equivalent to using '--path-rename :DIRECTORY/'")) + + contents = parser.add_argument_group(title=_("Content editing filters " + "(see also --blob-callback)")) + contents.add_argument('--replace-text', metavar='EXPRESSIONS_FILE', + help=_("A file with expressions that, if found, will be replaced. " + "By default, each expression is treated as literal text, " + "but 'regex:' and 'glob:' prefixes are supported. You can " + "end the line with '==>' and some replacement text to " + "choose a replacement choice other than the default of '{}'." + .format(decode(FilteringOptions.default_replace_text)))) + contents.add_argument('--strip-blobs-bigger-than', metavar='SIZE', + dest='max_blob_size', default=0, + help=_("Strip blobs (files) bigger than specified size (e.g. '5M', " + "'2G', etc)")) + contents.add_argument('--strip-blobs-with-ids', metavar='BLOB-ID-FILENAME', + help=_("Read git object ids from each line of the given file, and " + "strip all of them from history")) + + refrename = parser.add_argument_group(title=_("Renaming of refs " + "(see also --refname-callback)")) + refrename.add_argument('--tag-rename', metavar='OLD:NEW', type=os.fsencode, + help=_("Rename tags starting with OLD to start with NEW. For " + "example, --tag-rename foo:bar will rename tag foo-1.2.3 " + "to bar-1.2.3; either OLD or NEW can be empty.")) + + messages = parser.add_argument_group(title=_("Filtering of commit messages " + "(see also --message-callback)")) + messages.add_argument('--replace-message', metavar='EXPRESSIONS_FILE', + help=_("A file with expressions that, if found in commit messages, " + "will be replaced. This file uses the same syntax as " + "--replace-text.")) + messages.add_argument('--preserve-commit-hashes', action='store_true', + help=_("By default, since commits are rewritten and thus gain new " + "hashes, references to old commit hashes in commit messages " + "are replaced with new commit hashes (abbreviated to the same " + "length as the old reference). Use this flag to turn off " + "updating commit hashes in commit messages.")) + messages.add_argument('--preserve-commit-encoding', action='store_true', + help=_("Do not reencode commit messages into UTF-8. By default, if " + "the commit object specifies an encoding for the commit " + "message, the message is re-encoded into UTF-8.")) + + people = parser.add_argument_group(title=_("Filtering of names & emails " + "(see also --name-callback " + "and --email-callback)")) + people.add_argument('--mailmap', dest='mailmap', metavar='FILENAME', + type=os.fsencode, + help=_("Use specified mailmap file (see git-shortlog(1) for " + "details on the format) when rewriting author, committer, " + "and tagger names and emails. If the specified file is " + "part of git history, historical versions of the file will " + "be ignored; only the current contents are consulted.")) + people.add_argument('--use-mailmap', dest='mailmap', + action='store_const', const=b'.mailmap', + help=_("Same as: '--mailmap .mailmap' ")) + + parents = parser.add_argument_group(title=_("Parent rewriting")) + parents.add_argument('--replace-refs', default=None, + choices=['delete-no-add', 'delete-and-add', + 'update-no-add', 'update-or-add', + 'update-and-add'], + help=_("Replace refs (see git-replace(1)) are used to rewrite " + "parents (unless turned off by the usual git mechanism); this " + "flag specifies what do do with those refs afterward. " + "Replace refs can either be deleted or updated to point at new " + "commit hashes. Also, new replace refs can be added for each " + "commit rewrite. With 'update-or-add', new replace refs are " + "only added for commit rewrites that aren't used to update an " + "existing replace ref. default is 'update-and-add' if " + "$GIT_DIR/filter-repo/already_ran does not exist; " + "'update-or-add' otherwise.")) + parents.add_argument('--prune-empty', default='auto', + choices=['always', 'auto', 'never'], + help=_("Whether to prune empty commits. 'auto' (the default) means " + "only prune commits which become empty (not commits which were " + "empty in the original repo, unless their parent was pruned). " + "When the parent of a commit is pruned, the first non-pruned " + "ancestor becomes the new parent.")) + parents.add_argument('--prune-degenerate', default='auto', + choices=['always', 'auto', 'never'], + help=_("Since merge commits are needed for history topology, they " + "are typically exempt from pruning. However, they can become " + "degenerate with the pruning of other commits (having fewer " + "than two parents, having one commit serve as both parents, or " + "having one parent as the ancestor of the other.) If such " + "merge commits have no file changes, they can be pruned. The " + "default ('auto') is to only prune empty merge commits which " + "become degenerate (not which started as such).")) + parents.add_argument('--no-ff', action='store_true', + help=_("Even if the first parent is or becomes an ancestor of another " + "parent, do not prune it. This modifies how " + "--prune-degenerate behaves, and may be useful in projects who " + "always use merge --no-ff.")) + + callback = parser.add_argument_group(title=_("Generic callback code snippets")) + callback.add_argument('--filename-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing filenames; see CALLBACKS " + "sections below.")) + callback.add_argument('--message-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing messages (both commit " + "messages and tag messages); see CALLBACKS section below.")) + callback.add_argument('--name-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing names of people; see " + "CALLBACKS section below.")) + callback.add_argument('--email-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing emails addresses; see " + "CALLBACKS section below.")) + callback.add_argument('--refname-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing refnames; see CALLBACKS " + "section below.")) + + callback.add_argument('--blob-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing blob objects; see " + "CALLBACKS section below.")) + callback.add_argument('--commit-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing commit objects; see " + "CALLBACKS section below.")) + callback.add_argument('--tag-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing tag objects; see CALLBACKS " + "section below.")) + callback.add_argument('--reset-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing reset objects; see " + "CALLBACKS section below.")) + + desc = _( + "Specifying alternate source or target locations implies --partial,\n" + "except that the normal default for --replace-refs is used. However,\n" + "unlike normal uses of --partial, this doesn't risk mixing old and new\n" + "history since the old and new histories are in different repositories.") + location = parser.add_argument_group(title=_("Location to filter from/to"), + description=desc) + location.add_argument('--source', type=os.fsencode, + help=_("Git repository to read from")) + location.add_argument('--target', type=os.fsencode, + help=_("Git repository to overwrite with filtered history")) + + misc = parser.add_argument_group(title=_("Miscellaneous options")) + misc.add_argument('--help', '-h', action='store_true', + help=_("Show this help message and exit.")) + misc.add_argument('--version', action='store_true', + help=_("Display filter-repo's version and exit.")) + misc.add_argument('--force', '-f', action='store_true', + help=_("Rewrite repository history even if the current repo does not " + "look like a fresh clone. History rewriting is irreversible " + "(and includes immediate pruning of reflogs and old objects), " + "so be cautious about using this flag.")) + misc.add_argument('--partial', action='store_true', + help=_("Do a partial history rewrite, resulting in the mixture of " + "old and new history. This implies a default of " + "update-no-add for --replace-refs, disables rewriting " + "refs/remotes/origin/* to refs/heads/*, disables removing " + "of the 'origin' remote, disables removing unexported refs, " + "disables expiring the reflog, and disables the automatic " + "post-filter gc. Also, this modifies --tag-rename and " + "--refname-callback options such that instead of replacing " + "old refs with new refnames, it will instead create new " + "refs and keep the old ones around. Use with caution.")) + # WARNING: --refs presents a problem with become-degenerate pruning: + # * Excluding a commit also excludes its ancestors so when some other + # commit has an excluded ancestor as a parent we have no way of + # knowing what it is an ancestor of without doing a special + # full-graph walk. + misc.add_argument('--refs', nargs='+', + help=_("Limit history rewriting to the specified refs. Implies " + "--partial. In addition to the normal caveats of --partial " + "(mixing old and new history, no automatic remapping of " + "refs/remotes/origin/* to refs/heads/*, etc.), this also may " + "cause problems for pruning of degenerate empty merge " + "commits when negative revisions are specified.")) + + misc.add_argument('--dry-run', action='store_true', + help=_("Do not change the repository. Run `git fast-export` and " + "filter its output, and save both the original and the " + "filtered version for comparison. This also disables " + "rewriting commit messages due to not knowing new commit " + "IDs and disables filtering of some empty commits due to " + "inability to query the fast-import backend." )) + misc.add_argument('--debug', action='store_true', + help=_("Print additional information about operations being " + "performed and commands being run. When used together " + "with --dry-run, also show extra information about what " + "would be run.")) + # WARNING: --state-branch has some problems: + # * It does not work well with manually inserted objects (user creating + # Blob() or Commit() or Tag() objects and calling + # RepoFilter.insert(obj) on them). + # * It does not work well with multiple source or multiple target repos + # * It doesn't work so well with pruning become-empty commits (though + # --refs doesn't work so well with it either) + # These are probably fixable, given some work (e.g. re-importing the + # graph at the beginning to get the AncestryGraph right, doing our own + # export of marks instead of using fast-export --export-marks, etc.), but + # for now just hide the option. + misc.add_argument('--state-branch', + #help=_("Enable incremental filtering by saving the mapping of old " + # "to new objects to the specified branch upon exit, and" + # "loading that mapping from that branch (if it exists) " + # "upon startup.")) + help=argparse.SUPPRESS) + misc.add_argument('--stdin', action='store_true', + help=_("Instead of running `git fast-export` and filtering its " + "output, filter the fast-export stream from stdin. The " + "stdin must be in the expected input format (e.g. it needs " + "to include original-oid directives).")) + misc.add_argument('--quiet', action='store_true', + help=_("Pass --quiet to other git commands called")) + return parser + + @staticmethod + def sanity_check_args(args): + if args.analyze and args.path_changes: + raise SystemExit(_("Error: --analyze is incompatible with --path* flags; " + "it's a read-only operation.")) + if args.analyze and args.stdin: + raise SystemExit(_("Error: --analyze is incompatible with --stdin.")) + # If no path_changes are found, initialize with empty list but mark as + # not inclusive so that all files match + if args.path_changes == None: + args.path_changes = [] + args.inclusive = False + else: + # Similarly, if we have no filtering paths, then no path should be + # filtered out. Based on how newname() works, the easiest way to + # achieve that is setting args.inclusive to False. + if not any(x[0] == 'filter' for x in args.path_changes): + args.inclusive = False + # Also check for incompatible --use-base-name and --path-rename flags. + if args.use_base_name: + if any(x[0] == 'rename' for x in args.path_changes): + raise SystemExit(_("Error: --use-base-name and --path-rename are " + "incompatible.")) + # Also throw some sanity checks on git version here; + # PERF: remove these checks once new enough git versions are common + p = subproc.Popen('git fast-export -h'.split(), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + output = p.stdout.read() + if b'--anonymize-map' not in output: # pragma: no cover + global date_format_permissive + date_format_permissive = False + if b'--mark-tags' not in output: # pragma: no cover + global write_marks + write_marks = False + if args.state_branch: + # We need a version of git-fast-export with --mark-tags + raise SystemExit(_("Error: need git >= 2.24.0")) + if b'--reencode' not in output: # pragma: no cover + if args.preserve_commit_encoding: + # We need a version of git-fast-export with --reencode + raise SystemExit(_("Error: need git >= 2.23.0")) + else: + # Set args.preserve_commit_encoding to None which we'll check for later + # to avoid passing --reencode=yes to fast-export (that option was the + # default prior to git-2.23) + args.preserve_commit_encoding = None + # If we don't have fast-exoprt --reencode, we may also be missing + # diff-tree --combined-all-paths, which is even more important... + p = subproc.Popen('git diff-tree -h'.split(), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + output = p.stdout.read() + if b'--combined-all-paths' not in output: + # We need a version of git-diff-tree with --combined-all-paths + raise SystemExit(_("Error: need git >= 2.22.0")) + # End of sanity checks on git version + if args.max_blob_size: + suffix = args.max_blob_size[-1] + if suffix not in '1234567890': + mult = {'K': 1024, 'M': 1024**2, 'G': 1024**3} + if suffix not in mult: + raise SystemExit(_("Error: could not parse --strip-blobs-bigger-than" + " argument %s") + % args.max_blob_size) + args.max_blob_size = int(args.max_blob_size[0:-1]) * mult[suffix] + else: + args.max_blob_size = int(args.max_blob_size) + + @staticmethod + def get_replace_text(filename): + replace_literals = [] + replace_regexes = [] + with open(filename, 'br') as f: + for line in f: + line = line.rstrip(b'\r\n') + + # Determine the replacement + replacement = FilteringOptions.default_replace_text + if b'==>' in line: + line, replacement = line.rsplit(b'==>', 1) + + # See if we need to match via regex + regex = None + if line.startswith(b'regex:'): + regex = line[6:] + elif line.startswith(b'glob:'): + regex = glob_to_regex(line[5:]) + if regex: + replace_regexes.append((re.compile(regex), replacement)) + else: + # Otherwise, find the literal we need to replace + if line.startswith(b'literal:'): + line = line[8:] + if not line: + continue + replace_literals.append((line, replacement)) + return {'literals': replace_literals, 'regexes': replace_regexes} + + @staticmethod + def get_paths_from_file(filename): + new_path_changes = [] + with open(filename, 'br') as f: + for line in f: + line = line.rstrip(b'\r\n') + + # Skip blank lines + if not line: + continue + # Skip comment lines + if line.startswith(b'#'): + continue + + # Determine the replacement + match_type, repl = 'literal', None + if b'==>' in line: + line, repl = line.rsplit(b'==>', 1) + + # See if we need to match via regex + match_type = 'match' # a.k.a. 'literal' + if line.startswith(b'regex:'): + match_type = 'regex' + match = re.compile(line[6:]) + elif line.startswith(b'glob:'): + match_type = 'glob' + match = line[5:] + if repl: + raise SystemExit(_("Error: In %s, 'glob:' and '==>' are incompatible (renaming globs makes no sense)" % decode(filename))) + else: + if line.startswith(b'literal:'): + match = line[8:] + else: + match = line + if repl is not None: + if match and repl and match.endswith(b'/') != repl.endswith(b'/'): + raise SystemExit(_("Error: When rename directories, if OLDNAME " + "and NEW_NAME are both non-empty and either " + "ends with a slash then both must.")) + + # Record the filter or rename + if repl is not None: + new_path_changes.append(['rename', match_type, (match, repl)]) + else: + new_path_changes.append(['filter', match_type, match]) + if match_type == 'glob' and not match.endswith(b'*'): + extension = b'*' if match.endswith(b'/') else b'/*' + new_path_changes.append(['filter', match_type, match+extension]) + return new_path_changes + + @staticmethod + def default_options(): + return FilteringOptions.parse_args([], error_on_empty = False) + + @staticmethod + def parse_args(input_args, error_on_empty = True): + parser = FilteringOptions.create_arg_parser() + if not input_args and error_on_empty: + parser.print_usage() + raise SystemExit(_("No arguments specified.")) + args = parser.parse_args(input_args) + if args.help: + parser.print_help() + raise SystemExit() + if args.version: + GitUtils.print_my_version() + raise SystemExit() + FilteringOptions.sanity_check_args(args) + if args.mailmap: + args.mailmap = MailmapInfo(args.mailmap) + if args.replace_text: + args.replace_text = FilteringOptions.get_replace_text(args.replace_text) + if args.replace_message: + args.replace_message = FilteringOptions.get_replace_text(args.replace_message) + if args.strip_blobs_with_ids: + with open(args.strip_blobs_with_ids, 'br') as f: + args.strip_blobs_with_ids = set(f.read().split()) + else: + args.strip_blobs_with_ids = set() + if (args.partial or args.refs) and not args.replace_refs: + args.replace_refs = 'update-no-add' + args.repack = not (args.partial or args.refs) + if args.refs or args.source or args.target: + args.partial = True + if not args.refs: + args.refs = ['--all'] + return args + +class RepoAnalyze(object): + + # First, several helper functions for analyze_commit() + + @staticmethod + def equiv_class(stats, filename): + return stats['equivalence'].get(filename, (filename,)) + + @staticmethod + def setup_equivalence_for_rename(stats, oldname, newname): + # if A is renamed to B and B is renamed to C, then the user thinks of + # A, B, and C as all being different names for the same 'file'. We record + # this as an equivalence class: + # stats['equivalence'][name] = (A,B,C) + # for name being each of A, B, and C. + old_tuple = stats['equivalence'].get(oldname, ()) + if newname in old_tuple: + return + elif old_tuple: + new_tuple = tuple(list(old_tuple)+[newname]) + else: + new_tuple = (oldname, newname) + for f in new_tuple: + stats['equivalence'][f] = new_tuple + + @staticmethod + def setup_or_update_rename_history(stats, commit, oldname, newname): + rename_commits = stats['rename_history'].get(oldname, set()) + rename_commits.add(commit) + stats['rename_history'][oldname] = rename_commits + + @staticmethod + def handle_renames(stats, commit, change_types, filenames): + for index, change_type in enumerate(change_types): + if change_type == ord(b'R'): + oldname, newname = filenames[index], filenames[-1] + RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname) + RepoAnalyze.setup_or_update_rename_history(stats, commit, + oldname, newname) + + @staticmethod + def handle_file(stats, graph, commit, modes, shas, filenames): + mode, sha, filename = modes[-1], shas[-1], filenames[-1] + + # Figure out kind of deletions to undo for this file, and update lists + # of all-names-by-sha and all-filenames + delmode = 'tree_deletions' + if mode != b'040000': + delmode = 'file_deletions' + stats['names'][sha].add(filename) + stats['allnames'].add(filename) + + # If the file (or equivalence class of files) was recorded as deleted, + # clearly it isn't anymore + equiv = RepoAnalyze.equiv_class(stats, filename) + for f in equiv: + stats[delmode].pop(f, None) + + # If we get a modify/add for a path that was renamed, we may need to break + # the equivalence class. However, if the modify/add was on a branch that + # doesn't have the rename in its history, we are still okay. + need_to_break_equivalence = False + if equiv[-1] != filename: + for rename_commit in stats['rename_history'][filename]: + if graph.is_ancestor(rename_commit, commit): + need_to_break_equivalence = True + + if need_to_break_equivalence: + for f in equiv: + if f in stats['equivalence']: + del stats['equivalence'][f] + + @staticmethod + def analyze_commit(stats, graph, commit, parents, date, file_changes): + graph.add_commit_and_parents(commit, parents) + for change in file_changes: + modes, shas, change_types, filenames = change + if len(parents) == 1 and change_types.startswith(b'R'): + change_types = b'R' # remove the rename score; we don't care + if modes[-1] == b'160000': + continue + elif modes[-1] == b'000000': + # Track when files/directories are deleted + for f in RepoAnalyze.equiv_class(stats, filenames[-1]): + if any(x == b'040000' for x in modes[0:-1]): + stats['tree_deletions'][f] = date + else: + stats['file_deletions'][f] = date + elif change_types.strip(b'AMT') == b'': + RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) + elif modes[-1] == b'040000' and change_types.strip(b'RAM') == b'': + RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) + elif change_types.strip(b'RAMT') == b'': + RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) + RepoAnalyze.handle_renames(stats, commit, change_types, filenames) + else: + raise SystemExit(_("Unhandled change type(s): %(change_type)s " + "(in commit %(commit)s)") + % ({'change_type': change_types, 'commit': commit}) + ) # pragma: no cover + + @staticmethod + def gather_data(args): + unpacked_size, packed_size = GitUtils.get_blob_sizes() + stats = {'names': collections.defaultdict(set), + 'allnames' : set(), + 'file_deletions': {}, + 'tree_deletions': {}, + 'equivalence': {}, + 'rename_history': collections.defaultdict(set), + 'unpacked_size': unpacked_size, + 'packed_size': packed_size, + 'num_commits': 0} + + # Setup the rev-list/diff-tree process + processed_commits_msg = _("Processed %d commits") + commit_parse_progress = ProgressWriter() + num_commits = 0 + cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) + + ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' + + ' --date=short -M -t -c --raw --combined-all-paths') + dtp = subproc.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE) + f = dtp.stdout + line = f.readline() + if not line: + raise SystemExit(_("Nothing to analyze; repository is empty.")) + cont = bool(line) + graph = AncestryGraph() + while cont: + commit = line.rstrip() + parents = f.readline().split() + date = f.readline().rstrip() + + # We expect a blank line next; if we get a non-blank line then + # this commit modified no files and we need to move on to the next. + # If there is no line, we've reached end-of-input. + line = f.readline() + if not line: + cont = False + line = line.rstrip() + + # If we haven't reached end of input, and we got a blank line meaning + # a commit that has modified files, then get the file changes associated + # with this commit. + file_changes = [] + if cont and not line: + cont = False + for line in f: + if not line.startswith(b':'): + cont = True + break + n = 1+max(1, len(parents)) + assert line.startswith(b':'*(n-1)) + relevant = line[n-1:-1] + splits = relevant.split(None, n) + modes = splits[0:n] + splits = splits[n].split(None, n) + shas = splits[0:n] + splits = splits[n].split(b'\t') + change_types = splits[0] + filenames = [PathQuoting.dequote(x) for x in splits[1:]] + file_changes.append([modes, shas, change_types, filenames]) + + # If someone is trying to analyze a subset of the history, make sure + # to avoid dying on commits with parents that we haven't seen before + if args.refs: + graph.record_external_commits([p for p in parents + if not p in graph.value]) + + # Analyze this commit and update progress + RepoAnalyze.analyze_commit(stats, graph, commit, parents, date, + file_changes) + num_commits += 1 + commit_parse_progress.show(processed_commits_msg % num_commits) + + # Show the final commits processed message and record the number of commits + commit_parse_progress.finish() + stats['num_commits'] = num_commits + + # Close the output, ensure rev-list|diff-tree pipeline completed successfully + dtp.stdout.close() + if dtp.wait(): + raise SystemExit(_("Error: rev-list|diff-tree pipeline failed; see above.")) # pragma: no cover + + return stats + + @staticmethod + def write_report(reportdir, stats): + def datestr(datetimestr): + return datetimestr if datetimestr else _('').encode() + + def dirnames(path): + while True: + path = os.path.dirname(path) + yield path + if path == b'': + break + + # Compute aggregate size information for paths, extensions, and dirs + total_size = {'packed': 0, 'unpacked': 0} + path_size = {'packed': collections.defaultdict(int), + 'unpacked': collections.defaultdict(int)} + ext_size = {'packed': collections.defaultdict(int), + 'unpacked': collections.defaultdict(int)} + dir_size = {'packed': collections.defaultdict(int), + 'unpacked': collections.defaultdict(int)} + for sha in stats['names']: + size = {'packed': stats['packed_size'][sha], + 'unpacked': stats['unpacked_size'][sha]} + for which in ('packed', 'unpacked'): + for name in stats['names'][sha]: + total_size[which] += size[which] + path_size[which][name] += size[which] + basename, ext = os.path.splitext(name) + ext_size[which][ext] += size[which] + for dirname in dirnames(name): + dir_size[which][dirname] += size[which] + + # Determine if and when extensions and directories were deleted + ext_deleted_data = {} + for name in stats['allnames']: + when = stats['file_deletions'].get(name, None) + + # Update the extension + basename, ext = os.path.splitext(name) + if when is None: + ext_deleted_data[ext] = None + elif ext in ext_deleted_data: + if ext_deleted_data[ext] is not None: + ext_deleted_data[ext] = max(ext_deleted_data[ext], when) + else: + ext_deleted_data[ext] = when + + dir_deleted_data = {} + for name in dir_size['packed']: + dir_deleted_data[name] = stats['tree_deletions'].get(name, None) + + with open(os.path.join(reportdir, b"README"), 'bw') as f: + # Give a basic overview of this file + f.write(b"== %s ==\n" % _("Overall Statistics").encode()) + f.write((" %s: %d\n" % (_("Number of commits"), + stats['num_commits'])).encode()) + f.write((" %s: %d\n" % (_("Number of filenames"), + len(path_size['packed']))).encode()) + f.write((" %s: %d\n" % (_("Number of directories"), + len(dir_size['packed']))).encode()) + f.write((" %s: %d\n" % (_("Number of file extensions"), + len(ext_size['packed']))).encode()) + f.write(b"\n") + f.write((" %s: %d\n" % (_("Total unpacked size (bytes)"), + total_size['unpacked'])).encode()) + f.write((" %s: %d\n" % (_("Total packed size (bytes)"), + total_size['packed'])).encode()) + f.write(b"\n") + + # Mention issues with the report + f.write(("== %s ==\n" % _("Caveats")).encode()) + f.write(("=== %s ===\n" % _("Sizes")).encode()) + f.write(textwrap.dedent(_(""" + Packed size represents what size your repository would be if no + trees, commits, tags, or other metadata were included (though it may + fail to represent de-duplication; see below). It also represents the + current packing, which may be suboptimal if you haven't gc'ed for a + while. + + Unpacked size represents what size your repository would be if no + trees, commits, tags, or other metadata were included AND if no + files were packed; i.e., without delta-ing or compression. + + Both unpacked and packed sizes can be slightly misleading. Deleting + a blob from history not save as much space as the unpacked size, + because it is obviously normally stored in packed form. Also, + deleting a blob from history may not save as much space as its packed + size either, because another blob could be stored as a delta against + that blob, so when you remove one blob another blob's packed size may + grow. + + Also, the sum of the packed sizes can add up to more than the + repository size; if the same contents appeared in the repository in + multiple places, git will automatically de-dupe and store only one + copy, while the way sizes are added in this analysis adds the size + for each file path that has those contents. Further, if a file is + ever reverted to a previous version's contents, the previous + version's size will be counted multiple times in this analysis, even + though git will only store it once. + """)[1:]).encode()) + f.write(b"\n") + f.write(("=== %s ===\n" % _("Deletions")).encode()) + f.write(textwrap.dedent(_(""" + Whether a file is deleted is not a binary quality, since it can be + deleted on some branches but still exist in others. Also, it might + exist in an old tag, but have been deleted in versions newer than + that. More thorough tracking could be done, including looking at + merge commits where one side of history deleted and the other modified, + in order to give a more holistic picture of deletions. However, that + algorithm would not only be more complex to implement, it'd also be + quite difficult to present and interpret by users. Since --analyze + is just about getting a high-level rough picture of history, it instead + implements the simplistic rule that is good enough for 98% of cases: + A file is marked as deleted if the last commit in the fast-export + stream that mentions the file lists it as deleted. + This makes it dependent on topological ordering, but generally gives + the "right" answer. + """)[1:]).encode()) + f.write(b"\n") + f.write(("=== %s ===\n" % _("Renames")).encode()) + f.write(textwrap.dedent(_(""" + Renames share the same non-binary nature that deletions do, plus + additional challenges: + * If the renamed file is renamed again, instead of just two names for + a path you can have three or more. + * Rename pairs of the form (oldname, newname) that we consider to be + different names of the "same file" might only be valid over certain + commit ranges. For example, if a new commit reintroduces a file + named oldname, then new versions of oldname aren't the "same file" + anymore. We could try to portray this to the user, but it's easier + for the user to just break the pairing and only report unbroken + rename pairings to the user. + * The ability for users to rename files differently in different + branches means that our chains of renames will not necessarily be + linear but may branch out. + """)[1:]).encode()) + f.write(b"\n") + + # Equivalence classes for names, so if folks only want to keep a + # certain set of paths, they know the old names they want to include + # too. + with open(os.path.join(reportdir, b"renames.txt"), 'bw') as f: + seen = set() + for pathname,equiv_group in sorted(stats['equivalence'].items(), + key=lambda x:(x[1], x[0])): + if equiv_group in seen: + continue + seen.add(equiv_group) + f.write(("{} ->\n ".format(decode(equiv_group[0])) + + "\n ".join(decode(x) for x in equiv_group[1:]) + + "\n").encode()) + + # List directories in reverse sorted order of unpacked size + with open(os.path.join(reportdir, b"directories-deleted-sizes.txt"), 'bw') as f: + msg = "=== %s ===\n" % _("Deleted directories by reverse size") + f.write(msg.encode()) + msg = _("Format: unpacked size, packed size, date deleted, directory name\n") + f.write(msg.encode()) + for dirname, size in sorted(dir_size['packed'].items(), + key=lambda x:(x[1],x[0]), reverse=True): + if (dir_deleted_data[dirname]): + f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname], + size, + datestr(dir_deleted_data[dirname]), + dirname or _('').encode())) + + with open(os.path.join(reportdir, b"directories-all-sizes.txt"), 'bw') as f: + f.write(("=== %s ===\n" % _("All directories by reverse size")).encode()) + msg = _("Format: unpacked size, packed size, date deleted, directory name\n") + f.write(msg.encode()) + for dirname, size in sorted(dir_size['packed'].items(), + key=lambda x:(x[1],x[0]), reverse=True): + f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname], + size, + datestr(dir_deleted_data[dirname]), + dirname or _("").encode())) + + # List extensions in reverse sorted order of unpacked size + with open(os.path.join(reportdir, b"extensions-deleted-sizes.txt"), 'bw') as f: + msg = "=== %s ===\n" % _("Deleted extensions by reverse size") + f.write(msg.encode()) + msg = _("Format: unpacked size, packed size, date deleted, extension name\n") + f.write(msg.encode()) + for extname, size in sorted(ext_size['packed'].items(), + key=lambda x:(x[1],x[0]), reverse=True): + if (ext_deleted_data[extname]): + f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname], + size, + datestr(ext_deleted_data[extname]), + extname or _('').encode())) + + with open(os.path.join(reportdir, b"extensions-all-sizes.txt"), 'bw') as f: + f.write(("=== %s ===\n" % _("All extensions by reverse size")).encode()) + msg = _("Format: unpacked size, packed size, date deleted, extension name\n") + f.write(msg.encode()) + for extname, size in sorted(ext_size['packed'].items(), + key=lambda x:(x[1],x[0]), reverse=True): + f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname], + size, + datestr(ext_deleted_data[extname]), + extname or _('').encode())) + + # List files in reverse sorted order of unpacked size + with open(os.path.join(reportdir, b"path-deleted-sizes.txt"), 'bw') as f: + msg = "=== %s ===\n" % _("Deleted paths by reverse accumulated size") + f.write(msg.encode()) + msg = _("Format: unpacked size, packed size, date deleted, path name(s)\n") + f.write(msg.encode()) + for pathname, size in sorted(path_size['packed'].items(), + key=lambda x:(x[1],x[0]), reverse=True): + when = stats['file_deletions'].get(pathname, None) + if when: + f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname], + size, + datestr(when), + pathname)) + + with open(os.path.join(reportdir, b"path-all-sizes.txt"), 'bw') as f: + msg = "=== %s ===\n" % _("All paths by reverse accumulated size") + f.write(msg.encode()) + msg = _("Format: unpacked size, packed size, date deleted, path name\n") + f.write(msg.encode()) + for pathname, size in sorted(path_size['packed'].items(), + key=lambda x:(x[1],x[0]), reverse=True): + when = stats['file_deletions'].get(pathname, None) + f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname], + size, + datestr(when), + pathname)) + + # List of filenames and sizes in descending order + with open(os.path.join(reportdir, b"blob-shas-and-paths.txt"), 'bw') as f: + f.write(("=== %s ===\n" % _("Files by sha and associated pathnames in reverse size")).encode()) + f.write(_("Format: sha, unpacked size, packed size, filename(s) object stored as\n").encode()) + for sha, size in sorted(stats['packed_size'].items(), + key=lambda x:(x[1],x[0]), reverse=True): + if sha not in stats['names']: + # Some objects in the repository might not be referenced, or not + # referenced by the branches/tags the user cares about; skip them. + continue + names_with_sha = stats['names'][sha] + if len(names_with_sha) == 1: + names_with_sha = names_with_sha.pop() + else: + names_with_sha = b'[' + b', '.join(sorted(names_with_sha)) + b']' + f.write(b" %s %10d %10d %s\n" % (sha, + stats['unpacked_size'][sha], + size, + names_with_sha)) + + @staticmethod + def run(args): + if args.report_dir: + reportdir = args.report_dir + else: + git_dir = GitUtils.determine_git_dir(b'.') + + # Create the report directory as necessary + results_tmp_dir = os.path.join(git_dir, b'filter-repo') + if not os.path.isdir(results_tmp_dir): + os.mkdir(results_tmp_dir) + reportdir = os.path.join(results_tmp_dir, b"analysis") + + if os.path.isdir(reportdir): + if args.force: + sys.stdout.write(_("Warning: Removing recursively: \"%s\"") % decode(reportdir)) + shutil.rmtree(reportdir) + else: + sys.stdout.write(_("Error: dir already exists (use --force to delete): \"%s\"\n") % decode(reportdir)) + sys.exit(1) + + os.mkdir(reportdir) + + # Gather the data we need + stats = RepoAnalyze.gather_data(args) + + # Write the reports + sys.stdout.write(_("Writing reports to %s...") % decode(reportdir)) + sys.stdout.flush() + RepoAnalyze.write_report(reportdir, stats) + sys.stdout.write(_("done.\n")) + +class InputFileBackup: + def __init__(self, input_file, output_file): + self.input_file = input_file + self.output_file = output_file + + def close(self): + self.input_file.close() + self.output_file.close() + + def read(self, size): + output = self.input_file.read(size) + self.output_file.write(output) + return output + + def readline(self): + line = self.input_file.readline() + self.output_file.write(line) + return line + +class DualFileWriter: + def __init__(self, file1, file2): + self.file1 = file1 + self.file2 = file2 + + def write(self, *args): + self.file1.write(*args) + self.file2.write(*args) + + def flush(self): + self.file1.flush() + self.file2.flush() + + def close(self): + self.file1.close() + self.file2.close() + +class RepoFilter(object): + def __init__(self, + args, + filename_callback = None, + message_callback = None, + name_callback = None, + email_callback = None, + refname_callback = None, + blob_callback = None, + commit_callback = None, + tag_callback = None, + reset_callback = None, + done_callback = None): + + self._args = args + + # Repo we are exporting + self._repo_working_dir = None + + # Store callbacks for acting on objects printed by FastExport + self._blob_callback = blob_callback + self._commit_callback = commit_callback + self._tag_callback = tag_callback + self._reset_callback = reset_callback + self._done_callback = done_callback + + # Store callbacks for acting on slices of FastExport objects + self._filename_callback = filename_callback # filenames from commits + self._message_callback = message_callback # commit OR tag message + self._name_callback = name_callback # author, committer, tagger + self._email_callback = email_callback # author, committer, tagger + self._refname_callback = refname_callback # from commit/tag/reset + self._handle_arg_callbacks() + + # Defaults for input + self._input = None + self._fep = None # Fast Export Process + self._fe_orig = None # Path to where original fast-export output stored + self._fe_filt = None # Path to where filtered fast-export output stored + self._parser = None # FastExportParser object we are working with + + # Defaults for output + self._output = None + self._fip = None # Fast Import Process + self._import_pipes = None + self._managed_output = True + + # A tuple of (depth, list-of-ancestors). Commits and ancestors are + # identified by their id (their 'mark' in fast-export or fast-import + # speak). The depth of a commit is one more than the max depth of any + # of its ancestors. + self._graph = AncestryGraph() + # Another one, for ancestry of commits in the original repo + self._orig_graph = AncestryGraph() + + # Names of files that were tweaked in any commit; such paths could lead + # to subsequent commits being empty + self._files_tweaked = set() + + # A set of commit hash pairs (oldhash, newhash) which used to be merge + # commits but due to filtering were turned into non-merge commits. + # The commits probably have suboptimal commit messages (e.g. "Merge branch + # next into master"). + self._commits_no_longer_merges = [] + + # A dict of original_ids to new_ids; filtering commits means getting + # new commit hash (sha1sums), and we record the mapping both for + # diagnostic purposes and so we can rewrite commit messages. Note that + # the new_id can be None rather than a commit hash if the original + # commit became empty and was pruned or was otherwise dropped. + self._commit_renames = {} + + # A set of original_ids for which we have not yet gotten the + # new_ids; we use OrderedDict because we need to know the order of + # insertion, but the values are always ignored (and set to None). + # If there was an OrderedSet class, I'd use it instead. + self._pending_renames = collections.OrderedDict() + + # A dict of commit_hash[0:7] -> set(commit_hashes with that prefix). + # + # It's common for commit messages to refer to commits by abbreviated + # commit hashes, as short as 7 characters. To facilitate translating + # such short hashes, we have a mapping of prefixes to full old hashes. + self._commit_short_old_hashes = collections.defaultdict(set) + + # A set of commit hash references appearing in commit messages which + # mapped to a valid commit that was removed entirely in the filtering + # process. The commit message will continue to reference the + # now-missing commit hash, since there was nothing to map it to. + self._commits_referenced_but_removed = set() + + # Progress handling (number of commits parsed, etc.) + self._progress_writer = ProgressWriter() + self._num_commits = 0 + + # Size of blobs in the repo + self._unpacked_size = {} + + # Other vars + self._sanity_checks_handled = False + self._finalize_handled = False + self._orig_refs = None + self._newnames = {} + + # Cache a few message translations for performance reasons + self._parsed_message = _("Parsed %d commits") + + # Compile some regexes and cache those + self._hash_re = re.compile(br'(\b[0-9a-f]{7,40}\b)') + + def _handle_arg_callbacks(self): + def make_callback(argname, str): + exec('def callback({}, _do_not_use_this_var = None):\n'.format(argname)+ + ' '+'\n '.join(str.splitlines()), globals()) + return callback #namespace['callback'] + def handle(type): + callback_field = '_{}_callback'.format(type) + code_string = getattr(self._args, type+'_callback') + if code_string: + if os.path.exists(code_string): + with open(code_string, 'r', encoding='utf-8') as f: + code_string = f.read() + if getattr(self, callback_field): + raise SystemExit(_("Error: Cannot pass a %s_callback to RepoFilter " + "AND pass --%s-callback" + % (type, type))) + if 'return ' not in code_string and \ + type not in ('blob', 'commit', 'tag', 'reset'): + raise SystemExit(_("Error: --%s-callback should have a return statement") + % type) + setattr(self, callback_field, make_callback(type, code_string)) + handle('filename') + handle('message') + handle('name') + handle('email') + handle('refname') + handle('blob') + handle('commit') + handle('tag') + handle('reset') + + def _run_sanity_checks(self): + self._sanity_checks_handled = True + if not self._managed_output: + if not self._args.replace_refs: + # If not _managed_output we don't want to make extra changes to the + # repo, so set default to no-op 'update-no-add' + self._args.replace_refs = 'update-no-add' + return + + if self._args.debug: + print("[DEBUG] Passed arguments:\n{}".format(self._args)) + + # Determine basic repository information + target_working_dir = self._args.target or b'.' + self._orig_refs = GitUtils.get_refs(target_working_dir) + is_bare = GitUtils.is_repository_bare(target_working_dir) + + # Determine if this is second or later run of filter-repo + tmp_dir = self.results_tmp_dir(create_if_missing=False) + already_ran = os.path.isfile(os.path.join(tmp_dir, b'already_ran')) + + # Default for --replace-refs + if not self._args.replace_refs: + self._args.replace_refs = ('update-or-add' if already_ran + else 'update-and-add') + + # Do sanity checks from the correct directory + if not self._args.force and not already_ran: + cwd = os.getcwd() + os.chdir(target_working_dir) + RepoFilter.sanity_check(self._orig_refs, is_bare) + os.chdir(cwd) + + @staticmethod + def sanity_check(refs, is_bare): + def abort(reason): + try: + cmd = 'git config remote.origin.url' + output = subproc.check_output(cmd.split()).strip() + except subprocess.CalledProcessError as e: + output = None + msg = "" + if output and os.path.isdir(output): + msg = _("Note: when cloning local repositories, you need to pass\n" + " --no-local to git clone to avoid this issue.\n") + raise SystemExit( + _("Aborting: Refusing to destructively overwrite repo history since\n" + "this does not look like a fresh clone.\n" + " (%s)\n%s" + "Please operate on a fresh clone instead. If you want to proceed\n" + "anyway, use --force.") % (reason, msg)) + + # Make sure repo is fully packed, just like a fresh clone would be. + # Note that transfer.unpackLimit defaults to 100, meaning that a + # repository with no packs and less than 100 objects should be considered + # fully packed. + output = subproc.check_output('git count-objects -v'.split()) + stats = dict(x.split(b': ') for x in output.splitlines()) + num_packs = int(stats[b'packs']) + num_loose_objects = int(stats[b'count']) + if num_packs > 1 or \ + (num_packs == 1 and num_loose_objects > 0) or \ + num_loose_objects >= 100: + abort(_("expected freshly packed repo")) + + # Make sure there is precisely one remote, named "origin"...or that this + # is a new bare repo with no packs and no remotes + output = subproc.check_output('git remote'.split()).strip() + if not (output == b"origin" or (num_packs == 0 and not output)): + abort(_("expected one remote, origin")) + + # Avoid letting people running with weird setups and overwriting GIT_DIR + # elsewhere + git_dir = GitUtils.determine_git_dir(b'.') + if is_bare and git_dir != b'.': + abort(_("GIT_DIR must be .")) + elif not is_bare and git_dir != b'.git': + abort(_("GIT_DIR must be .git")) + + # Make sure that all reflogs have precisely one entry + reflog_dir=os.path.join(git_dir, b'logs') + for root, dirs, files in os.walk(reflog_dir): + for filename in files: + pathname = os.path.join(root, filename) + with open(pathname, 'br') as f: + if len(f.read().splitlines()) > 1: + shortpath = pathname[len(reflog_dir)+1:] + abort(_("expected at most one entry in the reflog for %s") % + decode(shortpath)) + + # Make sure there are no stashed changes + if b'refs/stash' in refs: + abort(_("has stashed changes")) + + # Do extra checks in non-bare repos + if not is_bare: + # Avoid uncommitted, unstaged, or untracked changes + if subproc.call('git diff --staged --quiet'.split()): + abort(_("you have uncommitted changes")) + if subproc.call('git diff --quiet'.split()): + abort(_("you have unstaged changes")) + if len(subproc.check_output('git ls-files -o'.split())) > 0: + abort(_("you have untracked changes")) + + # Avoid unpushed changes + for refname, rev in refs.items(): + if not refname.startswith(b'refs/heads/'): + continue + origin_ref = refname.replace(b'refs/heads/', b'refs/remotes/origin/') + if origin_ref not in refs: + abort(_('%s exists, but %s not found') % (decode(refname), + decode(origin_ref))) + if rev != refs[origin_ref]: + abort(_('%s does not match %s') % (decode(refname), + decode(origin_ref))) + + # Make sure there is only one worktree + output = subproc.check_output('git worktree list'.split()) + if len(output.splitlines()) > 1: + abort(_('you have multiple worktrees')) + + @staticmethod + def cleanup(repo, repack, reset, run_quietly=False, show_debuginfo=False): + ''' cleanup repo; if repack then expire reflogs and do a gc --prune=now. + if reset then do a reset --hard. Optionally also curb output if + run_quietly is True, or go the opposite direction and show extra + output if show_debuginfo is True. ''' + assert not (run_quietly and show_debuginfo) + + if (repack and not run_quietly and not show_debuginfo): + print(_("Repacking your repo and cleaning out old unneeded objects")) + quiet_flags = '--quiet' if run_quietly else '' + cleanup_cmds = [] + if repack: + cleanup_cmds = ['git reflog expire --expire=now --all'.split(), + 'git gc {} --prune=now'.format(quiet_flags).split()] + if reset: + cleanup_cmds.insert(0, 'git reset {} --hard'.format(quiet_flags).split()) + location_info = ' (in {})'.format(decode(repo)) if repo != b'.' else '' + for cmd in cleanup_cmds: + if show_debuginfo: + print("[DEBUG] Running{}: {}".format(location_info, ' '.join(cmd))) + subproc.call(cmd, cwd=repo) + + def _get_rename(self, old_hash): + # If we already know the rename, just return it + new_hash = self._commit_renames.get(old_hash, None) + if new_hash: + return new_hash + + # If it's not in the remaining pending renames, we don't know it + if old_hash is not None and old_hash not in self._pending_renames: + return None + + # Read through the pending renames until we find it or we've read them all, + # and return whatever we might find + self._flush_renames(old_hash) + return self._commit_renames.get(old_hash, None) + + def _flush_renames(self, old_hash=None, limit=0): + # Parse through self._pending_renames until we have read enough. We have + # read enough if: + # self._pending_renames is empty + # old_hash != None and we found a rename for old_hash + # limit > 0 and len(self._pending_renames) started less than 2*limit + # limit > 0 and len(self._pending_renames) < limit + if limit and len(self._pending_renames) < 2 * limit: + return + fi_input, fi_output = self._import_pipes + while self._pending_renames: + orig_id, ignore = self._pending_renames.popitem(last=False) + new_id = fi_output.readline().rstrip() + self._commit_renames[orig_id] = new_id + if old_hash == orig_id: + return + if limit and len(self._pending_renames) < limit: + return + + def _translate_commit_hash(self, matchobj_or_oldhash): + old_hash = matchobj_or_oldhash + if not isinstance(matchobj_or_oldhash, bytes): + old_hash = matchobj_or_oldhash.group(1) + orig_len = len(old_hash) + new_hash = self._get_rename(old_hash) + if new_hash is None: + if old_hash[0:7] not in self._commit_short_old_hashes: + self._commits_referenced_but_removed.add(old_hash) + return old_hash + possibilities = self._commit_short_old_hashes[old_hash[0:7]] + matches = [x for x in possibilities + if x[0:orig_len] == old_hash] + if len(matches) != 1: + self._commits_referenced_but_removed.add(old_hash) + return old_hash + old_hash = matches[0] + new_hash = self._get_rename(old_hash) + + assert new_hash is not None + return new_hash[0:orig_len] + + def _trim_extra_parents(self, orig_parents, parents): + '''Due to pruning of empty commits, some parents could be non-existent + (None) or otherwise redundant. Remove the non-existent parents, and + remove redundant parents so long as that doesn't transform a merge + commit into a non-merge commit. + + Returns a tuple: + (parents, new_first_parent_if_would_become_non_merge)''' + + always_prune = (self._args.prune_degenerate == 'always') + + # Pruning of empty commits means multiple things: + # * An original parent of this commit may have been pruned causing the + # need to rewrite the reported parent to the nearest ancestor. We + # want to know when we're dealing with such a parent. + # * Further, there may be no "nearest ancestor" if the entire history + # of that parent was also pruned. (Detectable by the parent being + # 'None') + # Remove all parents rewritten to None, and keep track of which parents + # were rewritten to an ancestor. + tmp = zip(parents, + orig_parents, + [(x in _SKIPPED_COMMITS or always_prune) for x in orig_parents]) + tmp2 = [x for x in tmp if x[0] is not None] + if not tmp2: + # All ancestors have been pruned; we have no parents. + return [], None + parents, orig_parents, is_rewritten = [list(x) for x in zip(*tmp2)] + + # We can't have redundant parents if we don't have at least 2 parents + if len(parents) < 2: + return parents, None + + # Don't remove redundant parents if user doesn't want us to + if self._args.prune_degenerate == 'never': + return parents, None + + # Remove duplicate parents (if both sides of history have lots of commits + # which become empty due to pruning, the most recent ancestor on both + # sides may be the same commit), except only remove parents that have + # been rewritten due to previous empty pruning. + seen = set() + seen_add = seen.add + # Deleting duplicate rewritten parents means keeping parents if either + # they have not been seen or they are ones that have not been rewritten. + parents_copy = parents + uniq = [[p, orig_parents[i], is_rewritten[i]] for i, p in enumerate(parents) + if not (p in seen or seen_add(p)) or not is_rewritten[i]] + parents, orig_parents, is_rewritten = [list(x) for x in zip(*uniq)] + if len(parents) < 2: + return parents_copy, parents[0] + + # Flatten unnecessary merges. (If one side of history is entirely + # empty commits that were pruned, we may end up attempting to + # merge a commit with its ancestor. Remove parents that are an + # ancestor of another parent.) + num_parents = len(parents) + to_remove = [] + for cur in range(num_parents): + if not is_rewritten[cur]: + continue + for other in range(num_parents): + if cur == other: + continue + if not self._graph.is_ancestor(parents[cur], parents[other]): + continue + # parents[cur] is an ancestor of parents[other], so parents[cur] + # seems redundant. However, if it was intentionally redundant + # (e.g. a no-ff merge) in the original, then we want to keep it. + if not always_prune and \ + self._orig_graph.is_ancestor(orig_parents[cur], + orig_parents[other]): + continue + # Some folks want their history to have all first parents be merge + # commits (except for any root commits), and always do a merge --no-ff. + # For such folks, don't remove the first parent even if it's an + # ancestor of other commits. + if self._args.no_ff and cur == 0: + continue + # Okay so the cur-th parent is an ancestor of the other-th parent, + # and it wasn't that way in the original repository; mark the + # cur-th parent as removable. + to_remove.append(cur) + break # cur removed, so skip rest of others -- i.e. check cur+=1 + for x in reversed(to_remove): + parents.pop(x) + if len(parents) < 2: + return parents_copy, parents[0] + + return parents, None + + def _prunable(self, commit, new_1st_parent, had_file_changes, orig_parents): + parents = commit.parents + + if self._args.prune_empty == 'never': + return False + always_prune = (self._args.prune_empty == 'always') + + # For merge commits, unless there are prunable (redundant) parents, we + # do not want to prune + if len(parents) >= 2 and not new_1st_parent: + return False + + if len(parents) < 2: + # Special logic for commits that started empty... + if not had_file_changes and not always_prune: + had_parents_pruned = (len(parents) < len(orig_parents) or + (len(orig_parents) == 1 and + orig_parents[0] in _SKIPPED_COMMITS)) + # If the commit remains empty and had parents which were pruned, + # then prune this commit; otherwise, retain it + return (not commit.file_changes and had_parents_pruned) + + # We can only get here if the commit didn't start empty, so if it's + # empty now, it obviously became empty + if not commit.file_changes: + return True + + # If there are no parents of this commit and we didn't match the case + # above, then this commit cannot be pruned. Since we have no parent(s) + # to compare to, abort now to prevent future checks from failing. + if not parents: + return False + + # Similarly, we cannot handle the hard cases if we don't have a pipe + # to communicate with fast-import + if not self._import_pipes: + return False + + # If there have not been renames/remappings of IDs (due to insertion of + # new blobs), then we can sometimes know things aren't prunable with a + # simple check + if not _IDS.has_renames(): + # non-merge commits can only be empty if blob/file-change editing caused + # all file changes in the commit to have the same file contents as + # the parent. + changed_files = set(change.filename for change in commit.file_changes) + if len(orig_parents) < 2 and changed_files - self._files_tweaked: + return False + + # Finally, the hard case: due to either blob rewriting, or due to pruning + # of empty commits wiping out the first parent history back to the merge + # base, the list of file_changes we have may not actually differ from our + # (new) first parent's version of the files, i.e. this would actually be + # an empty commit. Check by comparing the contents of this commit to its + # (remaining) parent. + # + # NOTE on why this works, for the case of original first parent history + # having been pruned away due to being empty: + # The first parent history having been pruned away due to being + # empty implies the original first parent would have a tree (after + # filtering) that matched the merge base's tree. Since + # file_changes has the changes needed to go from what would have + # been the first parent to our new commit, and what would have been + # our first parent has a tree that matches the merge base, then if + # the new first parent has a tree matching the versions of files in + # file_changes, then this new commit is empty and thus prunable. + fi_input, fi_output = self._import_pipes + self._flush_renames() # Avoid fi_output having other stuff present + # Optimization note: we could have two loops over file_changes, the + # first doing all the self._output.write() calls, and the second doing + # the rest. But I'm worried about fast-import blocking on fi_output + # buffers filling up so I instead read from it as I go. + for change in commit.file_changes: + parent = new_1st_parent or commit.parents[0] # exists due to above checks + quoted_filename = PathQuoting.enquote(change.filename) + if isinstance(parent, int): + self._output.write(b"ls :%d %s\n" % (parent, quoted_filename)) + else: + self._output.write(b"ls %s %s\n" % (parent, quoted_filename)) + self._output.flush() + parent_version = fi_output.readline().split() + if change.type == b'D': + if parent_version != [b'missing', quoted_filename]: + return False + else: + blob_sha = change.blob_id + if isinstance(change.blob_id, int): + self._output.write(b"get-mark :%d\n" % change.blob_id) + self._output.flush() + blob_sha = fi_output.readline().rstrip() + if parent_version != [change.mode, b'blob', blob_sha, quoted_filename]: + return False + + return True + + def _record_remapping(self, commit, orig_parents): + new_id = None + # Record the mapping of old commit hash to new one + if commit.original_id and self._import_pipes: + fi_input, fi_output = self._import_pipes + self._output.write(b"get-mark :%d\n" % commit.id) + self._output.flush() + orig_id = commit.original_id + self._commit_short_old_hashes[orig_id[0:7]].add(orig_id) + # Note that we have queued up an id for later reading; flush a + # few of the older ones if we have too many queued up + self._pending_renames[orig_id] = None + self._flush_renames(None, limit=40) + # Also, record if this was a merge commit that turned into a non-merge + # commit. + if len(orig_parents) >= 2 and len(commit.parents) < 2: + self._commits_no_longer_merges.append((commit.original_id, new_id)) + + def callback_metadata(self, extra_items = dict()): + return {'commit_rename_func': self._translate_commit_hash, + 'ancestry_graph': self._graph, + 'original_ancestry_graph': self._orig_graph, + **extra_items} + + def _tweak_blob(self, blob): + if self._args.max_blob_size and len(blob.data) > self._args.max_blob_size: + blob.skip() + + if blob.original_id in self._args.strip_blobs_with_ids: + blob.skip() + + if ( self._args.replace_text + # not (if blob contains zero byte in the first 8Kb, that is, if blob is binary data) + and not b"\0" in blob.data[0:8192] + ): + for literal, replacement in self._args.replace_text['literals']: + blob.data = blob.data.replace(literal, replacement) + for regex, replacement in self._args.replace_text['regexes']: + blob.data = regex.sub(replacement, blob.data) + + if self._blob_callback: + self._blob_callback(blob, self.callback_metadata()) + + def _filter_files(self, commit): + def filename_matches(path_expression, pathname): + ''' Returns whether path_expression matches pathname or a leading + directory thereof, allowing path_expression to not have a trailing + slash even if it is meant to match a leading directory. ''' + if path_expression == b'': + return True + n = len(path_expression) + if (pathname.startswith(path_expression) and + (path_expression[n-1:n] == b'/' or + len(pathname) == n or + pathname[n:n+1] == b'/')): + return True + return False + + def newname(path_changes, pathname, use_base_name, filtering_is_inclusive): + ''' Applies filtering and rename changes from path_changes to pathname, + returning any of None (file isn't wanted), original filename (file + is wanted with original name), or new filename. ''' + wanted = False + full_pathname = pathname + if use_base_name: + pathname = os.path.basename(pathname) + for (mod_type, match_type, path_exp) in path_changes: + if mod_type == 'filter' and not wanted: + assert match_type in ('match', 'glob', 'regex') + if match_type == 'match' and filename_matches(path_exp, pathname): + wanted = True + if match_type == 'glob' and fnmatch.fnmatch(pathname, path_exp): + wanted = True + if match_type == 'regex' and path_exp.search(pathname): + wanted = True + elif mod_type == 'rename': + match, repl = path_exp + assert match_type in ('match','regex') # glob was translated to regex + if match_type == 'match' and filename_matches(match, full_pathname): + full_pathname = full_pathname.replace(match, repl, 1) + if match_type == 'regex': + full_pathname = match.sub(repl, full_pathname) + return full_pathname if (wanted == filtering_is_inclusive) else None + + args = self._args + new_file_changes = {} # Assumes no renames or copies, otherwise collisions + for change in commit.file_changes: + # NEEDSWORK: _If_ we ever want to pass `--full-tree` to fast-export and + # parse that output, we'll need to modify this block; `--full-tree` + # issues a deleteall directive which has no filename, and thus this + # block would normally strip it. Of course, FileChange() and + # _parse_optional_filechange() would need updates too. + if change.type == b'DELETEALL': + new_file_changes[b''] = change + continue + if change.filename in self._newnames: + change.filename = self._newnames[change.filename] + else: + original_filename = change.filename + change.filename = newname(args.path_changes, change.filename, + args.use_base_name, args.inclusive) + if self._filename_callback: + change.filename = self._filename_callback(change.filename) + self._newnames[original_filename] = change.filename + if not change.filename: + continue # Filtering criteria excluded this file; move on to next one + if change.filename in new_file_changes: + # Getting here means that path renaming is in effect, and caused one + # path to collide with another. That's usually bad, but can be okay + # under two circumstances: + # 1) Sometimes people have a file named OLDFILE in old revisions of + # history, and they rename to NEWFILE, and would like to rewrite + # history so that all revisions refer to it as NEWFILE. As such, + # we can allow a collision when (at least) one of the two paths + # is a deletion. Note that if OLDFILE and NEWFILE are unrelated + # this also allows the rewrite to continue, which makes sense + # since OLDFILE is no longer in the way. + # 2) If OLDFILE and NEWFILE are exactly equal, then writing them + # both to the same location poses no problem; we only need one + # file. (This could come up if someone copied a file in some + # commit, then later either deleted the file or kept it exactly + # in sync with the original with any changes, and then decides + # they want to rewrite history to only have one of the two files) + colliding_change = new_file_changes[change.filename] + if change.type == b'D': + # We can just throw this one away and keep the other + continue + elif change.type == b'M' and ( + change.mode == colliding_change.mode and + change.blob_id == colliding_change.blob_id): + # The two are identical, so we can throw this one away and keep other + continue + elif new_file_changes[change.filename].type != b'D': + raise SystemExit(_("File renaming caused colliding pathnames!\n") + + _(" Commit: {}\n").format(commit.original_id) + + _(" Filename: {}").format(change.filename)) + # Strip files that are too large + if self._args.max_blob_size and \ + self._unpacked_size.get(change.blob_id, 0) > self._args.max_blob_size: + continue + if self._args.strip_blobs_with_ids and \ + change.blob_id in self._args.strip_blobs_with_ids: + continue + # Otherwise, record the change + new_file_changes[change.filename] = change + commit.file_changes = [v for k,v in sorted(new_file_changes.items())] + + def _tweak_commit(self, commit, aux_info): + # Change the commit message according to callback + if not self._args.preserve_commit_hashes: + commit.message = self._hash_re.sub(self._translate_commit_hash, + commit.message) + if self._args.replace_message: + for literal, replacement in self._args.replace_message['literals']: + commit.message = commit.message.replace(literal, replacement) + for regex, replacement in self._args.replace_message['regexes']: + commit.message = regex.sub(replacement, commit.message) + if self._message_callback: + commit.message = self._message_callback(commit.message) + + # Change the author & committer according to mailmap rules + args = self._args + if args.mailmap: + commit.author_name, commit.author_email = \ + args.mailmap.translate(commit.author_name, commit.author_email) + commit.committer_name, commit.committer_email = \ + args.mailmap.translate(commit.committer_name, commit.committer_email) + # Change author & committer according to callbacks + if self._name_callback: + commit.author_name = self._name_callback(commit.author_name) + commit.committer_name = self._name_callback(commit.committer_name) + if self._email_callback: + commit.author_email = self._email_callback(commit.author_email) + commit.committer_email = self._email_callback(commit.committer_email) + + # Sometimes the 'branch' given is a tag; if so, rename it as requested so + # we don't get any old tagnames + if self._args.tag_rename: + commit.branch = RepoFilter._do_tag_rename(args.tag_rename, commit.branch) + if self._refname_callback: + commit.branch = self._refname_callback(commit.branch) + + # Filter or rename the list of file changes + orig_file_changes = set(commit.file_changes) + self._filter_files(commit) + + # Record ancestry graph + parents, orig_parents = commit.parents, aux_info['orig_parents'] + if self._args.state_branch: + external_parents = parents + else: + external_parents = [p for p in parents if not isinstance(p, int)] + self._graph.record_external_commits(external_parents) + self._orig_graph.record_external_commits(external_parents) + self._graph.add_commit_and_parents(commit.id, parents) + self._orig_graph.add_commit_and_parents(commit.old_id, orig_parents) + + # Prune parents (due to pruning of empty commits) if relevant + old_1st_parent = parents[0] if parents else None + parents, new_1st_parent = self._trim_extra_parents(orig_parents, parents) + commit.parents = parents + + # If parents were pruned, then we need our file changes to be relative + # to the new first parent + if parents and old_1st_parent != parents[0]: + commit.file_changes = GitUtils.get_file_changes(self._repo_working_dir, + ID_TO_HASH[parents[0]], + commit.original_id) + orig_file_changes = set(commit.file_changes) + self._filter_files(commit) + + # Find out which files were modified by the callbacks. Such paths could + # lead to subsequent commits being empty (e.g. if removing a line containing + # a password from every version of a file that had the password, and some + # later commit did nothing more than remove that line) + final_file_changes = set(commit.file_changes) + if self._args.replace_text or self._blob_callback: + differences = orig_file_changes.union(final_file_changes) + else: + differences = orig_file_changes.symmetric_difference(final_file_changes) + self._files_tweaked.update(x.filename for x in differences) + + # Call the user-defined callback, if any + if self._commit_callback: + self._commit_callback(commit, self.callback_metadata(aux_info)) + + # Now print the resulting commit, or if prunable skip it + if not commit.dumped: + if not self._prunable(commit, new_1st_parent, + aux_info['had_file_changes'], orig_parents): + self._insert_into_stream(commit) + self._record_remapping(commit, orig_parents) + else: + rewrite_to = new_1st_parent or commit.first_parent() + commit.skip(new_id = rewrite_to) + if self._args.state_branch: + alias = Alias(commit.old_id or commit.id, rewrite_to or deleted_hash) + self._insert_into_stream(alias) + reset = Reset(commit.branch, rewrite_to or deleted_hash) + self._insert_into_stream(reset) + self._commit_renames[commit.original_id] = None + + # Show progress + self._num_commits += 1 + if not self._args.quiet: + self._progress_writer.show(self._parsed_message % self._num_commits) + + @staticmethod + def _do_tag_rename(rename_pair, tagname): + old, new = rename_pair.split(b':', 1) + old, new = b'refs/tags/'+old, b'refs/tags/'+new + if tagname.startswith(old): + return tagname.replace(old, new, 1) + return tagname + + def _tweak_tag(self, tag): + # Tweak the tag message according to callbacks + if self._args.replace_message: + for literal, replacement in self._args.replace_message['literals']: + tag.message = tag.message.replace(literal, replacement) + for regex, replacement in self._args.replace_message['regexes']: + tag.message = regex.sub(replacement, tag.message) + if self._message_callback: + tag.message = self._message_callback(tag.message) + + # Tweak the tag name according to tag-name-related callbacks + tag_prefix = b'refs/tags/' + fullref = tag_prefix+tag.ref + if self._args.tag_rename: + fullref = RepoFilter._do_tag_rename(self._args.tag_rename, fullref) + if self._refname_callback: + fullref = self._refname_callback(fullref) + if not fullref.startswith(tag_prefix): + msg = "Error: fast-import requires tags to be in refs/tags/ namespace." + msg += "\n {} renamed to {}".format(tag_prefix+tag.ref, fullref) + raise SystemExit(msg) + tag.ref = fullref[len(tag_prefix):] + + # Tweak the tagger according to callbacks + if self._args.mailmap: + tag.tagger_name, tag.tagger_email = \ + self._args.mailmap.translate(tag.tagger_name, tag.tagger_email) + if self._name_callback: + tag.tagger_name = self._name_callback(tag.tagger_name) + if self._email_callback: + tag.tagger_email = self._email_callback(tag.tagger_email) + + # Call general purpose tag callback + if self._tag_callback: + self._tag_callback(tag, self.callback_metadata()) + + def _tweak_reset(self, reset): + if self._args.tag_rename: + reset.ref = RepoFilter._do_tag_rename(self._args.tag_rename, reset.ref) + if self._refname_callback: + reset.ref = self._refname_callback(reset.ref) + if self._reset_callback: + self._reset_callback(reset, self.callback_metadata()) + + def results_tmp_dir(self, create_if_missing=True): + target_working_dir = self._args.target or b'.' + git_dir = GitUtils.determine_git_dir(target_working_dir) + d = os.path.join(git_dir, b'filter-repo') + if create_if_missing and not os.path.isdir(d): + os.mkdir(d) + return d + + def _load_marks_file(self, marks_basename): + full_branch = 'refs/heads/{}'.format(self._args.state_branch) + marks_file = os.path.join(self.results_tmp_dir(), marks_basename) + working_dir = self._args.target or b'.' + cmd = ['git', '-C', working_dir, 'show-ref', full_branch] + contents = b'' + if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0: + cmd = ['git', '-C', working_dir, 'show', + '%s:%s' % (full_branch, decode(marks_basename))] + try: + contents = subproc.check_output(cmd) + except subprocess.CalledProcessError as e: # pragma: no cover + raise SystemExit(_("Failed loading %s from %s") % + (decode(marks_basename), full_branch)) + if contents: + biggest_id = max(int(x.split()[0][1:]) for x in contents.splitlines()) + _IDS._next_id = max(_IDS._next_id, biggest_id+1) + with open(marks_file, 'bw') as f: + f.write(contents) + return marks_file + + def _save_marks_files(self): + basenames = [b'source-marks', b'target-marks'] + working_dir = self._args.target or b'.' + + # Check whether the branch exists + parent = [] + full_branch = 'refs/heads/{}'.format(self._args.state_branch) + cmd = ['git', '-C', working_dir, 'show-ref', full_branch] + if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0: + parent = ['-p', full_branch] + + # Run 'git hash-object $MARKS_FILE' for each marks file, save result + blob_hashes = {} + for marks_basename in basenames: + marks_file = os.path.join(self.results_tmp_dir(), marks_basename) + if not os.path.isfile(marks_file): # pragma: no cover + raise SystemExit(_("Failed to find %s to save to %s") + % (marks_file, self._args.state_branch)) + cmd = ['git', '-C', working_dir, 'hash-object', '-w', marks_file] + blob_hashes[marks_basename] = subproc.check_output(cmd).strip() + + # Run 'git mktree' to create a tree out of it + p = subproc.Popen(['git', '-C', working_dir, 'mktree'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE) + for b in basenames: + p.stdin.write(b'100644 blob %s\t%s\n' % (blob_hashes[b], b)) + p.stdin.close() + p.wait() + tree = p.stdout.read().strip() + + # Create the new commit + cmd = (['git', '-C', working_dir, 'commit-tree', '-m', 'New mark files', + tree] + parent) + commit = subproc.check_output(cmd).strip() + subproc.call(['git', '-C', working_dir, 'update-ref', full_branch, commit]) + + def importer_only(self): + self._run_sanity_checks() + self._setup_output() + + def set_output(self, outputRepoFilter): + assert outputRepoFilter._output + + # set_output implies this RepoFilter is doing exporting, though may not + # be the only one. + self._setup_input(use_done_feature = False) + + # Set our output management up to pipe to outputRepoFilter's locations + self._managed_output = False + self._output = outputRepoFilter._output + self._import_pipes = outputRepoFilter._import_pipes + + # Handle sanity checks, though currently none needed for export-only cases + self._run_sanity_checks() + + def _setup_input(self, use_done_feature): + if self._args.stdin: + self._input = sys.stdin.detach() + sys.stdin = None # Make sure no one tries to accidentally use it + self._fe_orig = None + else: + skip_blobs = (self._blob_callback is None and + self._args.replace_text is None and + self._args.source == self._args.target) + extra_flags = [] + if skip_blobs: + extra_flags.append('--no-data') + if self._args.max_blob_size: + self._unpacked_size, packed_size = GitUtils.get_blob_sizes() + if use_done_feature: + extra_flags.append('--use-done-feature') + if write_marks: + extra_flags.append(b'--mark-tags') + if self._args.state_branch: + assert(write_marks) + source_marks_file = self._load_marks_file(b'source-marks') + extra_flags.extend([b'--export-marks='+source_marks_file, + b'--import-marks='+source_marks_file]) + if self._args.preserve_commit_encoding is not None: # pragma: no cover + reencode = 'no' if self._args.preserve_commit_encoding else 'yes' + extra_flags.append('--reencode='+reencode) + location = ['-C', self._args.source] if self._args.source else [] + fep_cmd = ['git'] + location + ['fast-export', '--show-original-ids', + '--signed-tags=strip', '--tag-of-filtered-object=rewrite', + '--fake-missing-tagger', '--reference-excluded-parents' + ] + extra_flags + self._args.refs + self._fep = subproc.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE) + self._input = self._fep.stdout + if self._args.dry_run or self._args.debug: + self._fe_orig = os.path.join(self.results_tmp_dir(), + b'fast-export.original') + output = open(self._fe_orig, 'bw') + self._input = InputFileBackup(self._input, output) + if self._args.debug: + tmp = [decode(x) if isinstance(x, bytes) else x for x in fep_cmd] + print("[DEBUG] Running: {}".format(' '.join(tmp))) + print(" (saving a copy of the output at {})" + .format(decode(self._fe_orig))) + + def _setup_output(self): + if not self._args.dry_run: + location = ['-C', self._args.target] if self._args.target else [] + fip_cmd = ['git'] + location + ['-c', 'core.ignorecase=false', + 'fast-import', '--force', '--quiet'] + if date_format_permissive: + fip_cmd.append('--date-format=raw-permissive') + if self._args.state_branch: + target_marks_file = self._load_marks_file(b'target-marks') + fip_cmd.extend([b'--export-marks='+target_marks_file, + b'--import-marks='+target_marks_file]) + self._fip = subproc.Popen(fip_cmd, bufsize=-1, + stdin=subprocess.PIPE, stdout=subprocess.PIPE) + self._import_pipes = (self._fip.stdin, self._fip.stdout) + if self._args.dry_run or self._args.debug: + self._fe_filt = os.path.join(self.results_tmp_dir(), + b'fast-export.filtered') + self._output = open(self._fe_filt, 'bw') + else: + self._output = self._fip.stdin + if self._args.debug and not self._args.dry_run: + self._output = DualFileWriter(self._fip.stdin, self._output) + tmp = [decode(x) if isinstance(x, bytes) else x for x in fip_cmd] + print("[DEBUG] Running: {}".format(' '.join(tmp))) + print(" (using the following file as input: {})" + .format(decode(self._fe_filt))) + + def _migrate_origin_to_heads(self): + refs_to_migrate = set(x for x in self._orig_refs + if x.startswith(b'refs/remotes/origin/')) + if not refs_to_migrate: + return + if self._args.debug: + print("[DEBUG] Migrating refs/remotes/origin/* -> refs/heads/*") + target_working_dir = self._args.target or b'.' + p = subproc.Popen('git update-ref --no-deref --stdin'.split(), + stdin=subprocess.PIPE, cwd=target_working_dir) + for ref in refs_to_migrate: + if ref == b'refs/remotes/origin/HEAD': + p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref])) + del self._orig_refs[ref] + continue + newref = ref.replace(b'refs/remotes/origin/', b'refs/heads/') + if newref not in self._orig_refs: + p.stdin.write(b'create %s %s\n' % (newref, self._orig_refs[ref])) + p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref])) + self._orig_refs[newref] = self._orig_refs[ref] + del self._orig_refs[ref] + p.stdin.close() + if p.wait(): + raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover + + # Now remove + if self._args.debug: + print("[DEBUG] Removing 'origin' remote (rewritten history will no ") + print(" longer be related; consider re-pushing it elsewhere.") + subproc.call('git remote rm origin'.split(), cwd=target_working_dir) + + def _final_commands(self): + self._finalize_handled = True + self._done_callback and self._done_callback() + + if not self._args.quiet: + self._progress_writer.finish() + + def _ref_update(self, target_working_dir): + # Start the update-ref process + p = subproc.Popen('git update-ref --no-deref --stdin'.split(), + stdin=subprocess.PIPE, + cwd=target_working_dir) + + # Remove replace_refs from _orig_refs + replace_refs = {k:v for k, v in self._orig_refs.items() + if k.startswith(b'refs/replace/')} + reverse_replace_refs = collections.defaultdict(list) + for k,v in replace_refs.items(): + reverse_replace_refs[v].append(k) + all(map(self._orig_refs.pop, replace_refs)) + + # Remove unused refs + exported_refs, imported_refs = self.get_exported_and_imported_refs() + refs_to_nuke = exported_refs - imported_refs + if self._args.partial: + refs_to_nuke = set() + if refs_to_nuke and self._args.debug: + print("[DEBUG] Deleting the following refs:\n "+ + decode(b"\n ".join(refs_to_nuke))) + p.stdin.write(b''.join([b"delete %s\n" % x + for x in refs_to_nuke])) + + # Delete or update and add replace_refs; note that fast-export automatically + # handles 'update-no-add', we only need to take action for the other four + # choices for replace_refs. + self._flush_renames() + actual_renames = {k:v for k,v in self._commit_renames.items() if k != v} + if self._args.replace_refs in ['delete-no-add', 'delete-and-add']: + # Delete old replace refs, if unwanted + replace_refs_to_nuke = set(replace_refs) + if self._args.replace_refs == 'delete-and-add': + # git-update-ref won't allow us to update a ref twice, so be careful + # to avoid deleting refs we'll later update + replace_refs_to_nuke = replace_refs_to_nuke.difference( + [b'refs/replace/'+x for x in actual_renames]) + p.stdin.write(b''.join([b"delete %s\n" % x + for x in replace_refs_to_nuke])) + if self._args.replace_refs in ['delete-and-add', 'update-or-add', + 'update-and-add']: + # Add new replace refs + update_only = (self._args.replace_refs == 'update-or-add') + p.stdin.write(b''.join([b"update refs/replace/%s %s\n" % (old, new) + for old,new in actual_renames.items() + if new and not (update_only and + old in reverse_replace_refs)])) + + # Complete the update-ref process + p.stdin.close() + if p.wait(): + raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover + + def _record_metadata(self, metadata_dir, orig_refs): + self._flush_renames() + with open(os.path.join(metadata_dir, b'commit-map'), 'bw') as f: + f.write(("%-40s %s\n" % (_("old"), _("new"))).encode()) + for (old,new) in self._commit_renames.items(): + msg = b'%s %s\n' % (old, new if new != None else deleted_hash) + f.write(msg) + + exported_refs, imported_refs = self.get_exported_and_imported_refs() + + batch_check_process = None + batch_check_output_re = re.compile(b'^([0-9a-f]{40}) ([a-z]+) ([0-9]+)$') + with open(os.path.join(metadata_dir, b'ref-map'), 'bw') as f: + f.write(("%-40s %-40s %s\n" % (_("old"), _("new"), _("ref"))).encode()) + for refname, old_hash in orig_refs.items(): + if refname not in exported_refs: + continue + if refname not in imported_refs: + new_hash = deleted_hash + elif old_hash in self._commit_renames: + new_hash = self._commit_renames[old_hash] + new_hash = new_hash if new_hash != None else deleted_hash + else: # Must be either an annotated tag, or a ref whose tip was pruned + if not batch_check_process: + cmd = 'git cat-file --batch-check'.split() + target_working_dir = self._args.target or b'.' + batch_check_process = subproc.Popen(cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + cwd=target_working_dir) + batch_check_process.stdin.write(refname+b"\n") + batch_check_process.stdin.flush() + line = batch_check_process.stdout.readline() + m = batch_check_output_re.match(line) + if m and m.group(2) in (b'tag', b'commit'): + new_hash = m.group(1) + elif line.endswith(b' missing\n'): + new_hash = deleted_hash + else: + raise SystemExit(_("Failed to find new id for %(refname)s " + "(old id was %(old_hash)s)") + % ({'refname': refname, 'old_hash': old_hash}) + ) # pragma: no cover + f.write(b'%s %s %s\n' % (old_hash, new_hash, refname)) + if self._args.source or self._args.target: + new_refs = GitUtils.get_refs(self._args.target or b'.') + for ref, new_hash in new_refs.items(): + if ref not in orig_refs and not ref.startswith(b'refs/replace/'): + old_hash = b'0'*len(new_hash) + f.write(b'%s %s %s\n' % (old_hash, new_hash, ref)) + if batch_check_process: + batch_check_process.stdin.close() + batch_check_process.wait() + + with open(os.path.join(metadata_dir, b'suboptimal-issues'), 'bw') as f: + issues_found = False + if self._commits_no_longer_merges: + issues_found = True + + f.write(textwrap.dedent(_(''' + The following commits used to be merge commits but due to filtering + are now regular commits; they likely have suboptimal commit messages + (e.g. "Merge branch next into master"). Original commit hash on the + left, commit hash after filtering/rewriting on the right: + ''')[1:]).encode()) + for oldhash, newhash in self._commits_no_longer_merges: + f.write(' {} {}\n'.format(oldhash, newhash).encode()) + f.write(b'\n') + + if self._commits_referenced_but_removed: + issues_found = True + f.write(textwrap.dedent(_(''' + The following commits were filtered out, but referenced in another + commit message. The reference to the now-nonexistent commit hash + (or a substring thereof) was left as-is in any commit messages: + ''')[1:]).encode()) + for bad_commit_reference in self._commits_referenced_but_removed: + f.write(' {}\n'.format(bad_commit_reference).encode()) + f.write(b'\n') + + if not issues_found: + f.write(_("No filtering problems encountered.\n").encode()) + + with open(os.path.join(metadata_dir, b'already_ran'), 'bw') as f: + f.write(_("This file exists to allow you to filter again without --force.\n").encode()) + + def finish(self): + ''' Alternative to run() when there is no input of our own to parse, + meaning that run only really needs to close the handle to fast-import + and let it finish, thus making a call to "run" feel like a misnomer. ''' + assert not self._input + assert self._managed_output + self.run() + + def insert(self, obj, direct_insertion = False): + if not direct_insertion: + if type(obj) == Blob: + self._tweak_blob(obj) + elif type(obj) == Commit: + aux_info = {'orig_parents': obj.parents, + 'had_file_changes': bool(obj.file_changes)} + self._tweak_commit(obj, aux_info) + elif type(obj) == Reset: + self._tweak_reset(obj) + elif type(obj) == Tag: + self._tweak_tag(obj) + self._insert_into_stream(obj) + + def _insert_into_stream(self, obj): + if not obj.dumped: + if self._parser: + self._parser.insert(obj) + else: + obj.dump(self._output) + + def get_exported_and_imported_refs(self): + return self._parser.get_exported_and_imported_refs() + + def run(self): + start = time.time() + if not self._input and not self._output: + self._run_sanity_checks() + if not self._args.dry_run and not self._args.partial: + self._migrate_origin_to_heads() + self._setup_input(use_done_feature = True) + self._setup_output() + assert self._sanity_checks_handled + + if self._input: + # Create and run the filter + self._repo_working_dir = self._args.source or b'.' + self._parser = FastExportParser(blob_callback = self._tweak_blob, + commit_callback = self._tweak_commit, + tag_callback = self._tweak_tag, + reset_callback = self._tweak_reset, + done_callback = self._final_commands) + self._parser.run(self._input, self._output) + if not self._finalize_handled: + self._final_commands() + + # Make sure fast-export completed successfully + if not self._args.stdin and self._fep.wait(): + raise SystemExit(_("Error: fast-export failed; see above.")) # pragma: no cover + self._input.close() + + # If we're not the manager of self._output, we should avoid post-run cleanup + if not self._managed_output: + return + + # Close the output and ensure fast-import successfully completes + self._output.close() + if not self._args.dry_run and self._fip.wait(): + raise SystemExit(_("Error: fast-import failed; see above.")) # pragma: no cover + + # With fast-export and fast-import complete, update state if requested + if self._args.state_branch: + self._save_marks_files() + + # Notify user how long it took, before doing a gc and such + msg = "New history written in {:.2f} seconds..." + if self._args.repack: + msg = "New history written in {:.2f} seconds; now repacking/cleaning..." + print(msg.format(time.time()-start)) + + # Exit early, if requested + if self._args.dry_run: + print(_("NOTE: Not running fast-import or cleaning up; --dry-run passed.")) + if self._fe_orig: + print(_(" Requested filtering can be seen by comparing:")) + print(" " + decode(self._fe_orig)) + else: + print(_(" Requested filtering can be seen at:")) + print(" " + decode(self._fe_filt)) + return + + target_working_dir = self._args.target or b'.' + if self._input: + self._ref_update(target_working_dir) + + # Write out data about run + self._record_metadata(self.results_tmp_dir(), self._orig_refs) + + # Final cleanup: + # If we need a repack, then nuke the reflogs and repack. + # If we need a reset, do a reset --hard + reset = not GitUtils.is_repository_bare(target_working_dir) + RepoFilter.cleanup(target_working_dir, self._args.repack, reset, + run_quietly=self._args.quiet, + show_debuginfo=self._args.debug) + + # Let user know how long it took + print(_("Completely finished after {:.2f} seconds.") + .format(time.time()-start)) + +def main(): + setup_gettext() + args = FilteringOptions.parse_args(sys.argv[1:]) + if args.analyze: + RepoAnalyze.run(args) + else: + filter = RepoFilter(args) + filter.run() + +if __name__ == '__main__': + main() \ No newline at end of file