diff --git a/README.md b/README.md index d659bf3..5d921c2 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,4 @@ - src为源代码文件 - doc为作业文档 +123 \ No newline at end of file diff --git a/git-filter-repo b/git-filter-repo new file mode 100644 index 0000000..e6d2914 --- /dev/null +++ b/git-filter-repo @@ -0,0 +1,4984 @@ +#!/usr/bin/env python3 + +""" +git-filter-repo filters git repositories, similar to git filter-branch, BFG +repo cleaner, and others. The basic idea is that it works by running + git fast-export | filter | git fast-import +where this program not only launches the whole pipeline but also serves as +the 'filter' in the middle. It does a few additional things on top as well +in order to make it into a well-rounded filtering tool. + +git-filter-repo can also be used as a library for more involved filtering +operations; however: + ***** API BACKWARD COMPATIBILITY CAVEAT ***** + Programs using git-filter-repo as a library can reach pretty far into its + internals, but I am not prepared to guarantee backward compatibility of + all APIs. I suspect changes will be rare, but I reserve the right to + change any API. Since it is assumed that repository filtering is + something one would do very rarely, and in particular that it's a + one-shot operation, this should not be a problem in practice for anyone. + However, if you want to re-use a program you have written that uses + git-filter-repo as a library (or makes use of one of its --*-callback + arguments), you should either make sure you are using the same version of + git and git-filter-repo, or make sure to re-test it. + + If there are particular pieces of the API you are concerned about, and + there is not already a testcase for it in t9391-lib-usage.sh or + t9392-python-callback.sh, please contribute a testcase. That will not + prevent me from changing the API, but it will allow you to look at the + history of a testcase to see whether and how the API changed. + ***** END API BACKWARD COMPATIBILITY CAVEAT ***** +""" + +import argparse +import collections +import fnmatch +import gettext +import io +import os +import platform +import re +import shutil +import subprocess +import sys +import time +import textwrap + +from datetime import tzinfo, timedelta, datetime + +__all__ = ["Blob", "Reset", "FileChange", "Commit", "Tag", "Progress", + "Checkpoint", "FastExportParser", "ProgressWriter", + "string_to_date", "date_to_string", + "record_id_rename", "GitUtils", "FilteringOptions", "RepoFilter"] + +# The globals to make visible to callbacks. They will see all our imports for +# free, as well as our public API. 
+public_globals = ["__builtins__", "argparse", "collections", "fnmatch", + "gettext", "io", "os", "platform", "re", "shutil", + "subprocess", "sys", "time", "textwrap", "tzinfo", + "timedelta", "datetime"] + __all__ + +deleted_hash = b'0'*40 +write_marks = True +date_format_permissive = True + +def gettext_poison(msg): + if "GIT_TEST_GETTEXT_POISON" in os.environ: # pragma: no cover + return "# GETTEXT POISON #" + return gettext.gettext(msg) + +_ = gettext_poison + +def setup_gettext(): + TEXTDOMAIN="git-filter-repo" + podir = os.environ.get("GIT_TEXTDOMAINDIR") or "@@LOCALEDIR@@" + if not os.path.isdir(podir): # pragma: no cover + podir = None # Python has its own fallback; use that + + ## This looks like the most straightforward translation of the relevant + ## code in git.git:gettext.c and git.git:perl/Git/I18n.pm: + #import locale + #locale.setlocale(locale.LC_MESSAGES, ""); + #locale.setlocale(locale.LC_TIME, ""); + #locale.textdomain(TEXTDOMAIN); + #locale.bindtextdomain(TEXTDOMAIN, podir); + ## but the python docs suggest using the gettext module (which doesn't + ## have setlocale()) instead, so: + gettext.textdomain(TEXTDOMAIN); + gettext.bindtextdomain(TEXTDOMAIN, podir); + +def _timedelta_to_seconds(delta): + """ + Converts timedelta to seconds + """ + offset = delta.days*86400 + delta.seconds + (delta.microseconds+0.0)/1000000 + return round(offset) + +class FixedTimeZone(tzinfo): + """ + Fixed offset in minutes east from UTC. + """ + + tz_re = re.compile(br'^([-+]?)(\d\d)(\d\d)$') + + def __init__(self, offset_string): + tzinfo.__init__(self) + sign, hh, mm = FixedTimeZone.tz_re.match(offset_string).groups() + factor = -1 if (sign and sign == b'-') else 1 + self._offset = timedelta(minutes = factor*(60*int(hh) + int(mm))) + self._offset_string = offset_string + + def utcoffset(self, dt): + return self._offset + + def tzname(self, dt): + return self._offset_string + + def dst(self, dt): + return timedelta(0) + +def string_to_date(datestring): + (unix_timestamp, tz_offset) = datestring.split() + return datetime.fromtimestamp(int(unix_timestamp), + FixedTimeZone(tz_offset)) + +def date_to_string(dateobj): + epoch = datetime.fromtimestamp(0, dateobj.tzinfo) + return(b'%d %s' % (int(_timedelta_to_seconds(dateobj - epoch)), + dateobj.tzinfo.tzname(0))) + +def decode(bytestr): + 'Try to convert bytestr to utf-8 for outputting as an error message.' + return bytestr.decode('utf-8', 'backslashreplace') + +def glob_to_regex(glob_bytestr): + 'Translate glob_bytestr into a regex on bytestrings' + + # fnmatch.translate is idiotic and won't accept bytestrings + if (decode(glob_bytestr).encode() != glob_bytestr): # pragma: no cover + raise SystemExit(_("Error: Cannot handle glob %s").format(glob_bytestr)) + + # Create regex operating on string + regex = fnmatch.translate(decode(glob_bytestr)) + + # FIXME: This is an ugly hack... + # fnmatch.translate tries to do multi-line matching and wants the glob to + # match up to the end of the input, which isn't relevant for us, so we + # have to modify the regex. fnmatch.translate has used different regex + # constructs to achieve this with different python versions, so we have + # to check for each of them and then fix it up. It would be much better + # if fnmatch.translate could just take some flags to allow us to specify + # what we want rather than employing this hackery, but since it + # doesn't... 
+ if regex.endswith(r'\Z(?ms)'): # pragma: no cover + regex = regex[0:-7] + elif regex.startswith(r'(?s:') and regex.endswith(r')\Z'): # pragma: no cover + regex = regex[4:-3] + + # Finally, convert back to regex operating on bytestr + return regex.encode() + +class PathQuoting: + _unescape = {b'a': b'\a', + b'b': b'\b', + b'f': b'\f', + b'n': b'\n', + b'r': b'\r', + b't': b'\t', + b'v': b'\v', + b'"': b'"', + b'\\':b'\\'} + _unescape_re = re.compile(br'\\([a-z"\\]|[0-9]{3})') + _escape = [bytes([x]) for x in range(127)]+[ + b'\\'+bytes(ord(c) for c in oct(x)[2:]) for x in range(127,256)] + _reverse = dict(map(reversed, _unescape.items())) + for x in _reverse: + _escape[ord(x)] = b'\\'+_reverse[x] + _special_chars = [len(x) > 1 for x in _escape] + + @staticmethod + def unescape_sequence(orig): + seq = orig.group(1) + return PathQuoting._unescape[seq] if len(seq) == 1 else bytes([int(seq, 8)]) + + @staticmethod + def dequote(quoted_string): + if quoted_string.startswith(b'"'): + assert quoted_string.endswith(b'"') + return PathQuoting._unescape_re.sub(PathQuoting.unescape_sequence, + quoted_string[1:-1]) + return quoted_string + + @staticmethod + def enquote(unquoted_string): + # Option 1: Quoting when fast-export would: + # pqsc = PathQuoting._special_chars + # if any(pqsc[x] for x in set(unquoted_string)): + # Option 2, perf hack: do minimal amount of quoting required by fast-import + if unquoted_string.startswith(b'"') or b'\n' in unquoted_string: + pqe = PathQuoting._escape + return b'"' + b''.join(pqe[x] for x in unquoted_string) + b'"' + return unquoted_string + +class AncestryGraph(object): + """ + A class that maintains a direct acycle graph of commits for the purpose of + determining if one commit is the ancestor of another. + + A note about identifiers in Commit objects: + * Commit objects have 2 identifiers: commit.old_id and commit.id, because: + * The original fast-export stream identified commits by an identifier. + This is often an integer, but is sometimes a hash (particularly when + --reference-excluded-parents is provided) + * The new fast-import stream we use may not use the same identifiers. + If new blobs or commits are inserted (such as lint-history does), then + the integer (or hash) are no longer valid. + + A note about identifiers in AncestryGraph objects, of which there are three: + * A given AncestryGraph is based on either commit.old_id or commit.id, but + not both. These are the keys for self.value. + * Using full hashes (occasionally) for children in self.graph felt + wasteful, so we use our own internal integer within self.graph. + self.value maps from commit {old_}id to our internal integer id. + * When working with commit.old_id, it is also sometimes useful to be able + to map these to the original hash, i.e. commit.original_id. So, we + also have self.git_hash for mapping from commit.old_id to git's commit + hash. + """ + + def __init__(self): + # The next internal identifier we will use; increments with every commit + # added to the AncestryGraph + self.cur_value = 0 + + # A mapping from the external identifers given to us to the simple integers + # we use in self.graph + self.value = {} + + # A tuple of (depth, list-of-ancestors). Values and keys in this graph are + # all integers from the (values of the) self.value dict. The depth of a + # commit is one more than the max depth of any of its ancestors. + self.graph = {} + + # A mapping from external identifier (i.e. from the keys of self.value) to + # the hash of the given commit. 
Only populated for graphs based on + # commit.old_id, since we won't know until later what the git_hash for + # graphs based on commit.id (since we have to wait for fast-import to + # create the commit and notify us of its hash; see _pending_renames). + # elsewhere + self.git_hash = {} + + # Reverse maps; only populated if needed. Caller responsible to check + # and ensure they are populated + self._reverse_value = {} + self._hash_to_id = {} + + # Cached results from previous calls to is_ancestor(). + self._cached_is_ancestor = {} + + def record_external_commits(self, external_commits): + """ + Record in graph that each commit in external_commits exists, and is + treated as a root commit with no parents. + """ + for c in external_commits: + if c not in self.value: + self.cur_value += 1 + self.value[c] = self.cur_value + self.graph[self.cur_value] = (1, []) + self.git_hash[c] = c + + def add_commit_and_parents(self, commit, parents, githash = None): + """ + Record in graph that commit has the given parents (all identified by + fast export stream identifiers, usually integers but sometimes hashes). + parents _MUST_ have been first recorded. commit _MUST_ not have been + recorded yet. Also, record the mapping between commit and githash, if + githash is given. + """ + assert all(p in self.value for p in parents) + assert commit not in self.value + + # Get values for commit and parents + self.cur_value += 1 + self.value[commit] = self.cur_value + if githash: + self.git_hash[commit] = githash + graph_parents = [self.value[x] for x in parents] + + # Determine depth for commit, then insert the info into the graph + depth = 1 + if parents: + depth += max(self.graph[p][0] for p in graph_parents) + self.graph[self.cur_value] = (depth, graph_parents) + + def record_hash(self, commit_id, githash): + ''' + If a githash was not recorded for commit_id, when add_commit_and_parents + was called, add it now. 
+ ''' + assert commit_id in self.value + assert commit_id not in self.git_hash + self.git_hash[commit_id] = githash + + def _ensure_reverse_maps_populated(self): + if not self._hash_to_id: + assert not self._reverse_value + self._hash_to_id = {v: k for k, v in self.git_hash.items()} + self._reverse_value = {v: k for k, v in self.value.items()} + + def get_parent_hashes(self, commit_hash): + ''' + Given a commit_hash, return its parents hashes + ''' + # + # We have to map: + # commit hash -> fast export stream id -> graph id + # then lookup + # parent graph ids for given graph id + # then we need to map + # parent graph ids -> parent fast export ids -> parent commit hashes + # + self._ensure_reverse_maps_populated() + commit_fast_export_id = self._hash_to_id[commit_hash] + commit_graph_id = self.value[commit_fast_export_id] + parent_graph_ids = self.graph[commit_graph_id][1] + parent_fast_export_ids = [self._reverse_value[x] for x in parent_graph_ids] + parent_hashes = [self.git_hash[x] for x in parent_fast_export_ids] + return parent_hashes + + def map_to_hash(self, commit_id): + ''' + Given a commit (by fast export stream id), return its hash + ''' + return self.git_hash.get(commit_id, None) + + def is_ancestor(self, possible_ancestor, check): + """ + Return whether possible_ancestor is an ancestor of check + """ + a, b = self.value[possible_ancestor], self.value[check] + original_pair = (a,b) + a_depth = self.graph[a][0] + ancestors = [b] + visited = set() + while ancestors: + ancestor = ancestors.pop() + prev_pair = (a, ancestor) + if prev_pair in self._cached_is_ancestor: + if not self._cached_is_ancestor[prev_pair]: + continue + self._cached_is_ancestor[original_pair] = True + return True + if ancestor in visited: + continue + visited.add(ancestor) + depth, more_ancestors = self.graph[ancestor] + if ancestor == a: + self._cached_is_ancestor[original_pair] = True + return True + elif depth <= a_depth: + continue + ancestors.extend(more_ancestors) + self._cached_is_ancestor[original_pair] = False + return False + +class MailmapInfo(object): + def __init__(self, filename): + self.changes = {} + self._parse_file(filename) + + def _parse_file(self, filename): + name_and_email_re = re.compile(br'(.*?)\s*<([^>]*)>\s*') + comment_re = re.compile(br'\s*#.*') + if not os.access(filename, os.R_OK): + raise SystemExit(_("Cannot read %s") % decode(filename)) + with open(filename, 'br') as f: + count = 0 + for line in f: + count += 1 + err = "Unparseable mailmap file: line #{} is bad: {}".format(count, line) + # Remove comments + line = comment_re.sub(b'', line) + # Remove leading and trailing whitespace + line = line.strip() + if not line: + continue + + m = name_and_email_re.match(line) + if not m: + raise SystemExit(err) + proper_name, proper_email = m.groups() + if len(line) == m.end(): + self.changes[(None, proper_email)] = (proper_name, proper_email) + continue + rest = line[m.end():] + m = name_and_email_re.match(rest) + if m: + commit_name, commit_email = m.groups() + if len(rest) != m.end(): + raise SystemExit(err) + else: + commit_name, commit_email = rest, None + self.changes[(commit_name, commit_email)] = (proper_name, proper_email) + + def translate(self, name, email): + ''' Given a name and email, return the expected new name and email from the + mailmap if there is a translation rule for it, otherwise just return + the given name and email.''' + for old, new in self.changes.items(): + old_name, old_email = old + new_name, new_email = new + if (old_email is None or email.lower() == 
old_email.lower()) and ( + name == old_name or not old_name): + return (new_name or name, new_email or email) + return (name, email) + +class ProgressWriter(object): + def __init__(self): + self._last_progress_update = time.time() + self._last_message = None + + def show(self, msg): + self._last_message = msg + now = time.time() + if now - self._last_progress_update > .1: + self._last_progress_update = now + sys.stdout.write("\r{}".format(msg)) + sys.stdout.flush() + + def finish(self): + self._last_progress_update = 0 + if self._last_message: + self.show(self._last_message) + sys.stdout.write("\n") + +class _IDs(object): + """ + A class that maintains the 'name domain' of all the 'marks' (short int + id for a blob/commit git object). There are two reasons this mechanism + is necessary: + (1) the output text of fast-export may refer to an object using a different + mark than the mark that was assigned to that object using IDS.new(). + (This class allows you to translate the fast-export marks, "old" to + the marks assigned from IDS.new(), "new"). + (2) when we prune a commit, its "old" id becomes invalid. Any commits + which had that commit as a parent needs to use the nearest unpruned + ancestor as its parent instead. + + Note that for purpose (1) above, this typically comes about because the user + manually creates Blob or Commit objects (for insertion into the stream). + It could also come about if we attempt to read the data from two different + repositories and trying to combine the data (git fast-export will number ids + from 1...n, and having two 1's, two 2's, two 3's, causes issues; granted, we + this scheme doesn't handle the two streams perfectly either, but if the first + fast export stream is entirely processed and handled before the second stream + is started, this mechanism may be sufficient to handle it). + """ + + def __init__(self): + """ + Init + """ + # The id for the next created blob/commit object + self._next_id = 1 + + # A map of old-ids to new-ids (1:1 map) + self._translation = {} + + # A map of new-ids to every old-id that points to the new-id (1:N map) + self._reverse_translation = {} + + def has_renames(self): + """ + Return whether there have been ids remapped to new values + """ + return bool(self._translation) + + def new(self): + """ + Should be called whenever a new blob or commit object is created. The + returned value should be used as the id/mark for that object. + """ + rv = self._next_id + self._next_id += 1 + return rv + + def record_rename(self, old_id, new_id, handle_transitivity = False): + """ + Record that old_id is being renamed to new_id. + """ + if old_id != new_id or old_id in self._translation: + # old_id -> new_id + self._translation[old_id] = new_id + + # Transitivity will be needed if new commits are being inserted mid-way + # through a branch. + if handle_transitivity: + # Anything that points to old_id should point to new_id + if old_id in self._reverse_translation: + for id_ in self._reverse_translation[old_id]: + self._translation[id_] = new_id + + # Record that new_id is pointed to by old_id + if new_id not in self._reverse_translation: + self._reverse_translation[new_id] = [] + self._reverse_translation[new_id].append(old_id) + + def translate(self, old_id): + """ + If old_id has been mapped to an alternate id, return the alternate id. 
+ """ + if old_id in self._translation: + return self._translation[old_id] + else: + return old_id + + def __str__(self): + """ + Convert IDs to string; used for debugging + """ + rv = "Current count: %d\nTranslation:\n" % self._next_id + for k in sorted(self._translation): + rv += " %d -> %s\n" % (k, self._translation[k]) + + rv += "Reverse translation:\n" + reverse_keys = list(self._reverse_translation.keys()) + if None in reverse_keys: # pragma: no cover + reverse_keys.remove(None) + reverse_keys = sorted(reverse_keys) + reverse_keys.append(None) + for k in reverse_keys: + rv += " " + str(k) + " -> " + str(self._reverse_translation[k]) + "\n" + + return rv + +class _GitElement(object): + """ + The base class for all git elements that we create. + """ + + def __init__(self): + # A string that describes what type of Git element this is + self.type = None + + # A flag telling us if this Git element has been dumped + # (i.e. printed) or skipped. Typically elements that have been + # dumped or skipped will not be dumped again. + self.dumped = 0 + + def dump(self, file_): + """ + This version should never be called. Derived classes need to + override! We should note that subclasses should implement this + method such that the output would match the format produced by + fast-export. + """ + raise SystemExit(_("Unimplemented function: %s") % type(self).__name__ + +".dump()") # pragma: no cover + + def __bytes__(self): + """ + Convert GitElement to bytestring; used for debugging + """ + old_dumped = self.dumped + writeme = io.BytesIO() + self.dump(writeme) + output_lines = writeme.getvalue().splitlines() + writeme.close() + self.dumped = old_dumped + return b"%s:\n %s" % (type(self).__name__.encode(), + b"\n ".join(output_lines)) + + def skip(self, new_id=None): + """ + Ensures this element will not be written to output + """ + self.dumped = 2 + +class _GitElementWithId(_GitElement): + """ + The base class for Git elements that have IDs (commits and blobs) + """ + + def __init__(self): + _GitElement.__init__(self) + + # The mark (short, portable id) for this element + self.id = _IDS.new() + + # The previous mark for this element + self.old_id = None + + def skip(self, new_id=None): + """ + This element will no longer be automatically written to output. When a + commit gets skipped, it's ID will need to be translated to that of its + parent. + """ + self.dumped = 2 + + _IDS.record_rename(self.old_id or self.id, new_id) + +class Blob(_GitElementWithId): + """ + This class defines our representation of git blob elements (i.e. our + way of representing file contents). + """ + + def __init__(self, data, original_id = None): + _GitElementWithId.__init__(self) + + # Denote that this is a blob + self.type = 'blob' + + # Record original id + self.original_id = original_id + + # Stores the blob's data + assert(type(data) == bytes) + self.data = data + + def dump(self, file_): + """ + Write this blob element to a file. + """ + self.dumped = 1 + BLOB_HASH_TO_NEW_ID[self.original_id] = self.id + BLOB_NEW_ID_TO_HASH[self.id] = self.original_id + + file_.write(b'blob\n') + file_.write(b'mark :%d\n' % self.id) + file_.write(b'data %d\n%s' % (len(self.data), self.data)) + file_.write(b'\n') + + +class Reset(_GitElement): + """ + This class defines our representation of git reset elements. A reset + event is the creation (or recreation) of a named branch, optionally + starting from a specific revision). 
+ """ + + def __init__(self, ref, from_ref = None): + _GitElement.__init__(self) + + # Denote that this is a reset + self.type = 'reset' + + # The name of the branch being (re)created + self.ref = ref + + # Some reference to the branch/commit we are resetting from + self.from_ref = from_ref + + def dump(self, file_): + """ + Write this reset element to a file + """ + self.dumped = 1 + + file_.write(b'reset %s\n' % self.ref) + if self.from_ref: + if isinstance(self.from_ref, int): + file_.write(b'from :%d\n' % self.from_ref) + else: + file_.write(b'from %s\n' % self.from_ref) + file_.write(b'\n') + +class FileChange(_GitElement): + """ + This class defines our representation of file change elements. File change + elements are components within a Commit element. + """ + + def __init__(self, type_, filename = None, id_ = None, mode = None): + _GitElement.__init__(self) + + # Denote the type of file-change (b'M' for modify, b'D' for delete, etc) + # We could + # assert(type(type_) == bytes) + # here but I don't just due to worries about performance overhead... + self.type = type_ + + # Record the name of the file being changed + self.filename = filename + + # Record the mode (mode describes type of file entry (non-executable, + # executable, or symlink)). + self.mode = mode + + # blob_id is the id (mark) of the affected blob + self.blob_id = id_ + + if type_ == b'DELETEALL': + assert filename is None and id_ is None and mode is None + self.filename = b'' # Just so PathQuoting.enquote doesn't die + else: + assert filename is not None + + if type_ == b'M': + assert id_ is not None and mode is not None + elif type_ == b'D': + assert id_ is None and mode is None + elif type_ == b'R': # pragma: no cover (now avoid fast-export renames) + assert mode is None + if id_ is None: + raise SystemExit(_("new name needed for rename of %s") % filename) + self.filename = (self.filename, id_) + self.blob_id = None + + def dump(self, file_): + """ + Write this file-change element to a file + """ + skipped_blob = (self.type == b'M' and self.blob_id is None) + if skipped_blob: return + self.dumped = 1 + + quoted_filename = PathQuoting.enquote(self.filename) + if self.type == b'M' and isinstance(self.blob_id, int): + file_.write(b'M %s :%d %s\n' % (self.mode, self.blob_id, quoted_filename)) + elif self.type == b'M': + file_.write(b'M %s %s %s\n' % (self.mode, self.blob_id, quoted_filename)) + elif self.type == b'D': + file_.write(b'D %s\n' % quoted_filename) + elif self.type == b'DELETEALL': + file_.write(b'deleteall\n') + else: + raise SystemExit(_("Unhandled filechange type: %s") % self.type) # pragma: no cover + +class Commit(_GitElementWithId): + """ + This class defines our representation of commit elements. Commit elements + contain all the information associated with a commit. 
+ """ + + def __init__(self, branch, + author_name, author_email, author_date, + committer_name, committer_email, committer_date, + message, + file_changes, + parents, + original_id = None, + encoding = None, # encoding for message; None implies UTF-8 + **kwargs): + _GitElementWithId.__init__(self) + self.old_id = self.id + + # Denote that this is a commit element + self.type = 'commit' + + # Record the affected branch + self.branch = branch + + # Record original id + self.original_id = original_id + + # Record author's name + self.author_name = author_name + + # Record author's email + self.author_email = author_email + + # Record date of authoring + self.author_date = author_date + + # Record committer's name + self.committer_name = committer_name + + # Record committer's email + self.committer_email = committer_email + + # Record date the commit was made + self.committer_date = committer_date + + # Record commit message and its encoding + self.encoding = encoding + self.message = message + + # List of file-changes associated with this commit. Note that file-changes + # are also represented as git elements + self.file_changes = file_changes + + self.parents = parents + + def dump(self, file_): + """ + Write this commit element to a file. + """ + self.dumped = 1 + + # Make output to fast-import slightly easier for humans to read if the + # message has no trailing newline of its own; cosmetic, but a nice touch... + extra_newline = b'\n' + if self.message.endswith(b'\n') or not (self.parents or self.file_changes): + extra_newline = b'' + + if not self.parents: + file_.write(b'reset %s\n' % self.branch) + file_.write((b'commit %s\n' + b'mark :%d\n' + b'author %s <%s> %s\n' + b'committer %s <%s> %s\n' + ) % ( + self.branch, self.id, + self.author_name, self.author_email, self.author_date, + self.committer_name, self.committer_email, self.committer_date + )) + if self.encoding: + file_.write(b'encoding %s\n' % self.encoding) + file_.write(b'data %d\n%s%s' % + (len(self.message), self.message, extra_newline)) + for i, parent in enumerate(self.parents): + file_.write(b'from ' if i==0 else b'merge ') + if isinstance(parent, int): + file_.write(b':%d\n' % parent) + else: + file_.write(b'%s\n' % parent) + for change in self.file_changes: + change.dump(file_) + if not self.parents and not self.file_changes: + # Workaround a bug in pre-git-2.22 versions of fast-import with + # the get-mark directive. + file_.write(b'\n') + file_.write(b'\n') + + def first_parent(self): + """ + Return first parent commit + """ + if self.parents: + return self.parents[0] + return None + + def skip(self, new_id=None): + _SKIPPED_COMMITS.add(self.old_id or self.id) + _GitElementWithId.skip(self, new_id) + +class Tag(_GitElementWithId): + """ + This class defines our representation of annotated tag elements. 
+ """ + + def __init__(self, ref, from_ref, + tagger_name, tagger_email, tagger_date, tag_msg, + original_id = None): + _GitElementWithId.__init__(self) + self.old_id = self.id + + # Denote that this is a tag element + self.type = 'tag' + + # Store the name of the tag + self.ref = ref + + # Store the entity being tagged (this should be a commit) + self.from_ref = from_ref + + # Record original id + self.original_id = original_id + + # Store the name of the tagger + self.tagger_name = tagger_name + + # Store the email of the tagger + self.tagger_email = tagger_email + + # Store the date + self.tagger_date = tagger_date + + # Store the tag message + self.message = tag_msg + + def dump(self, file_): + """ + Write this tag element to a file + """ + + self.dumped = 1 + + file_.write(b'tag %s\n' % self.ref) + if (write_marks and self.id): + file_.write(b'mark :%d\n' % self.id) + markfmt = b'from :%d\n' if isinstance(self.from_ref, int) else b'from %s\n' + file_.write(markfmt % self.from_ref) + if self.tagger_name: + file_.write(b'tagger %s <%s> ' % (self.tagger_name, self.tagger_email)) + file_.write(self.tagger_date) + file_.write(b'\n') + file_.write(b'data %d\n%s' % (len(self.message), self.message)) + file_.write(b'\n') + +class Progress(_GitElement): + """ + This class defines our representation of progress elements. The progress + element only contains a progress message, which is printed by fast-import + when it processes the progress output. + """ + + def __init__(self, message): + _GitElement.__init__(self) + + # Denote that this is a progress element + self.type = 'progress' + + # Store the progress message + self.message = message + + def dump(self, file_): + """ + Write this progress element to a file + """ + self.dumped = 1 + + file_.write(b'progress %s\n' % self.message) + file_.write(b'\n') + +class Checkpoint(_GitElement): + """ + This class defines our representation of checkpoint elements. These + elements represent events which force fast-import to close the current + packfile, start a new one, and to save out all current branch refs, tags + and marks. + """ + + def __init__(self): + _GitElement.__init__(self) + + # Denote that this is a checkpoint element + self.type = 'checkpoint' + + def dump(self, file_): + """ + Write this checkpoint element to a file + """ + self.dumped = 1 + + file_.write(b'checkpoint\n') + file_.write(b'\n') + +class LiteralCommand(_GitElement): + """ + This class defines our representation of commands. The literal command + includes only a single line, and is not processed in any special way. + """ + + def __init__(self, line): + _GitElement.__init__(self) + + # Denote that this is a literal element + self.type = 'literal' + + # Store the command + self.line = line + + def dump(self, file_): + """ + Write this progress element to a file + """ + self.dumped = 1 + + file_.write(self.line) + +class Alias(_GitElement): + """ + This class defines our representation of fast-import alias elements. An + alias element is the setting of one mark to the same sha1sum as another, + usually because the newer mark corresponded to a pruned commit. 
+ """ + + def __init__(self, ref, to_ref): + _GitElement.__init__(self) + # Denote that this is a reset + self.type = 'alias' + + self.ref = ref + self.to_ref = to_ref + + def dump(self, file_): + """ + Write this reset element to a file + """ + self.dumped = 1 + + file_.write(b'alias\nmark :%d\nto :%d\n\n' % (self.ref, self.to_ref)) + +class FastExportParser(object): + """ + A class for parsing and handling the output from fast-export. This + class allows the user to register callbacks when various types of + data are encountered in the fast-export output. The basic idea is that, + FastExportParser takes fast-export output, creates the various objects + as it encounters them, the user gets to use/modify these objects via + callbacks, and finally FastExportParser outputs the modified objects + in fast-import format (presumably so they can be used to create a new + repo). + """ + + def __init__(self, + tag_callback = None, commit_callback = None, + blob_callback = None, progress_callback = None, + reset_callback = None, checkpoint_callback = None, + done_callback = None): + # Members below simply store callback functions for the various git + # elements + self._tag_callback = tag_callback + self._blob_callback = blob_callback + self._reset_callback = reset_callback + self._commit_callback = commit_callback + self._progress_callback = progress_callback + self._checkpoint_callback = checkpoint_callback + self._done_callback = done_callback + + # Keep track of which refs appear from the export, and which make it to + # the import (pruning of empty commits, renaming of refs, and creating + # new manual objects and inserting them can cause these to differ). + self._exported_refs = set() + self._imported_refs = set() + + # A list of the branches we've seen, plus the last known commit they + # pointed to. An entry in latest_*commit will be deleted if we get a + # reset for that branch. These are used because of fast-import's weird + # decision to allow having an implicit parent via naming the branch + # instead of requiring branches to be specified via 'from' directives. + self._latest_commit = {} + self._latest_orig_commit = {} + + # A handle to the input source for the fast-export data + self._input = None + + # A handle to the output file for the output we generate (we call dump + # on many of the git elements we create). + self._output = None + + # Stores the contents of the current line of input being parsed + self._currentline = '' + + # Tracks LFS objects we have found + self._lfs_object_tracker = None + + # Compile some regexes and cache those + self._mark_re = re.compile(br'mark :(\d+)\n$') + self._parent_regexes = {} + parent_regex_rules = (br' :(\d+)\n$', br' ([0-9a-f]{40})\n') + for parent_refname in (b'from', b'merge'): + ans = [re.compile(parent_refname+x) for x in parent_regex_rules] + self._parent_regexes[parent_refname] = ans + self._quoted_string_re = re.compile(br'"(?:[^"\\]|\\.)*"') + self._refline_regexes = {} + for refline_name in (b'reset', b'commit', b'tag', b'progress'): + self._refline_regexes[refline_name] = re.compile(refline_name+b' (.*)\n$') + self._user_regexes = {} + for user in (b'author', b'committer', b'tagger'): + self._user_regexes[user] = re.compile(user + b' (.*?) 
<(.*?)> (.*)\n$') + + def _advance_currentline(self): + """ + Grab the next line of input + """ + self._currentline = self._input.readline() + + def _parse_optional_mark(self): + """ + If the current line contains a mark, parse it and advance to the + next line; return None otherwise + """ + mark = None + matches = self._mark_re.match(self._currentline) + if matches: + mark = int(matches.group(1)) + self._advance_currentline() + return mark + + def _parse_optional_parent_ref(self, refname): + """ + If the current line contains a reference to a parent commit, then + parse it and advance the current line; otherwise return None. Note + that the name of the reference ('from', 'merge') must match the + refname arg. + """ + orig_baseref, baseref = None, None + rule, altrule = self._parent_regexes[refname] + matches = rule.match(self._currentline) + if matches: + orig_baseref = int(matches.group(1)) + # We translate the parent commit mark to what it needs to be in + # our mark namespace + baseref = _IDS.translate(orig_baseref) + self._advance_currentline() + else: + matches = altrule.match(self._currentline) + if matches: + orig_baseref = matches.group(1) + baseref = orig_baseref + self._advance_currentline() + return orig_baseref, baseref + + def _parse_optional_filechange(self): + """ + If the current line contains a file-change object, then parse it + and advance the current line; otherwise return None. We only care + about file changes of type b'M' and b'D' (these are the only types + of file-changes that fast-export will provide). + """ + filechange = None + changetype = self._currentline[0:1] + if changetype == b'M': + (changetype, mode, idnum, path) = self._currentline.split(None, 3) + if idnum[0:1] == b':': + idnum = idnum[1:] + path = path.rstrip(b'\n') + # Check for LFS objects from sources before we might toss this filechange + if self._lfs_object_tracker: + value = int(idnum) if len(idnum) != 40 else idnum + self._lfs_object_tracker.check_file_change_data(value, True) + # We translate the idnum to our id system + if len(idnum) != 40: + idnum = _IDS.translate( int(idnum) ) + if idnum is not None: + if path.startswith(b'"'): + path = PathQuoting.dequote(path) + filechange = FileChange(b'M', path, idnum, mode) + else: + filechange = b'skipped' + self._advance_currentline() + elif changetype == b'D': + (changetype, path) = self._currentline.split(None, 1) + path = path.rstrip(b'\n') + if path.startswith(b'"'): + path = PathQuoting.dequote(path) + filechange = FileChange(b'D', path) + self._advance_currentline() + elif changetype == b'R': # pragma: no cover (now avoid fast-export renames) + rest = self._currentline[2:-1] + if rest.startswith(b'"'): + m = self._quoted_string_re.match(rest) + if not m: + raise SystemExit(_("Couldn't parse rename source")) + orig = PathQuoting.dequote(m.group(0)) + new = rest[m.end()+1:] + else: + orig, new = rest.split(b' ', 1) + if new.startswith(b'"'): + new = PathQuoting.dequote(new) + filechange = FileChange(b'R', orig, new) + self._advance_currentline() + return filechange + + def _parse_original_id(self): + original_id = self._currentline[len(b'original-oid '):].rstrip() + self._advance_currentline() + return original_id + + def _parse_encoding(self): + encoding = self._currentline[len(b'encoding '):].rstrip() + self._advance_currentline() + return encoding + + def _parse_ref_line(self, refname): + """ + Parses string data (often a branch name) from current-line. The name of + the string data must match the refname arg. 
The program will crash if + current-line does not match, so current-line will always be advanced if + this method returns. + """ + matches = self._refline_regexes[refname].match(self._currentline) + if not matches: + raise SystemExit(_("Malformed %(refname)s line: '%(line)s'") % + ({'refname': refname, 'line':self._currentline}) + ) # pragma: no cover + ref = matches.group(1) + self._advance_currentline() + return ref + + def _parse_user(self, usertype): + """ + Get user name, email, datestamp from current-line. Current-line will + be advanced. + """ + user_regex = self._user_regexes[usertype] + (name, email, when) = user_regex.match(self._currentline).groups() + + self._advance_currentline() + return (name, email, when) + + def _parse_data(self): + """ + Reads data from _input. Current-line will be advanced until it is beyond + the data. + """ + fields = self._currentline.split() + assert fields[0] == b'data' + size = int(fields[1]) + data = self._input.read(size) + self._advance_currentline() + if self._currentline == b'\n': + self._advance_currentline() + return data + + def _parse_blob(self): + """ + Parse input data into a Blob object. Once the Blob has been created, it + will be handed off to the appropriate callbacks. Current-line will be + advanced until it is beyond this blob's data. The Blob will be dumped + to _output once everything else is done (unless it has been skipped by + the callback). + """ + # Parse the Blob + self._advance_currentline() + id_ = self._parse_optional_mark() + + original_id = None + if self._currentline.startswith(b'original-oid'): + original_id = self._parse_original_id(); + + data = self._parse_data() + if self._currentline == b'\n': + self._advance_currentline() + + # Create the blob + blob = Blob(data, original_id) + + # If fast-export text had a mark for this blob, need to make sure this + # mark translates to the blob's true id. + if id_: + blob.old_id = id_ + _IDS.record_rename(id_, blob.id) + + # Check for LFS objects + if self._lfs_object_tracker: + self._lfs_object_tracker.check_blob_data(data, blob.old_id, True) + + # Call any user callback to allow them to use/modify the blob + if self._blob_callback: + self._blob_callback(blob) + + # Now print the resulting blob + if not blob.dumped: + blob.dump(self._output) + + def _parse_reset(self): + """ + Parse input data into a Reset object. Once the Reset has been created, + it will be handed off to the appropriate callbacks. Current-line will + be advanced until it is beyond the reset data. The Reset will be dumped + to _output once everything else is done (unless it has been skipped by + the callback). + """ + # Parse the Reset + ref = self._parse_ref_line(b'reset') + self._exported_refs.add(ref) + ignoreme, from_ref = self._parse_optional_parent_ref(b'from') + if self._currentline == b'\n': + self._advance_currentline() + + # fast-export likes to print extraneous resets that serve no purpose. + # While we could continue processing such resets, that is a waste of + # resources. Also, we want to avoid recording that this ref was + # seen in such cases, since this ref could be rewritten to nothing. 
+ if not from_ref: + self._latest_commit.pop(ref, None) + self._latest_orig_commit.pop(ref, None) + return + + # Create the reset + reset = Reset(ref, from_ref) + + # Call any user callback to allow them to modify the reset + if self._reset_callback: + self._reset_callback(reset) + + # Update metadata + self._latest_commit[reset.ref] = reset.from_ref + self._latest_orig_commit[reset.ref] = reset.from_ref + + # Now print the resulting reset + if not reset.dumped: + self._imported_refs.add(reset.ref) + reset.dump(self._output) + + def _parse_commit(self): + """ + Parse input data into a Commit object. Once the Commit has been created, + it will be handed off to the appropriate callbacks. Current-line will + be advanced until it is beyond the commit data. The Commit will be dumped + to _output once everything else is done (unless it has been skipped by + the callback OR the callback has removed all file-changes from the commit). + """ + # Parse the Commit. This may look involved, but it's pretty simple; it only + # looks bad because a commit object contains many pieces of data. + branch = self._parse_ref_line(b'commit') + self._exported_refs.add(branch) + id_ = self._parse_optional_mark() + + original_id = None + if self._currentline.startswith(b'original-oid'): + original_id = self._parse_original_id(); + + author_name = None + author_email = None + if self._currentline.startswith(b'author'): + (author_name, author_email, author_date) = self._parse_user(b'author') + + (committer_name, committer_email, committer_date) = \ + self._parse_user(b'committer') + + if not author_name and not author_email: + (author_name, author_email, author_date) = \ + (committer_name, committer_email, committer_date) + + encoding = None + if self._currentline.startswith(b'encoding '): + encoding = self._parse_encoding() + + commit_msg = self._parse_data() + + pinfo = [self._parse_optional_parent_ref(b'from')] + # Due to empty pruning, we can have real 'from' and 'merge' lines that + # due to commit rewriting map to a parent of None. We need to record + # 'from' if its non-None, and we need to parse all 'merge' lines. + while self._currentline.startswith(b'merge '): + pinfo.append(self._parse_optional_parent_ref(b'merge')) + orig_parents, parents = [list(tmp) for tmp in zip(*pinfo)] + + # No parents is oddly represented as [None] instead of [], due to the + # special 'from' handling. Convert it here to a more canonical form. + if parents == [None]: + parents = [] + if orig_parents == [None]: + orig_parents = [] + + # fast-import format is kinda stupid in that it allows implicit parents + # based on the branch name instead of requiring them to be specified by + # 'from' directives. The only way to get no parent is by using a reset + # directive first, which clears the latest_commit_for_this_branch tracking. 
+ if not orig_parents and self._latest_commit.get(branch): + parents = [self._latest_commit[branch]] + if not orig_parents and self._latest_orig_commit.get(branch): + orig_parents = [self._latest_orig_commit[branch]] + + # Get the list of file changes + file_changes = [] + file_change = self._parse_optional_filechange() + had_file_changes = file_change is not None + while file_change: + if not (type(file_change) == bytes and file_change == b'skipped'): + file_changes.append(file_change) + file_change = self._parse_optional_filechange() + if self._currentline == b'\n': + self._advance_currentline() + + # Okay, now we can finally create the Commit object + commit = Commit(branch, + author_name, author_email, author_date, + committer_name, committer_email, committer_date, + commit_msg, file_changes, parents, original_id, encoding) + + # If fast-export text had a mark for this commit, need to make sure this + # mark translates to the commit's true id. + if id_: + commit.old_id = id_ + _IDS.record_rename(id_, commit.id) + + # refs/notes/ put commit-message-related material in blobs, and name their + # files according to the hash of other commits. That totally messes with + # all normal callbacks; fast-export should really export these as different + # kinds of objects. Until then, let's just pass these commits through as-is + # and hope the blob callbacks don't mess things up. + if commit.branch.startswith(b'refs/notes/'): + self._imported_refs.add(commit.branch) + commit.dump(self._output) + return + + # Call any user callback to allow them to modify the commit + aux_info = {'orig_parents': orig_parents, + 'had_file_changes': had_file_changes} + if self._commit_callback: + self._commit_callback(commit, aux_info) + + # Now print the resulting commit, or if prunable skip it + self._latest_orig_commit[branch] = commit.id + if not (commit.old_id or commit.id) in _SKIPPED_COMMITS: + self._latest_commit[branch] = commit.id + if not commit.dumped: + self._imported_refs.add(commit.branch) + commit.dump(self._output) + + def _parse_tag(self): + """ + Parse input data into a Tag object. Once the Tag has been created, + it will be handed off to the appropriate callbacks. Current-line will + be advanced until it is beyond the tag data. The Tag will be dumped + to _output once everything else is done (unless it has been skipped by + the callback). + """ + # Parse the Tag + tag = self._parse_ref_line(b'tag') + self._exported_refs.add(b'refs/tags/'+tag) + id_ = self._parse_optional_mark() + ignoreme, from_ref = self._parse_optional_parent_ref(b'from') + + original_id = None + if self._currentline.startswith(b'original-oid'): + original_id = self._parse_original_id(); + + tagger_name, tagger_email, tagger_date = None, None, None + if self._currentline.startswith(b'tagger'): + (tagger_name, tagger_email, tagger_date) = self._parse_user(b'tagger') + tag_msg = self._parse_data() + if self._currentline == b'\n': + self._advance_currentline() + + # Create the tag + tag = Tag(tag, from_ref, + tagger_name, tagger_email, tagger_date, tag_msg, + original_id) + + # If fast-export text had a mark for this tag, need to make sure this + # mark translates to the tag's true id. 
+ if id_: + tag.old_id = id_ + _IDS.record_rename(id_, tag.id) + + # Call any user callback to allow them to modify the tag + if self._tag_callback: + self._tag_callback(tag) + + # The tag might not point at anything that still exists (self.from_ref + # will be None if the commit it pointed to and all its ancestors were + # pruned due to being empty) + if tag.from_ref: + # Print out this tag's information + if not tag.dumped: + self._imported_refs.add(b'refs/tags/'+tag.ref) + tag.dump(self._output) + else: + tag.skip() + + def _parse_progress(self): + """ + Parse input data into a Progress object. Once the Progress has + been created, it will be handed off to the appropriate + callbacks. Current-line will be advanced until it is beyond the + progress data. The Progress will be dumped to _output once + everything else is done (unless it has been skipped by the callback). + """ + # Parse the Progress + message = self._parse_ref_line(b'progress') + if self._currentline == b'\n': + self._advance_currentline() + + # Create the progress message + progress = Progress(message) + + # Call any user callback to allow them to modify the progress messsage + if self._progress_callback: + self._progress_callback(progress) + + # NOTE: By default, we do NOT print the progress message; git + # fast-import would write it to fast_import_pipes which could mess with + # our parsing of output from the 'ls' and 'get-mark' directives we send + # to fast-import. If users want these messages, they need to process + # and handle them in the appropriate callback above. + + def _parse_checkpoint(self): + """ + Parse input data into a Checkpoint object. Once the Checkpoint has + been created, it will be handed off to the appropriate + callbacks. Current-line will be advanced until it is beyond the + checkpoint data. The Checkpoint will be dumped to _output once + everything else is done (unless it has been skipped by the callback). + """ + # Parse the Checkpoint + self._advance_currentline() + if self._currentline == b'\n': + self._advance_currentline() + + # Create the checkpoint + checkpoint = Checkpoint() + + # Call any user callback to allow them to drop the checkpoint + if self._checkpoint_callback: + self._checkpoint_callback(checkpoint) + + # NOTE: By default, we do NOT print the checkpoint message; although it + # we would only realistically get them with --stdin, the fact that we + # are filtering makes me think the checkpointing is less likely to be + # reasonable. In fact, I don't think it's necessary in general. If + # users do want it, they should process it in the checkpoint_callback. + + def _parse_literal_command(self): + """ + Parse literal command. Then just dump the line as is. + """ + # Create the literal command object + command = LiteralCommand(self._currentline) + self._advance_currentline() + + # Now print the resulting literal command + if not command.dumped: + command.dump(self._output) + + def insert(self, obj): + assert not obj.dumped + obj.dump(self._output) + if type(obj) == Commit: + self._imported_refs.add(obj.branch) + elif type(obj) in (Reset, Tag): + self._imported_refs.add(obj.ref) + + def run(self, input, output): + """ + This method filters fast export output. + """ + # Set input. If no args provided, use stdin. 
+ self._input = input + self._output = output + + # Run over the input and do the filtering + self._advance_currentline() + while self._currentline: + if self._currentline.startswith(b'blob'): + self._parse_blob() + elif self._currentline.startswith(b'reset'): + self._parse_reset() + elif self._currentline.startswith(b'commit'): + self._parse_commit() + elif self._currentline.startswith(b'tag'): + self._parse_tag() + elif self._currentline.startswith(b'progress'): + self._parse_progress() + elif self._currentline.startswith(b'checkpoint'): + self._parse_checkpoint() + elif self._currentline.startswith(b'feature'): + self._parse_literal_command() + elif self._currentline.startswith(b'option'): + self._parse_literal_command() + elif self._currentline.startswith(b'done'): + if self._done_callback: + self._done_callback() + self._parse_literal_command() + # Prevent confusion from others writing additional stuff that'll just + # be ignored + self._output.close() + elif self._currentline.startswith(b'#'): + self._parse_literal_command() + elif self._currentline.startswith(b'get-mark') or \ + self._currentline.startswith(b'cat-blob') or \ + self._currentline.startswith(b'ls'): + raise SystemExit(_("Unsupported command: '%s'") % self._currentline) + else: + raise SystemExit(_("Could not parse line: '%s'") % self._currentline) + + def get_exported_and_imported_refs(self): + return self._exported_refs, self._imported_refs + +def record_id_rename(old_id, new_id): + """ + Register a new translation + """ + handle_transitivity = True + _IDS.record_rename(old_id, new_id, handle_transitivity) + +# Internal globals +_IDS = _IDs() +_SKIPPED_COMMITS = set() +BLOB_HASH_TO_NEW_ID = {} +BLOB_NEW_ID_TO_HASH = {} +sdr_next_steps = _(""" +NEXT STEPS FOR YOUR SENSITIVE DATA REMOVAL: + * If you are doing your rewrite in multiple steps, ignore these next steps + until you have completed all your invocations of git-filter-repo. + * See the "Sensitive Data Removal" subsection of the "DISCUSSION" section + of the manual for more details about any of the steps below. + * Inspect this repository and verify that the sensitive data is indeed + completely removed from all commits. + * Force push the rewritten history to the server: + %s + * Contact the server admins for additional steps they need to take; the + First Changed Commit(s)%s may come in handy here. + * Have other colleagues with a clone either discard their clone and reclone + OR follow the detailed steps in the manual to repeatedly rebase and + purge the sensitive data from their copy. Again, the First Changed + Commit(s)%s may come in handy. + * See the "Prevent repeats and avoid future sensitive data spills" section + of the manual. 
+"""[1:]) + +class SubprocessWrapper(object): + @staticmethod + def decodify(args): + if type(args) == str: + return args + else: + assert type(args) == list + return [decode(x) if type(x)==bytes else x for x in args] + + @staticmethod + def call(*args, **kwargs): + if 'cwd' in kwargs: + kwargs['cwd'] = decode(kwargs['cwd']) + return subprocess.call(SubprocessWrapper.decodify(*args), **kwargs) + + @staticmethod + def check_output(*args, **kwargs): + if 'cwd' in kwargs: + kwargs['cwd'] = decode(kwargs['cwd']) + return subprocess.check_output(SubprocessWrapper.decodify(*args), **kwargs) + + @staticmethod + def check_call(*args, **kwargs): # pragma: no cover # used by filter-lamely + if 'cwd' in kwargs: + kwargs['cwd'] = decode(kwargs['cwd']) + return subprocess.check_call(SubprocessWrapper.decodify(*args), **kwargs) + + @staticmethod + def Popen(*args, **kwargs): + if 'cwd' in kwargs: + kwargs['cwd'] = decode(kwargs['cwd']) + return subprocess.Popen(SubprocessWrapper.decodify(*args), **kwargs) + +subproc = subprocess +if platform.system() == 'Windows' or 'PRETEND_UNICODE_ARGS' in os.environ: + subproc = SubprocessWrapper + +class GitUtils(object): + @staticmethod + def get_commit_count(repo, *args): + """ + Return the number of commits that have been made on repo. + """ + if not args: + args = ['--all'] + if len(args) == 1 and isinstance(args[0], list): + args = args[0] + p = subproc.Popen(["git", "rev-list", "--count"] + args, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=repo) + if p.wait() != 0: + raise SystemExit(_("%s does not appear to be a valid git repository") + % decode(repo)) + return int(p.stdout.read()) + + @staticmethod + def get_total_objects(repo): + """ + Return the number of objects (both packed and unpacked) + """ + p1 = subproc.Popen(["git", "count-objects", "-v"], + stdout=subprocess.PIPE, cwd=repo) + lines = p1.stdout.read().splitlines() + # Return unpacked objects + packed-objects + return int(lines[0].split()[1]) + int(lines[2].split()[1]) + + @staticmethod + def is_repository_bare(repo_working_dir): + out = subproc.check_output('git rev-parse --is-bare-repository'.split(), + cwd=repo_working_dir) + return (out.strip() == b'true') + + @staticmethod + def determine_git_dir(repo_working_dir): + d = subproc.check_output('git rev-parse --git-dir'.split(), + cwd=repo_working_dir).strip() + if repo_working_dir==b'.' or d.startswith(b'/'): + return d + return os.path.join(repo_working_dir, d) + + @staticmethod + def get_refs(repo_working_dir): + try: + output = subproc.check_output('git show-ref'.split(), + cwd=repo_working_dir) + except subprocess.CalledProcessError as e: + # If error code is 1, there just aren't any refs; i.e. new repo. + # If error code is other than 1, some other error (e.g. 
not a git repo) + if e.returncode != 1: + raise SystemExit('fatal: {}'.format(e)) + output = '' + return dict(reversed(x.split()) for x in output.splitlines()) + + @staticmethod + def get_config_settings(repo_working_dir): + output = '' + try: + output = subproc.check_output('git config --list --null'.split(), + cwd=repo_working_dir) + except subprocess.CalledProcessError as e: # pragma: no cover + raise SystemExit('fatal: {}'.format(e)) + + # FIXME: Ignores multi-valued keys, just let them overwrite for now + return dict(item.split(b'\n', maxsplit=1) + for item in output.strip().split(b"\0") if item) + + @staticmethod + def get_blob_sizes(quiet = False): + blob_size_progress = ProgressWriter() + num_blobs = 0 + processed_blobs_msg = _("Processed %d blob sizes") + + # Get sizes of blobs by sha1 + cmd = '--batch-check=%(objectname) %(objecttype) ' + \ + '%(objectsize) %(objectsize:disk)' + cf = subproc.Popen(['git', 'cat-file', '--batch-all-objects', cmd], + bufsize = -1, + stdout = subprocess.PIPE) + unpacked_size = {} + packed_size = {} + for line in cf.stdout: + try: + sha, objtype, objsize, objdisksize = line.split() + objsize, objdisksize = int(objsize), int(objdisksize) + if objtype == b'blob': + unpacked_size[sha] = objsize + packed_size[sha] = objdisksize + num_blobs += 1 + except ValueError: # pragma: no cover + sys.stderr.write(_("Error: unexpected `git cat-file` output: \"%s\"\n") % line) + if not quiet: + blob_size_progress.show(processed_blobs_msg % num_blobs) + cf.wait() + if not quiet: + blob_size_progress.finish() + return unpacked_size, packed_size + + @staticmethod + def get_file_changes(repo, parent_hash, commit_hash): + """ + Return a FileChanges list with the differences between parent_hash + and commit_hash + """ + file_changes = [] + + cmd = ["git", "diff-tree", "-r", parent_hash, commit_hash] + output = subproc.check_output(cmd, cwd=repo) + for line in output.splitlines(): + fileinfo, path = line.split(b'\t', 1) + if path.startswith(b'"'): + path = PathQuoting.dequote(path) + oldmode, mode, oldhash, newhash, changetype = fileinfo.split() + if changetype == b'D': + file_changes.append(FileChange(b'D', path)) + elif changetype in (b'A', b'M', b'T'): + identifier = BLOB_HASH_TO_NEW_ID.get(newhash, newhash) + file_changes.append(FileChange(b'M', path, identifier, mode)) + else: # pragma: no cover + raise SystemExit("Unknown change type for line {}".format(line)) + + return file_changes + + @staticmethod + def print_my_version(): + with open(__file__, 'br') as f: + contents = f.read() + # If people replaced @@LOCALEDIR@@ string to point at their local + # directory, undo it so we can get original source version. 
+ contents = re.sub(br'\A#\!.*', + br'#!/usr/bin/env python3', contents) + contents = re.sub(br'(\("GIT_TEXTDOMAINDIR"\) or ").*"', + br'\1@@LOCALEDIR@@"', contents) + + cmd = 'git hash-object --stdin'.split() + version = subproc.check_output(cmd, input=contents).strip() + print(decode(version[0:12])) + +class FilteringOptions(object): + default_replace_text = b'***REMOVED***' + class AppendFilter(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + user_path = values + suffix = option_string[len('--path-'):] or 'match' + if suffix.startswith('rename'): + mod_type = 'rename' + match_type = option_string[len('--path-rename-'):] or 'match' + values = values.split(b':') + if len(values) != 2: + raise SystemExit(_("Error: --path-rename expects one colon in its" + " argument: .")) + if values[0] and values[1] and not ( + values[0].endswith(b'/') == values[1].endswith(b'/')): + raise SystemExit(_("Error: With --path-rename, if OLD_NAME and " + "NEW_NAME are both non-empty and either ends " + "with a slash then both must.")) + if any(v.startswith(b'/') for v in values): + raise SystemExit(_("Error: Pathnames cannot begin with a '/'")) + components = values[0].split(b'/') + values[1].split(b'/') + else: + mod_type = 'filter' + match_type = suffix + components = values.split(b'/') + if values.startswith(b'/'): + raise SystemExit(_("Error: Pathnames cannot begin with a '/'")) + for illegal_path in [b'.', b'..']: + if illegal_path in components: + raise SystemExit(_("Error: Invalid path component '%s' found in '%s'") + % (decode(illegal_path), decode(user_path))) + if match_type == 'regex': + values = re.compile(values) + items = getattr(namespace, self.dest, []) or [] + items.append((mod_type, match_type, values)) + if (match_type, mod_type) == ('glob', 'filter'): + if not values.endswith(b'*'): + extension = b'*' if values.endswith(b'/') else b'/*' + items.append((mod_type, match_type, values+extension)) + setattr(namespace, self.dest, items) + + class HelperFilter(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + af = FilteringOptions.AppendFilter(dest='path_changes', + option_strings=None) + dirname = values if values[-1:] == b'/' else values+b'/' + if option_string == '--subdirectory-filter': + af(parser, namespace, dirname, '--path-match') + af(parser, namespace, dirname+b':', '--path-rename') + elif option_string == '--to-subdirectory-filter': + af(parser, namespace, b':'+dirname, '--path-rename') + else: + raise SystemExit(_("Error: HelperFilter given invalid option_string: %s") + % option_string) # pragma: no cover + + class FileWithPathsFilter(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + if not namespace.path_changes: + namespace.path_changes = [] + namespace.path_changes += FilteringOptions.get_paths_from_file(values) + + @staticmethod + def create_arg_parser(): + # Include usage in the summary, so we can put the description first + summary = _('''Rewrite (or analyze) repository history + + git-filter-repo destructively rewrites history (unless --analyze or + --dry-run are given) according to specified rules. It refuses to do any + rewriting unless either run from a clean fresh clone, or --force was + given. + + Basic Usage: + git-filter-repo --analyze + git-filter-repo [FILTER/RENAME/CONTROL OPTIONS] + + See EXAMPLES section for details. 
+ ''').rstrip() + + # Provide a long helpful examples section + example_text = _('''CALLBACKS + + Most callback functions are of the same general format. For a command line + argument like + --foo-callback 'BODY' + + the following code will be compiled and called: + def foo_callback(foo): + BODY + + The exception on callbacks is the --file-info-callback, which will be + discussed further below. + + Given the callback style, we can thus make a simple callback to replace + 'Jon' with 'John' in author/committer/tagger names: + git filter-repo --name-callback 'return name.replace(b"Jon", b"John")' + + To remove all 'Tested-by' tags in commit (or tag) messages: + git filter-repo --message-callback 'return re.sub(br"\\nTested-by:.*", "", message)' + + To remove all .DS_Store files: + git filter-repo --filename-callback 'return None if os.path.basename(filename) == b".DS_Store" else filename' + + Note that if BODY resolves to a filename, then the contents of that file + will be used as the BODY in the callback function. + + The --file-info-callback has a more involved function callback; for it the + following code will be compiled and called: + def file_info_callback(filename, mode, blob_id, value): + BODY + + It is designed to be used in cases where filtering depends on both + filename and contents (and maybe mode). It is called for file changes + other than deletions (since deletions have no file contents to operate + on). This callback is expected to return a tuple of (filename, mode, + blob_id). It can make use of the following functions from the value + instance: + value.get_contents_by_identifier(blob_id) -> contents (bytestring) + value.get_size_by_identifier(blob_id) -> size_of_blob (int) + value.insert_file_with_contents(contents) -> blob_id + value.is_binary(contents) -> bool + value.apply_replace_text(contents) -> new_contents (bytestring) + and can read/write the following data member from the value instance: + value.data (dict) + + The filename can be used for renaming the file similar to + --filename-callback (or None to drop the change), and mode is one + of b'100644', b'100755', b'120000', or b'160000'. + + For more detailed examples and explanations AND caveats, see + https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#CALLBACKS + +EXAMPLES + + To get a bunch of reports mentioning renames that have occurred in + your repo and listing sizes of objects aggregated by any of path, + directory, extension, or blob-id: + git filter-repo --analyze + + (These reports can help you choose how to filter your repo; it can + be useful to re-run this command after filtering to regenerate the + report and verify the changes look correct.) + + To extract the history that touched just 'guides' and 'tools/releases': + git filter-repo --path guides/ --path tools/releases + + To remove foo.zip and bar/baz/zips from every revision in history: + git filter-repo --path foo.zip --path bar/baz/zips/ --invert-paths + + To replace the text 'password' with 'p455w0rd': + git filter-repo --replace-text <(echo "password==>p455w0rd") + + To use the current version of the .mailmap file to update authors, + committers, and taggers throughout history and make it permanent: + git filter-repo --use-mailmap + + To extract the history of 'src/', rename all files to have a new leading + directory 'my-module' (e.g. 
src/foo.java -> my-module/src/foo.java), and + add a 'my-module-' prefix to all tags: + git filter-repo --path src/ --to-subdirectory-filter my-module --tag-rename '':'my-module-' + + For more detailed examples and explanations, see + https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#EXAMPLES''') + + # Create the basic parser + parser = argparse.ArgumentParser(description=summary, + usage = argparse.SUPPRESS, + add_help = False, + epilog = example_text, + formatter_class=argparse.RawDescriptionHelpFormatter) + + analyze = parser.add_argument_group(title=_("Analysis")) + analyze.add_argument('--analyze', action='store_true', + help=_("Analyze repository history and create a report that may be " + "useful in determining what to filter in a subsequent run. " + "Will not modify your repo.")) + analyze.add_argument('--report-dir', + metavar='DIR_OR_FILE', + type=os.fsencode, + dest='report_dir', + help=_("Directory to write report, defaults to GIT_DIR/filter_repo/analysis," + "refuses to run if exists, --force delete existing dir first.")) + + path = parser.add_argument_group(title=_("Filtering based on paths " + "(see also --filename-callback)"), + description=textwrap.dedent(_(""" + These options specify the paths to select. Note that much like git + itself, renames are NOT followed so you may need to specify multiple + paths, e.g. `--path olddir/ --path newdir/` + """[1:]))) + + path.add_argument('--invert-paths', action='store_false', dest='inclusive', + help=_("Invert the selection of files from the specified " + "--path-{match,glob,regex} options below, i.e. only select " + "files matching none of those options.")) + + path.add_argument('--path-match', '--path', metavar='DIR_OR_FILE', + type=os.fsencode, + action=FilteringOptions.AppendFilter, dest='path_changes', + help=_("Exact paths (files or directories) to include in filtered " + "history. Multiple --path options can be specified to get " + "a union of paths.")) + path.add_argument('--path-glob', metavar='GLOB', type=os.fsencode, + action=FilteringOptions.AppendFilter, dest='path_changes', + help=_("Glob of paths to include in filtered history. Multiple " + "--path-glob options can be specified to get a union of " + "paths.")) + path.add_argument('--path-regex', metavar='REGEX', type=os.fsencode, + action=FilteringOptions.AppendFilter, dest='path_changes', + help=_("Regex of paths to include in filtered history. Multiple " + "--path-regex options can be specified to get a union of " + "paths")) + path.add_argument('--use-base-name', action='store_true', + help=_("Match on file base name instead of full path from the top " + "of the repo. Incompatible with --path-rename, and " + "incompatible with matching against directory names.")) + + rename = parser.add_argument_group(title=_("Renaming based on paths " + "(see also --filename-callback)")) + rename.add_argument('--path-rename', '--path-rename-match', + metavar='OLD_NAME:NEW_NAME', dest='path_changes', type=os.fsencode, + action=FilteringOptions.AppendFilter, + help=_("Path to rename; if filename or directory matches OLD_NAME " + "rename to NEW_NAME. Multiple --path-rename options can be " + "specified. 
NOTE: If you combine filtering options with " + "renaming ones, do not rely on a rename argument to select " + "paths; you also need a filter to select them.")) + + helpers = parser.add_argument_group(title=_("Path shortcuts")) + helpers.add_argument('--paths', help=argparse.SUPPRESS, metavar='IGNORE') + helpers.add_argument('--paths-from-file', metavar='FILENAME', + type=os.fsencode, + action=FilteringOptions.FileWithPathsFilter, dest='path_changes', + help=_("Specify several path filtering and renaming directives, one " + "per line. Lines with '==>' in them specify path renames, " + "and lines can begin with 'literal:' (the default), 'glob:', " + "or 'regex:' to specify different matching styles. Blank " + "lines and lines starting with a '#' are ignored.")) + helpers.add_argument('--subdirectory-filter', metavar='DIRECTORY', + action=FilteringOptions.HelperFilter, type=os.fsencode, + help=_("Only look at history that touches the given subdirectory " + "and treat that directory as the project root. Equivalent " + "to using '--path DIRECTORY/ --path-rename DIRECTORY/:'")) + helpers.add_argument('--to-subdirectory-filter', metavar='DIRECTORY', + action=FilteringOptions.HelperFilter, type=os.fsencode, + help=_("Treat the project root as if it were under DIRECTORY. " + "Equivalent to using '--path-rename :DIRECTORY/'")) + + contents = parser.add_argument_group(title=_("Content editing filters " + "(see also --blob-callback)")) + contents.add_argument('--replace-text', metavar='EXPRESSIONS_FILE', + help=_("A file with expressions that, if found, will be replaced. " + "By default, each expression is treated as literal text, " + "but 'regex:' and 'glob:' prefixes are supported. You can " + "end the line with '==>' and some replacement text to " + "choose a replacement choice other than the default of '{}'." + .format(decode(FilteringOptions.default_replace_text)))) + contents.add_argument('--strip-blobs-bigger-than', metavar='SIZE', + dest='max_blob_size', default=0, + help=_("Strip blobs (files) bigger than specified size (e.g. '5M', " + "'2G', etc)")) + contents.add_argument('--strip-blobs-with-ids', metavar='BLOB-ID-FILENAME', + help=_("Read git object ids from each line of the given file, and " + "strip all of them from history")) + + refrename = parser.add_argument_group(title=_("Renaming of refs " + "(see also --refname-callback)")) + refrename.add_argument('--tag-rename', metavar='OLD:NEW', type=os.fsencode, + help=_("Rename tags starting with OLD to start with NEW. For " + "example, --tag-rename foo:bar will rename tag foo-1.2.3 " + "to bar-1.2.3; either OLD or NEW can be empty.")) + + messages = parser.add_argument_group(title=_("Filtering of commit messages " + "(see also --message-callback)")) + messages.add_argument('--replace-message', metavar='EXPRESSIONS_FILE', + help=_("A file with expressions that, if found in commit or tag " + "messages, will be replaced. This file uses the same syntax " + "as --replace-text.")) + messages.add_argument('--preserve-commit-hashes', action='store_true', + help=_("By default, since commits are rewritten and thus gain new " + "hashes, references to old commit hashes in commit messages " + "are replaced with new commit hashes (abbreviated to the same " + "length as the old reference). Use this flag to turn off " + "updating commit hashes in commit messages.")) + messages.add_argument('--preserve-commit-encoding', action='store_true', + help=_("Do not reencode commit messages into UTF-8. 
By default, if " + "the commit object specifies an encoding for the commit " + "message, the message is re-encoded into UTF-8.")) + + people = parser.add_argument_group(title=_("Filtering of names & emails " + "(see also --name-callback " + "and --email-callback)")) + people.add_argument('--mailmap', dest='mailmap', metavar='FILENAME', + type=os.fsencode, + help=_("Use specified mailmap file (see git-shortlog(1) for " + "details on the format) when rewriting author, committer, " + "and tagger names and emails. If the specified file is " + "part of git history, historical versions of the file will " + "be ignored; only the current contents are consulted.")) + people.add_argument('--use-mailmap', dest='mailmap', + action='store_const', const=b'.mailmap', + help=_("Same as: '--mailmap .mailmap' ")) + + parents = parser.add_argument_group(title=_("Parent rewriting")) + parents.add_argument('--replace-refs', default=None, + choices=['delete-no-add', 'delete-and-add', + 'update-no-add', 'update-or-add', + 'update-and-add', 'old-default'], + help=_("How to handle replace refs (see git-replace(1)). Replace " + "refs can be added during the history rewrite as a way to " + "allow users to pass old commit IDs (from before " + "git-filter-repo was run) to git commands and have git know " + "how to translate those old commit IDs to the new " + "(post-rewrite) commit IDs. Also, replace refs that existed " + "before the rewrite can either be deleted or updated. The " + "choices to pass to --replace-refs thus need to specify both " + "what to do with existing refs and what to do with commit " + "rewrites. Thus 'update-and-add' means to update existing " + "replace refs, and for any commit rewrite (even if already " + "pointed at by a replace ref) add a new refs/replace/ reference " + "to map from the old commit ID to the new commit ID. The " + "default is update-no-add, meaning update existing replace refs " + "but do not add any new ones. There is also a special " + "'old-default' option for picking the default used in versions " + "prior to git-filter-repo-2.45, namely 'update-and-add' upon " + "the first run of git-filter-repo in a repository and " + "'update-or-add' if running git-filter-repo again on a " + "repository.")) + parents.add_argument('--prune-empty', default='auto', + choices=['always', 'auto', 'never'], + help=_("Whether to prune empty commits. 'auto' (the default) means " + "only prune commits which become empty (not commits which were " + "empty in the original repo, unless their parent was pruned). " + "When the parent of a commit is pruned, the first non-pruned " + "ancestor becomes the new parent.")) + parents.add_argument('--prune-degenerate', default='auto', + choices=['always', 'auto', 'never'], + help=_("Since merge commits are needed for history topology, they " + "are typically exempt from pruning. However, they can become " + "degenerate with the pruning of other commits (having fewer " + "than two parents, having one commit serve as both parents, or " + "having one parent as the ancestor of the other.) If such " + "merge commits have no file changes, they can be pruned. The " + "default ('auto') is to only prune empty merge commits which " + "become degenerate (not which started as such).")) + parents.add_argument('--no-ff', action='store_true', + help=_("Even if the first parent is or becomes an ancestor of another " + "parent, do not prune it. 
This modifies how " + "--prune-degenerate behaves, and may be useful in projects that " + "always use merge --no-ff.")) + + callback = parser.add_argument_group(title=_("Generic callback code snippets")) + callback.add_argument('--filename-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing filenames; see CALLBACKS " + "sections below.")) + callback.add_argument('--file-info-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing file and metadata; see " + "CALLBACKS sections below.")) + callback.add_argument('--message-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing messages (both commit " + "messages and tag messages); see CALLBACKS section below.")) + callback.add_argument('--name-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing names of people; see " + "CALLBACKS section below.")) + callback.add_argument('--email-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing email addresses; see " + "CALLBACKS section below.")) + callback.add_argument('--refname-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing refnames; see CALLBACKS " + "section below.")) + + callback.add_argument('--blob-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing blob objects; see " + "CALLBACKS section below.")) + callback.add_argument('--commit-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing commit objects; see " + "CALLBACKS section below.")) + callback.add_argument('--tag-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing tag objects. Note that " + "lightweight tags have no tag object and are thus not " + "handled by this callback. See CALLBACKS section below.")) + callback.add_argument('--reset-callback', metavar="FUNCTION_BODY_OR_FILE", + help=_("Python code body for processing reset objects; see " + "CALLBACKS section below.")) + + sdr = parser.add_argument_group(title=_("Sensitive Data Removal Handling")) + sdr.add_argument('--sensitive-data-removal', '--sdr', action='store_true', + help=_("This rewrite is intended to remove sensitive data from a " + "repository. Gather extra information from the rewrite needed " + "to provide additional instructions on how to clean up other " + "copies.")) + sdr.add_argument('--no-fetch', action='store_true', + help=_("By default, --sensitive-data-removal will trigger a " + "mirror-like fetch of all refs from origin, discarding local " + "changes, but ensuring that _all_ fetchable refs that hold on " + "to the sensitive data are rewritten. This flag removes that " + "fetch, risking that other refs continue holding on to the " + "sensitive data. This option is implied by --partial or any " + "flag that implies --partial.")) + + desc = _( + "Specifying alternate source or target locations implies --partial,\n" + "except that the normal default for --replace-refs is used. 
However,\n" + "unlike normal uses of --partial, this doesn't risk mixing old and new\n" + "history since the old and new histories are in different repositories.") + location = parser.add_argument_group(title=_("Location to filter from/to"), + description=desc) + location.add_argument('--source', type=os.fsencode, + help=_("Git repository to read from")) + location.add_argument('--target', type=os.fsencode, + help=_("Git repository to overwrite with filtered history")) + + order = parser.add_argument_group(title=_("Ordering of commits")) + order.add_argument('--date-order', action='store_true', + help=_("Processes commits in commit timestamp order.")) + + misc = parser.add_argument_group(title=_("Miscellaneous options")) + misc.add_argument('--help', '-h', action='store_true', + help=_("Show this help message and exit.")) + misc.add_argument('--version', action='store_true', + help=_("Display filter-repo's version and exit.")) + misc.add_argument('--proceed', action='store_true', + help=_("Avoid triggering the no-arguments-specified check.")) + misc.add_argument('--force', '-f', action='store_true', + help=_("Rewrite repository history even if the current repo does not " + "look like a fresh clone. History rewriting is irreversible " + "(and includes immediate pruning of reflogs and old objects), " + "so be cautious about using this flag.")) + misc.add_argument('--partial', action='store_true', + help=_("Do a partial history rewrite, resulting in the mixture of " + "old and new history. This disables rewriting " + "refs/remotes/origin/* to refs/heads/*, disables removing " + "of the 'origin' remote, disables removing unexported refs, " + "disables expiring the reflog, and disables the automatic " + "post-filter gc. Also, this modifies --tag-rename and " + "--refname-callback options such that instead of replacing " + "old refs with new refnames, it will instead create new " + "refs and keep the old ones around. Use with caution.")) + misc.add_argument('--no-gc', action='store_true', + help=_("Do not run 'git gc' after filtering.")) + # WARNING: --refs presents a problem with become-degenerate pruning: + # * Excluding a commit also excludes its ancestors so when some other + # commit has an excluded ancestor as a parent we have no way of + # knowing what it is an ancestor of without doing a special + # full-graph walk. + misc.add_argument('--refs', nargs='+', + help=_("Limit history rewriting to the specified refs. Implies " + "--partial. In addition to the normal caveats of --partial " + "(mixing old and new history, no automatic remapping of " + "refs/remotes/origin/* to refs/heads/*, etc.), this also may " + "cause problems for pruning of degenerate empty merge " + "commits when negative revisions are specified.")) + + misc.add_argument('--dry-run', action='store_true', + help=_("Do not change the repository. Run `git fast-export` and " + "filter its output, and save both the original and the " + "filtered version for comparison. This also disables " + "rewriting commit messages due to not knowing new commit " + "IDs and disables filtering of some empty commits due to " + "inability to query the fast-import backend." )) + misc.add_argument('--debug', action='store_true', + help=_("Print additional information about operations being " + "performed and commands being run. 
When used together " + "with --dry-run, also show extra information about what " + "would be run.")) + # WARNING: --state-branch has some problems: + # * It does not work well with manually inserted objects (user creating + # Blob() or Commit() or Tag() objects and calling + # RepoFilter.insert(obj) on them). + # * It does not work well with multiple source or multiple target repos + # * It doesn't work so well with pruning become-empty commits (though + # --refs doesn't work so well with it either) + # These are probably fixable, given some work (e.g. re-importing the + # graph at the beginning to get the AncestryGraph right, doing our own + # export of marks instead of using fast-export --export-marks, etc.), but + # for now just hide the option. + misc.add_argument('--state-branch', + #help=_("Enable incremental filtering by saving the mapping of old " + # "to new objects to the specified branch upon exit, and" + # "loading that mapping from that branch (if it exists) " + # "upon startup.")) + help=argparse.SUPPRESS) + misc.add_argument('--stdin', action='store_true', + help=_("Instead of running `git fast-export` and filtering its " + "output, filter the fast-export stream from stdin. The " + "stdin must be in the expected input format (e.g. it needs " + "to include original-oid directives).")) + misc.add_argument('--quiet', action='store_true', + help=_("Pass --quiet to other git commands called")) + return parser + + @staticmethod + def sanity_check_args(args): + if args.analyze and args.path_changes: + raise SystemExit(_("Error: --analyze is incompatible with --path* flags; " + "it's a read-only operation.")) + if args.analyze and args.stdin: + raise SystemExit(_("Error: --analyze is incompatible with --stdin.")) + # If no path_changes are found, initialize with empty list but mark as + # not inclusive so that all files match + if args.path_changes == None: + args.path_changes = [] + args.inclusive = False + else: + # Similarly, if we have no filtering paths, then no path should be + # filtered out. Based on how newname() works, the easiest way to + # achieve that is setting args.inclusive to False. + if not any(x[0] == 'filter' for x in args.path_changes): + args.inclusive = False + # Also check for incompatible --use-base-name and --path-rename flags. 
+ if args.use_base_name: + if any(x[0] == 'rename' for x in args.path_changes): + raise SystemExit(_("Error: --use-base-name and --path-rename are " + "incompatible.")) + # Also throw some sanity checks on git version here; + # PERF: remove these checks once new enough git versions are common + p = subproc.Popen('git fast-export -h'.split(), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + output = p.stdout.read() + if b'--anonymize-map' not in output: # pragma: no cover + global date_format_permissive + date_format_permissive = False + if not any(x in output for x in [b'--mark-tags',b'--[no-]mark-tags']): # pragma: no cover + global write_marks + write_marks = False + if args.state_branch: + # We need a version of git-fast-export with --mark-tags + raise SystemExit(_("Error: need git >= 2.24.0")) + if not any(x in output for x in [b'--reencode', b'--[no-]reencode']): # pragma: no cover + if args.preserve_commit_encoding: + # We need a version of git-fast-export with --reencode + raise SystemExit(_("Error: need git >= 2.23.0")) + else: + # Set args.preserve_commit_encoding to None which we'll check for later + # to avoid passing --reencode=yes to fast-export (that option was the + # default prior to git-2.23) + args.preserve_commit_encoding = None + # If we don't have fast-exoprt --reencode, we may also be missing + # diff-tree --combined-all-paths, which is even more important... + p = subproc.Popen('git diff-tree -h'.split(), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + output = p.stdout.read() + if b'--combined-all-paths' not in output: + # We need a version of git-diff-tree with --combined-all-paths + raise SystemExit(_("Error: need git >= 2.22.0")) + if args.sensitive_data_removal: + p = subproc.Popen('git cat-file -h'.split(), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + output = p.stdout.read() + if b"--batch-command" not in output: # pragma: no cover + raise SystemExit(_("Error: need git >= 2.36.0")) + # End of sanity checks on git version + if args.max_blob_size: + suffix = args.max_blob_size[-1] + if suffix not in '1234567890': + mult = {'K': 1024, 'M': 1024**2, 'G': 1024**3} + if suffix not in mult: + raise SystemExit(_("Error: could not parse --strip-blobs-bigger-than" + " argument %s") + % args.max_blob_size) + args.max_blob_size = int(args.max_blob_size[0:-1]) * mult[suffix] + else: + args.max_blob_size = int(args.max_blob_size) + if args.file_info_callback and ( + args.stdin or args.blob_callback or args.filename_callback): + raise SystemExit(_("Error: --file-info-callback is incompatible with " + "--stdin, --blob-callback,\nand --filename-callback.")) + + @staticmethod + def get_replace_text(filename): + replace_literals = [] + replace_regexes = [] + with open(filename, 'br') as f: + for line in f: + line = line.rstrip(b'\r\n') + + # Determine the replacement + replacement = FilteringOptions.default_replace_text + if b'==>' in line: + line, replacement = line.rsplit(b'==>', 1) + + # See if we need to match via regex + regex = None + if line.startswith(b'regex:'): + regex = line[6:] + elif line.startswith(b'glob:'): + regex = glob_to_regex(line[5:]) + if regex: + replace_regexes.append((re.compile(regex), replacement)) + else: + # Otherwise, find the literal we need to replace + if line.startswith(b'literal:'): + line = line[8:] + if not line: + continue + replace_literals.append((line, replacement)) + return {'literals': replace_literals, 'regexes': replace_regexes} + + @staticmethod + def get_paths_from_file(filename): + new_path_changes = [] + with 
open(filename, 'br') as f: + for line in f: + line = line.rstrip(b'\r\n') + + # Skip blank lines + if not line: + continue + # Skip comment lines + if line.startswith(b'#'): + continue + + # Determine the replacement + match_type, repl = 'literal', None + if b'==>' in line: + line, repl = line.rsplit(b'==>', 1) + + # See if we need to match via regex + match_type = 'match' # a.k.a. 'literal' + if line.startswith(b'regex:'): + match_type = 'regex' + match = re.compile(line[6:]) + elif line.startswith(b'glob:'): + match_type = 'glob' + match = line[5:] + if repl: + raise SystemExit(_("Error: In %s, 'glob:' and '==>' are incompatible (renaming globs makes no sense)" % decode(filename))) + else: + if line.startswith(b'literal:'): + match = line[8:] + else: + match = line + if repl is not None: + if match and repl and match.endswith(b'/') != repl.endswith(b'/'): + raise SystemExit(_("Error: When rename directories, if OLDNAME " + "and NEW_NAME are both non-empty and either " + "ends with a slash then both must.")) + + # Record the filter or rename + if repl is not None: + new_path_changes.append(['rename', match_type, (match, repl)]) + else: + new_path_changes.append(['filter', match_type, match]) + if match_type == 'glob' and not match.endswith(b'*'): + extension = b'*' if match.endswith(b'/') else b'/*' + new_path_changes.append(['filter', match_type, match+extension]) + return new_path_changes + + @staticmethod + def default_options(): + return FilteringOptions.parse_args([], error_on_empty = False) + + @staticmethod + def parse_args(input_args, error_on_empty = True): + parser = FilteringOptions.create_arg_parser() + if not input_args and error_on_empty: + parser.print_usage() + raise SystemExit(_("No arguments specified.")) + args = parser.parse_args(input_args) + if args.help: + parser.print_help() + raise SystemExit() + if args.paths: + raise SystemExit("Error: Option `--paths` unrecognized; did you mean --path or --paths-from-file?") + if args.version: + GitUtils.print_my_version() + raise SystemExit() + FilteringOptions.sanity_check_args(args) + if args.mailmap: + args.mailmap = MailmapInfo(args.mailmap) + if args.replace_text: + args.replace_text = FilteringOptions.get_replace_text(args.replace_text) + if args.replace_message: + args.replace_message = FilteringOptions.get_replace_text(args.replace_message) + if args.strip_blobs_with_ids: + with open(args.strip_blobs_with_ids, 'br') as f: + args.strip_blobs_with_ids = set(f.read().split()) + else: + args.strip_blobs_with_ids = set() + if (args.partial or args.refs) and not args.replace_refs: + args.replace_refs = 'update-no-add' + args.repack = not (args.partial or args.refs or args.no_gc) + if args.refs or args.source or args.target: + args.partial = True + if args.partial: + args.no_fetch = True + if not args.refs: + args.refs = ['--all'] + return args + +class RepoAnalyze(object): + + # First, several helper functions for analyze_commit() + + @staticmethod + def equiv_class(stats, filename): + return stats['equivalence'].get(filename, (filename,)) + + @staticmethod + def setup_equivalence_for_rename(stats, oldname, newname): + # if A is renamed to B and B is renamed to C, then the user thinks of + # A, B, and C as all being different names for the same 'file'. We record + # this as an equivalence class: + # stats['equivalence'][name] = (A,B,C) + # for name being each of A, B, and C. 
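+ # For example, if utils.py is renamed to helpers.py and later to
+ # lib/helpers.py, then each of those three names maps to the tuple
+ # (b'utils.py', b'helpers.py', b'lib/helpers.py').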
+ old_tuple = stats['equivalence'].get(oldname, ()) + if newname in old_tuple: + return + elif old_tuple: + new_tuple = tuple(list(old_tuple)+[newname]) + else: + new_tuple = (oldname, newname) + for f in new_tuple: + stats['equivalence'][f] = new_tuple + + @staticmethod + def setup_or_update_rename_history(stats, commit, oldname, newname): + rename_commits = stats['rename_history'].get(oldname, set()) + rename_commits.add(commit) + stats['rename_history'][oldname] = rename_commits + + @staticmethod + def handle_renames(stats, commit, change_types, filenames): + for index, change_type in enumerate(change_types): + if change_type == ord(b'R'): + oldname, newname = filenames[index], filenames[-1] + RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname) + RepoAnalyze.setup_or_update_rename_history(stats, commit, + oldname, newname) + + @staticmethod + def handle_file(stats, graph, commit, modes, shas, filenames): + mode, sha, filename = modes[-1], shas[-1], filenames[-1] + + # Figure out kind of deletions to undo for this file, and update lists + # of all-names-by-sha and all-filenames + delmode = 'tree_deletions' + if mode != b'040000': + delmode = 'file_deletions' + stats['names'][sha].add(filename) + stats['allnames'].add(filename) + + # If the file (or equivalence class of files) was recorded as deleted, + # clearly it isn't anymore + equiv = RepoAnalyze.equiv_class(stats, filename) + for f in equiv: + stats[delmode].pop(f, None) + + # If we get a modify/add for a path that was renamed, we may need to break + # the equivalence class. However, if the modify/add was on a branch that + # doesn't have the rename in its history, we are still okay. + need_to_break_equivalence = False + if equiv[-1] != filename: + for rename_commit in stats['rename_history'][filename]: + if graph.is_ancestor(rename_commit, commit): + need_to_break_equivalence = True + + if need_to_break_equivalence: + for f in equiv: + if f in stats['equivalence']: + del stats['equivalence'][f] + + @staticmethod + def analyze_commit(stats, graph, commit, parents, date, file_changes): + graph.add_commit_and_parents(commit, parents) + for change in file_changes: + modes, shas, change_types, filenames = change + if len(parents) == 1 and change_types.startswith(b'R'): + change_types = b'R' # remove the rename score; we don't care + if modes[-1] == b'160000': + continue + elif modes[-1] == b'000000': + # Track when files/directories are deleted + for f in RepoAnalyze.equiv_class(stats, filenames[-1]): + if any(x == b'040000' for x in modes[0:-1]): + stats['tree_deletions'][f] = date + else: + stats['file_deletions'][f] = date + elif change_types.strip(b'AMT') == b'': + RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) + elif modes[-1] == b'040000' and change_types.strip(b'RAM') == b'': + RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) + elif change_types.strip(b'RAMT') == b'': + RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) + RepoAnalyze.handle_renames(stats, commit, change_types, filenames) + else: + raise SystemExit(_("Unhandled change type(s): %(change_type)s " + "(in commit %(commit)s)") + % ({'change_type': change_types, 'commit': commit}) + ) # pragma: no cover + + @staticmethod + def gather_data(args): + unpacked_size, packed_size = GitUtils.get_blob_sizes() + stats = {'names': collections.defaultdict(set), + 'allnames' : set(), + 'file_deletions': {}, + 'tree_deletions': {}, + 'equivalence': {}, + 'rename_history': collections.defaultdict(set), + 
'unpacked_size': unpacked_size, + 'packed_size': packed_size, + 'num_commits': 0} + + # Setup the rev-list/diff-tree process + processed_commits_msg = _("Processed %d commits") + commit_parse_progress = ProgressWriter() + num_commits = 0 + cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) + + ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' + + ' --date=short -M -t -c --raw --combined-all-paths') + dtp = subproc.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE) + f = dtp.stdout + line = f.readline() + if not line: + raise SystemExit(_("Nothing to analyze; repository is empty.")) + cont = bool(line) + graph = AncestryGraph() + while cont: + commit = line.rstrip() + parents = f.readline().split() + date = f.readline().rstrip() + + # We expect a blank line next; if we get a non-blank line then + # this commit modified no files and we need to move on to the next. + # If there is no line, we've reached end-of-input. + line = f.readline() + if not line: + cont = False + line = line.rstrip() + + # If we haven't reached end of input, and we got a blank line meaning + # a commit that has modified files, then get the file changes associated + # with this commit. + file_changes = [] + if cont and not line: + cont = False + for line in f: + if not line.startswith(b':'): + cont = True + break + n = 1+max(1, len(parents)) + assert line.startswith(b':'*(n-1)) + relevant = line[n-1:-1] + splits = relevant.split(None, n) + modes = splits[0:n] + splits = splits[n].split(None, n) + shas = splits[0:n] + splits = splits[n].split(b'\t') + change_types = splits[0] + filenames = [PathQuoting.dequote(x) for x in splits[1:]] + file_changes.append([modes, shas, change_types, filenames]) + + # If someone is trying to analyze a subset of the history, make sure + # to avoid dying on commits with parents that we haven't seen before + if args.refs: + graph.record_external_commits([p for p in parents + if not p in graph.value]) + + # Analyze this commit and update progress + RepoAnalyze.analyze_commit(stats, graph, commit, parents, date, + file_changes) + num_commits += 1 + commit_parse_progress.show(processed_commits_msg % num_commits) + + # Show the final commits processed message and record the number of commits + commit_parse_progress.finish() + stats['num_commits'] = num_commits + + # Close the output, ensure rev-list|diff-tree pipeline completed successfully + dtp.stdout.close() + if dtp.wait(): + raise SystemExit(_("Error: rev-list|diff-tree pipeline failed; see above.")) # pragma: no cover + + return stats + + @staticmethod + def write_report(reportdir, stats): + def datestr(datetimestr): + return datetimestr if datetimestr else _('').encode() + + def dirnames(path): + while True: + path = os.path.dirname(path) + yield path + if path == b'': + break + + # Compute aggregate size information for paths, extensions, and dirs + total_size = {'packed': 0, 'unpacked': 0} + path_size = {'packed': collections.defaultdict(int), + 'unpacked': collections.defaultdict(int)} + ext_size = {'packed': collections.defaultdict(int), + 'unpacked': collections.defaultdict(int)} + dir_size = {'packed': collections.defaultdict(int), + 'unpacked': collections.defaultdict(int)} + for sha in stats['names']: + size = {'packed': stats['packed_size'][sha], + 'unpacked': stats['unpacked_size'][sha]} + for which in ('packed', 'unpacked'): + for name in stats['names'][sha]: + total_size[which] += size[which] + path_size[which][name] += size[which] + basename, ext = os.path.splitext(name) + 
ext_size[which][ext] += size[which] + for dirname in dirnames(name): + dir_size[which][dirname] += size[which] + + # Determine if and when extensions and directories were deleted + ext_deleted_data = {} + for name in stats['allnames']: + when = stats['file_deletions'].get(name, None) + + # Update the extension + basename, ext = os.path.splitext(name) + if when is None: + ext_deleted_data[ext] = None + elif ext in ext_deleted_data: + if ext_deleted_data[ext] is not None: + ext_deleted_data[ext] = max(ext_deleted_data[ext], when) + else: + ext_deleted_data[ext] = when + + dir_deleted_data = {} + for name in dir_size['packed']: + dir_deleted_data[name] = stats['tree_deletions'].get(name, None) + + with open(os.path.join(reportdir, b"README"), 'bw') as f: + # Give a basic overview of this file + f.write(b"== %s ==\n" % _("Overall Statistics").encode()) + f.write((" %s: %d\n" % (_("Number of commits"), + stats['num_commits'])).encode()) + f.write((" %s: %d\n" % (_("Number of filenames"), + len(path_size['packed']))).encode()) + f.write((" %s: %d\n" % (_("Number of directories"), + len(dir_size['packed']))).encode()) + f.write((" %s: %d\n" % (_("Number of file extensions"), + len(ext_size['packed']))).encode()) + f.write(b"\n") + f.write((" %s: %d\n" % (_("Total unpacked size (bytes)"), + total_size['unpacked'])).encode()) + f.write((" %s: %d\n" % (_("Total packed size (bytes)"), + total_size['packed'])).encode()) + f.write(b"\n") + + # Mention issues with the report + f.write(("== %s ==\n" % _("Caveats")).encode()) + f.write(("=== %s ===\n" % _("Sizes")).encode()) + f.write(textwrap.dedent(_(""" + Packed size represents what size your repository would be if no + trees, commits, tags, or other metadata were included (though it may + fail to represent de-duplication; see below). It also represents the + current packing, which may be suboptimal if you haven't gc'ed for a + while. + + Unpacked size represents what size your repository would be if no + trees, commits, tags, or other metadata were included AND if no + files were packed; i.e., without delta-ing or compression. + + Both unpacked and packed sizes can be slightly misleading. Deleting + a blob from history not save as much space as the unpacked size, + because it is obviously normally stored in packed form. Also, + deleting a blob from history may not save as much space as its packed + size either, because another blob could be stored as a delta against + that blob, so when you remove one blob another blob's packed size may + grow. + + Also, the sum of the packed sizes can add up to more than the + repository size; if the same contents appeared in the repository in + multiple places, git will automatically de-dupe and store only one + copy, while the way sizes are added in this analysis adds the size + for each file path that has those contents. Further, if a file is + ever reverted to a previous version's contents, the previous + version's size will be counted multiple times in this analysis, even + though git will only store it once. + """)[1:]).encode()) + f.write(b"\n") + f.write(("=== %s ===\n" % _("Deletions")).encode()) + f.write(textwrap.dedent(_(""" + Whether a file is deleted is not a binary quality, since it can be + deleted on some branches but still exist in others. Also, it might + exist in an old tag, but have been deleted in versions newer than + that. 
More thorough tracking could be done, including looking at + merge commits where one side of history deleted and the other modified, + in order to give a more holistic picture of deletions. However, that + algorithm would not only be more complex to implement, it'd also be + quite difficult to present and interpret by users. Since --analyze + is just about getting a high-level rough picture of history, it instead + implements the simplistic rule that is good enough for 98% of cases: + A file is marked as deleted if the last commit in the fast-export + stream that mentions the file lists it as deleted. + This makes it dependent on topological ordering, but generally gives + the "right" answer. + """)[1:]).encode()) + f.write(b"\n") + f.write(("=== %s ===\n" % _("Renames")).encode()) + f.write(textwrap.dedent(_(""" + Renames share the same non-binary nature that deletions do, plus + additional challenges: + * If the renamed file is renamed again, instead of just two names for + a path you can have three or more. + * Rename pairs of the form (oldname, newname) that we consider to be + different names of the "same file" might only be valid over certain + commit ranges. For example, if a new commit reintroduces a file + named oldname, then new versions of oldname aren't the "same file" + anymore. We could try to portray this to the user, but it's easier + for the user to just break the pairing and only report unbroken + rename pairings to the user. + * The ability for users to rename files differently in different + branches means that our chains of renames will not necessarily be + linear but may branch out. + """)[1:]).encode()) + f.write(b"\n") + + # Equivalence classes for names, so if folks only want to keep a + # certain set of paths, they know the old names they want to include + # too. 
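+ # Each group in renames.txt lists the oldest name first, with the newer
+ # names it became on indented continuation lines, e.g.:
+ #   src/util.c ->
+ #     src/helpers.c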
+ with open(os.path.join(reportdir, b"renames.txt"), 'bw') as f: + seen = set() + for pathname,equiv_group in sorted(stats['equivalence'].items(), + key=lambda x:(x[1], x[0])): + if equiv_group in seen: + continue + seen.add(equiv_group) + f.write(("{} ->\n ".format(decode(equiv_group[0])) + + "\n ".join(decode(x) for x in equiv_group[1:]) + + "\n").encode()) + + # List directories in reverse sorted order of unpacked size + with open(os.path.join(reportdir, b"directories-deleted-sizes.txt"), 'bw') as f: + msg = "=== %s ===\n" % _("Deleted directories by reverse size") + f.write(msg.encode()) + msg = _("Format: unpacked size, packed size, date deleted, directory name\n") + f.write(msg.encode()) + for dirname, size in sorted(dir_size['packed'].items(), + key=lambda x:(x[1],x[0]), reverse=True): + if (dir_deleted_data[dirname]): + f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname], + size, + datestr(dir_deleted_data[dirname]), + dirname or _('').encode())) + + with open(os.path.join(reportdir, b"directories-all-sizes.txt"), 'bw') as f: + f.write(("=== %s ===\n" % _("All directories by reverse size")).encode()) + msg = _("Format: unpacked size, packed size, date deleted, directory name\n") + f.write(msg.encode()) + for dirname, size in sorted(dir_size['packed'].items(), + key=lambda x:(x[1],x[0]), reverse=True): + f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname], + size, + datestr(dir_deleted_data[dirname]), + dirname or _("").encode())) + + # List extensions in reverse sorted order of unpacked size + with open(os.path.join(reportdir, b"extensions-deleted-sizes.txt"), 'bw') as f: + msg = "=== %s ===\n" % _("Deleted extensions by reverse size") + f.write(msg.encode()) + msg = _("Format: unpacked size, packed size, date deleted, extension name\n") + f.write(msg.encode()) + for extname, size in sorted(ext_size['packed'].items(), + key=lambda x:(x[1],x[0]), reverse=True): + if (ext_deleted_data[extname]): + f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname], + size, + datestr(ext_deleted_data[extname]), + extname or _('').encode())) + + with open(os.path.join(reportdir, b"extensions-all-sizes.txt"), 'bw') as f: + f.write(("=== %s ===\n" % _("All extensions by reverse size")).encode()) + msg = _("Format: unpacked size, packed size, date deleted, extension name\n") + f.write(msg.encode()) + for extname, size in sorted(ext_size['packed'].items(), + key=lambda x:(x[1],x[0]), reverse=True): + f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname], + size, + datestr(ext_deleted_data[extname]), + extname or _('').encode())) + + # List files in reverse sorted order of unpacked size + with open(os.path.join(reportdir, b"path-deleted-sizes.txt"), 'bw') as f: + msg = "=== %s ===\n" % _("Deleted paths by reverse accumulated size") + f.write(msg.encode()) + msg = _("Format: unpacked size, packed size, date deleted, path name(s)\n") + f.write(msg.encode()) + for pathname, size in sorted(path_size['packed'].items(), + key=lambda x:(x[1],x[0]), reverse=True): + when = stats['file_deletions'].get(pathname, None) + if when: + f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname], + size, + datestr(when), + pathname)) + + with open(os.path.join(reportdir, b"path-all-sizes.txt"), 'bw') as f: + msg = "=== %s ===\n" % _("All paths by reverse accumulated size") + f.write(msg.encode()) + msg = _("Format: unpacked size, packed size, date deleted, path name\n") + f.write(msg.encode()) + for pathname, size in sorted(path_size['packed'].items(), + 
key=lambda x:(x[1],x[0]), reverse=True): + when = stats['file_deletions'].get(pathname, None) + f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname], + size, + datestr(when), + pathname)) + + # List of filenames and sizes in descending order + with open(os.path.join(reportdir, b"blob-shas-and-paths.txt"), 'bw') as f: + f.write(("=== %s ===\n" % _("Files by sha and associated pathnames in reverse size")).encode()) + f.write(_("Format: sha, unpacked size, packed size, filename(s) object stored as\n").encode()) + for sha, size in sorted(stats['packed_size'].items(), + key=lambda x:(x[1],x[0]), reverse=True): + if sha not in stats['names']: + # Some objects in the repository might not be referenced, or not + # referenced by the branches/tags the user cares about; skip them. + continue + names_with_sha = stats['names'][sha] + if len(names_with_sha) == 1: + names_with_sha = names_with_sha.pop() + else: + names_with_sha = b'[' + b', '.join(sorted(names_with_sha)) + b']' + f.write(b" %s %10d %10d %s\n" % (sha, + stats['unpacked_size'][sha], + size, + names_with_sha)) + + @staticmethod + def run(args): + if args.report_dir: + reportdir = args.report_dir + else: + git_dir = GitUtils.determine_git_dir(b'.') + + # Create the report directory as necessary + results_tmp_dir = os.path.join(git_dir, b'filter-repo') + if not os.path.isdir(results_tmp_dir): + os.mkdir(results_tmp_dir) + reportdir = os.path.join(results_tmp_dir, b"analysis") + + if os.path.isdir(reportdir): + if args.force: + sys.stdout.write(_("Warning: Removing recursively: \"%s\"\n") % decode(reportdir)) + shutil.rmtree(reportdir) + else: + sys.stdout.write(_("Error: dir already exists (use --force to delete): \"%s\"\n") % decode(reportdir)) + sys.exit(1) + + os.mkdir(reportdir) + + # Gather the data we need + stats = RepoAnalyze.gather_data(args) + + # Write the reports + sys.stdout.write(_("Writing reports to %s...") % decode(reportdir)) + sys.stdout.flush() + RepoAnalyze.write_report(reportdir, stats) + sys.stdout.write(_("done.\n")) + +class FileInfoValueHelper: + def __init__(self, replace_text, insert_blob_func, source_working_dir): + self.data = {} + self._replace_text = replace_text + self._insert_blob_func = insert_blob_func + cmd = ['git', 'cat-file', '--batch-command'] + self._cat_file_process = subproc.Popen(cmd, + stdin = subprocess.PIPE, + stdout = subprocess.PIPE, + cwd = source_working_dir) + + def finalize(self): + self._cat_file_process.stdin.close() + self._cat_file_process.wait() + + def get_contents_by_identifier(self, blobhash): + self._cat_file_process.stdin.write(b'contents '+blobhash+b'\n') + self._cat_file_process.stdin.flush() + line = self._cat_file_process.stdout.readline() + try: + (oid, oidtype, size) = line.split() + except ValueError: + assert(line == blobhash+b" missing\n") + return None + size = int(size) # Convert e.g. b'6283' to 6283 + assert(oidtype == b'blob') + contents_plus_newline = self._cat_file_process.stdout.read(size+1) + return contents_plus_newline[:-1] # return all but the newline + + def get_size_by_identifier(self, blobhash): + self._cat_file_process.stdin.write(b'info '+blobhash+b'\n') + self._cat_file_process.stdin.flush() + line = self._cat_file_process.stdout.readline() + (oid, oidtype, size) = line.split() + size = int(size) # Convert e.g. 
b'6283' to 6283 + assert(oidtype == b'blob') + return size + + def insert_file_with_contents(self, contents): + blob = Blob(contents) + self._insert_blob_func(blob) + return blob.id + + def is_binary(self, contents): + return b"\0" in contents[0:8192] + + def apply_replace_text(self, contents): + new_contents = contents + for literal, replacement in self._replace_text['literals']: + new_contents = new_contents.replace(literal, replacement) + for regex, replacement in self._replace_text['regexes']: + new_contents = regex.sub(replacement, new_contents) + return new_contents + +class LFSObjectTracker: + class LFSObjs: + def __init__(self): + self.id_to_object_map = {} + self.objects = set() + + def __init__(self, file_info, check_sources, check_targets): + self.source_objects = LFSObjectTracker.LFSObjs() + self.target_objects = LFSObjectTracker.LFSObjs() + self.hash_to_object_map = {} + self.file_info = file_info + self.check_sources = check_sources + self.check_targets = check_targets + self.objects_orphaned = False + + def _get_lfs_values(self, contents): + values = {} + if len(contents) > 1024: + return {} + for line in contents.splitlines(): + try: + (key, value) = line.split(b' ', 1) + except ValueError: + return {} + if not values and key != b'version': + return values + values[key] = value + return values + + def check_blob_data(self, contents, fast_export_id, source): + if source and not self.check_sources: + return + mymap = self.source_objects if source else self.target_objects + lfs_object_id = self._get_lfs_values(contents).get(b'oid') + if lfs_object_id: + mymap.id_to_object_map[fast_export_id] = lfs_object_id + + def check_file_change_data(self, git_id, source): + if source and not self.check_sources: + return + mymap = self.source_objects if source else self.target_objects + if isinstance(git_id, int): + lfs_object_id = mymap.id_to_object_map.get(git_id) + if lfs_object_id: + mymap.objects.add(lfs_object_id) + else: + if git_id in self.hash_to_object_map: + mymap.objects.add(self.hash_to_object_map[git_id]) + return + size = self.file_info.get_size_by_identifier(git_id) + if size >= 1024: + return + contents = self.file_info.get_contents_by_identifier(git_id) + lfs_object_id = self._get_lfs_values(contents).get(b'oid') + if lfs_object_id: + self.hash_to_object_map[git_id] = lfs_object_id + mymap.objects.add(lfs_object_id) + + def check_output_object(self, obj): + if not self.check_targets: + return + if type(obj) == Blob: + self.check_blob_data(obj.data, obj.id, False) + elif type(obj) == Commit: + for change in obj.file_changes: + if change.type != b'M': + continue + self.check_file_change_data(change.blob_id, False) + + def find_all_lfs_objects_in_repo(self, repo, source): + if not source: + self.file_info = FileInfoValueHelper(None, None, repo) + p = subproc.Popen(["git", "rev-list", "--objects", "--all"], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=repo) + for line in p.stdout.readlines(): + try: + (git_oid, filename) = line.split() + except ValueError: + # Commit and tree objects only have oid + continue + + mymap = self.source_objects if source else self.target_objects + size = self.file_info.get_size_by_identifier(git_oid) + if size >= 1024: + continue + contents = self.file_info.get_contents_by_identifier(git_oid) + lfs_object_id = self._get_lfs_values(contents).get(b'oid') + if lfs_object_id: + mymap.objects.add(lfs_object_id) + if not source: + self.file_info.finalize() + +class InputFileBackup: + def __init__(self, input_file, output_file): + 
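+ # Tee-style reader: everything pulled through read()/readline() is also
+ # written to output_file, keeping a copy of the incoming fast-export
+ # stream (e.g. for --dry-run comparisons).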
self.input_file = input_file + self.output_file = output_file + + def close(self): + self.input_file.close() + self.output_file.close() + + def read(self, size): + output = self.input_file.read(size) + self.output_file.write(output) + return output + + def readline(self): + line = self.input_file.readline() + self.output_file.write(line) + return line + +class DualFileWriter: + def __init__(self, file1, file2): + self.file1 = file1 + self.file2 = file2 + + def write(self, *args): + self.file1.write(*args) + self.file2.write(*args) + + def flush(self): + self.file1.flush() + self.file2.flush() + + def close(self): + self.file1.close() + self.file2.close() + +class RepoFilter(object): + def __init__(self, + args, + filename_callback = None, + message_callback = None, + name_callback = None, + email_callback = None, + refname_callback = None, + blob_callback = None, + commit_callback = None, + tag_callback = None, + reset_callback = None, + done_callback = None, + file_info_callback = None): + + self._args = args + + # Repo we are exporting + self._repo_working_dir = None + + # Store callbacks for acting on objects printed by FastExport + self._blob_callback = blob_callback + self._commit_callback = commit_callback + self._tag_callback = tag_callback + self._reset_callback = reset_callback + self._done_callback = done_callback + + # Store callbacks for acting on slices of FastExport objects + self._filename_callback = filename_callback # filenames from commits + self._message_callback = message_callback # commit OR tag message + self._name_callback = name_callback # author, committer, tagger + self._email_callback = email_callback # author, committer, tagger + self._refname_callback = refname_callback # from commit/tag/reset + self._file_info_callback = file_info_callback # various file info + self._handle_arg_callbacks() + + # Helpers for callbacks + self._file_info_value = None + + # Defaults for input + self._input = None + self._fep = None # Fast Export Process + self._fe_orig = None # Path to where original fast-export output stored + self._fe_filt = None # Path to where filtered fast-export output stored + self._parser = None # FastExportParser object we are working with + + # Defaults for output + self._output = None + self._fip = None # Fast Import Process + self._import_pipes = None + self._managed_output = True + + # A tuple of (depth, list-of-ancestors). Commits and ancestors are + # identified by their id (their 'mark' in fast-export or fast-import + # speak). The depth of a commit is one more than the max depth of any + # of its ancestors. + self._graph = AncestryGraph() + # Another one, for ancestry of commits in the original repo + self._orig_graph = AncestryGraph() + + # Names of files that were tweaked in any commit; such paths could lead + # to subsequent commits being empty + self._files_tweaked = set() + + # A set of commit hash pairs (oldhash, newhash) which used to be merge + # commits but due to filtering were turned into non-merge commits. + # The commits probably have suboptimal commit messages (e.g. "Merge branch + # next into master"). + self._commits_no_longer_merges = [] + + # A dict of original_ids to new_ids; filtering commits means getting + # new commit hash (sha1sums), and we record the mapping both for + # diagnostic purposes and so we can rewrite commit messages. Note that + # the new_id can be None rather than a commit hash if the original + # commit became empty and was pruned or was otherwise dropped. 
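+ # e.g. {b'<old commit hash>': b'<new commit hash>',
+ #       b'<hash of a pruned commit>': None}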
+ self._commit_renames = {} + + # A set of original_ids (i.e. original hashes) for which we have not yet + # gotten the new hashses; the value is always the corresponding fast-export + # id (i.e. commit.id) + self._pending_renames = collections.OrderedDict() + + # A dict of commit_hash[0:7] -> set(commit_hashes with that prefix). + # + # It's common for commit messages to refer to commits by abbreviated + # commit hashes, as short as 7 characters. To facilitate translating + # such short hashes, we have a mapping of prefixes to full old hashes. + self._commit_short_old_hashes = collections.defaultdict(set) + + # A set of commit hash references appearing in commit messages which + # mapped to a valid commit that was removed entirely in the filtering + # process. The commit message will continue to reference the + # now-missing commit hash, since there was nothing to map it to. + self._commits_referenced_but_removed = set() + + # Other vars related to metadata tracking + self._already_ran = False + self._changed_refs = set() + self._lfs_object_tracker = None + + # Progress handling (number of commits parsed, etc.) + self._progress_writer = ProgressWriter() + self._num_commits = 0 + + # Size of blobs in the repo + self._unpacked_size = {} + + # Other vars + self._sanity_checks_handled = False + self._finalize_handled = False + self._orig_refs = None + self._config_settings = {} + self._newnames = {} + self._stash = None + + # Cache a few message translations for performance reasons + self._parsed_message = _("Parsed %d commits") + + # Compile some regexes and cache those + self._hash_re = re.compile(br'(\b[0-9a-f]{7,40}\b)') + + def _handle_arg_callbacks(self): + def make_callback(args, bdy): + callback_globals = {g: globals()[g] for g in public_globals} + callback_locals = {} + if type(args) == str: + args = (args, '_do_not_use_this_var = None') + exec('def callback({}):\n'.format(', '.join(args))+ + ' '+'\n '.join(bdy.splitlines()), callback_globals, callback_locals) + return callback_locals['callback'] + def handle(which, args=None): + which_under = which.replace('-','_') + if not args: + args = which + callback_field = '_{}_callback'.format(which_under) + code_string = getattr(self._args, which_under+'_callback') + if code_string: + if os.path.exists(code_string): + with open(code_string, 'r', encoding='utf-8') as f: + code_string = f.read() + if getattr(self, callback_field): + raise SystemExit(_("Error: Cannot pass a %s_callback to RepoFilter " + "AND pass --%s-callback" + % (which_under, which))) + if 'return ' not in code_string and \ + which not in ('blob', 'commit', 'tag', 'reset'): + raise SystemExit(_("Error: --%s-callback should have a return statement") + % which) + setattr(self, callback_field, make_callback(args, code_string)) + handle('filename') + handle('message') + handle('name') + handle('email') + handle('refname') + handle('blob') + handle('commit') + handle('tag') + handle('reset') + handle('file-info', ('filename', 'mode', 'blob_id', 'value')) + + def _run_sanity_checks(self): + self._sanity_checks_handled = True + if not self._managed_output: + if not self._args.replace_refs: + # If not _managed_output we don't want to make extra changes to the + # repo, so set default to no-op 'update-no-add' + self._args.replace_refs = 'update-no-add' + return + + if self._args.debug: + print("[DEBUG] Passed arguments:\n{}".format(self._args)) + + # Determine basic repository information + target_working_dir = self._args.target or b'.' 
+    self._orig_refs = GitUtils.get_refs(target_working_dir)
+    is_bare = GitUtils.is_repository_bare(target_working_dir)
+    self._config_settings = GitUtils.get_config_settings(target_working_dir)
+
+    # Determine if this is second or later run of filter-repo
+    tmp_dir = self.results_tmp_dir(create_if_missing=False)
+    ran_path = os.path.join(tmp_dir, b'already_ran')
+    self._already_ran = os.path.isfile(ran_path)
+    if self._already_ran:
+      current_time = time.time()
+      file_mod_time = os.path.getmtime(ran_path)
+      file_age = current_time - file_mod_time
+      if file_age > 86400: # file older than a day
+        msg = (f"The previous run is older than a day ({decode(ran_path)} already exists).\n"
+               f"See \"Already Ran\" section in the manual for more information.\n"
+               f"Treat this run as a continuation of filtering in the previous run (Y/N)? ")
+        response = input(msg)
+
+        if response.lower() != 'y':
+          os.remove(ran_path)
+          self._already_ran = False
+
+    # Interaction between --already-ran and --sensitive_data_removal
+    msg = textwrap.dedent(_("""\
+      Error: Cannot specify --sensitive-data-removal on a follow-up invocation
+      of git-filter-repo unless it was specified in previous runs."""))
+    if self._already_ran:
+      sdr_path = os.path.join(tmp_dir, b'sensitive_data_removal')
+      sdr_previously = os.path.isfile(sdr_path)
+      if not sdr_previously and self._args.sensitive_data_removal:
+        raise SystemExit(msg)
+      # Treat this as a --sensitive-data-removal run if a previous run was,
+      # even if it wasn't specified this time
+      self._args.sensitive_data_removal = sdr_previously
+
+    # Have to check sensitive_data_removal interactions here instead of
+    # sanity_check_args because of the above interaction with already_ran stuff
+    if self._args.sensitive_data_removal:
+      if self._args.stdin:
+        msg = _("Error: sensitive data removal is incompatible with --stdin")
+        raise SystemExit(msg)
+      if self._args.source or self._args.target:
+        msg = _("Error: sensitive data removal is incompatible with --source and --target")
+        raise SystemExit(msg)
+
+    # Default for --replace-refs
+    if not self._args.replace_refs:
+      self._args.replace_refs = 'delete-no-add'
+    if self._args.replace_refs == 'old-default':
+      self._args.replace_refs = ('update-or-add' if self._already_ran
+                                 else 'update-and-add')
+
+    # Do sanity checks from the correct directory
+    if not self._args.force and not self._already_ran:
+      cwd = os.getcwd()
+      os.chdir(target_working_dir)
+      RepoFilter.sanity_check(self._orig_refs, is_bare, self._config_settings)
+      os.chdir(cwd)
+
+  def _setup_lfs_orphaning_checks(self):
+    # Do a couple checks to see if we want to do lfs orphaning checks
+    if not self._args.sensitive_data_removal:
+      return
+    metadata_dir = self.results_tmp_dir()
+    lfs_objects_file = os.path.join(metadata_dir, b'original_lfs_objects')
+    if self._already_ran:
+      # Check if we did lfs filtering in the previous run
+      if not os.path.isfile(lfs_objects_file):
+        return
+
+    # Set up self._file_info_value so we can query git for stuff
+    source_working_dir = self._args.source or b'.'
+ self._file_info_value = FileInfoValueHelper(self._args.replace_text, + self.insert, + source_working_dir) + + # One more check to see if we want to do lfs orphaning checks + if not self._already_ran: + # Check if lfs filtering is active in HEAD's .gitattributes file + a = self._file_info_value.get_contents_by_identifier(b"HEAD:.gitattributes") + if not a or not re.search(rb'\bfilter=lfs\b', a): + return + + # Set up the object tracker + check_sources = not self._already_ran and not self._args.partial + check_targets = not self._args.partial + self._lfs_object_tracker = LFSObjectTracker(self._file_info_value, + check_sources, + check_targets) + self._parser._lfs_object_tracker = self._lfs_object_tracker # kinda gross + + # Get initial objects + if self._already_ran: + with open(lfs_objects_file, 'br') as f: + for line in f: + self._lfs_object_tracker.source_objects.objects.add(line.strip()) + elif self._args.partial: + source = True + self._lfs_object_tracker.find_all_lfs_objects_in_repo(source_working_dir, + source) + + @staticmethod + def loose_objects_are_replace_refs(git_dir, refs, num_loose_objects): + replace_objects = set() + for refname, rev in refs.items(): + if not refname.startswith(b'refs/replace/'): + continue + replace_objects.add(rev) + + validobj_re = re.compile(rb'^[0-9a-f]{40}$') + object_dir=os.path.join(git_dir, b'objects') + for root, dirs, files in os.walk(object_dir): + for filename in files: + objname = os.path.basename(root)+filename + if objname not in replace_objects and validobj_re.match(objname): + return False + + return True + + @staticmethod + def sanity_check(refs, is_bare, config_settings): + def abort(reason): + dirname = config_settings.get(b'remote.origin.url', b'') + msg = "" + if dirname and os.path.isdir(dirname): + msg = _("Note: when cloning local repositories, you need to pass\n" + " --no-local to git clone to avoid this issue.\n") + raise SystemExit( + _("Aborting: Refusing to destructively overwrite repo history since\n" + "this does not look like a fresh clone.\n" + " (%s)\n%s" + "Please operate on a fresh clone instead. 
If you want to proceed\n" + "anyway, use --force.") % (reason, msg)) + + # Avoid letting people running with weird setups and overwriting GIT_DIR + # elsewhere + git_dir = GitUtils.determine_git_dir(b'.') + if is_bare and git_dir != b'.': + abort(_("GIT_DIR must be .")) + elif not is_bare and git_dir != b'.git': + abort(_("GIT_DIR must be .git")) + + # Check for refname collisions + if config_settings.get(b'core.ignorecase', b'false') == b'true': + collisions = collections.defaultdict(list) + for ref in refs: + collisions[ref.lower()].append(ref) + msg = "" + for ref in collisions: + if len(collisions[ref]) >= 2: + msg += " " + decode(b", ".join(collisions[ref])) + "\n" + if msg: + raise SystemExit( + _("Aborting: Cannot rewrite history on a case insensitive\n" + "filesystem since you have refs that differ in case only:\n" + "%s") % msg) + if config_settings.get(b'core.precomposeunicode', b'false') == b'true': + import unicodedata # Mac users need to have python-3.8 + collisions = collections.defaultdict(list) + for ref in refs: + strref = decode(ref) + collisions[unicodedata.normalize('NFC', strref)].append(strref) + msg = "" + for ref in collisions: + if len(collisions[ref]) >= 2: + msg += " " + ", ".join(collisions[ref]) + "\n" + if msg: + raise SystemExit( + _("Aborting: Cannot rewrite history on a character normalizing\n" + "filesystem since you have refs that differ in normalization:\n" + "%s") % msg) + + # Make sure repo is fully packed, just like a fresh clone would be. + # Note that transfer.unpackLimit defaults to 100, meaning that a + # repository with no packs and less than 100 objects should be considered + # fully packed. + output = subproc.check_output('git count-objects -v'.split()) + stats = dict(x.split(b': ') for x in output.splitlines()) + num_packs = int(stats[b'packs']) + num_loose_objects = int(stats[b'count']) + if num_packs > 1 or \ + num_loose_objects >= 100 or \ + (num_packs == 1 and num_loose_objects > 0 and + not RepoFilter.loose_objects_are_replace_refs(git_dir, refs, + num_loose_objects)): + abort(_("expected freshly packed repo")) + + # Make sure there is precisely one remote, named "origin"...or that this + # is a new bare repo with no packs and no remotes + output = subproc.check_output('git remote'.split()).strip() + if not (output == b"origin" or (num_packs == 0 and not output)): + abort(_("expected one remote, origin")) + + # Make sure that all reflogs have precisely one entry + reflog_dir=os.path.join(git_dir, b'logs') + for root, dirs, files in os.walk(reflog_dir): + for filename in files: + pathname = os.path.join(root, filename) + with open(pathname, 'br') as f: + if len(f.read().splitlines()) > 1: + shortpath = pathname[len(reflog_dir)+1:] + abort(_("expected at most one entry in the reflog for %s") % + decode(shortpath)) + + # Make sure there are no stashed changes + if b'refs/stash' in refs: + abort(_("has stashed changes")) + + # Do extra checks in non-bare repos + if not is_bare: + # Avoid uncommitted, unstaged, or untracked changes + if subproc.call('git diff --staged --quiet'.split()): + abort(_("you have uncommitted changes")) + if subproc.call('git diff --quiet'.split()): + abort(_("you have unstaged changes")) + untracked_output = subproc.check_output('git ls-files -o'.split()) + if len(untracked_output) > 0: + uf = untracked_output.rstrip(b'\n').split(b'\n') + # Since running git-filter-repo can result in files being written to + # __pycache__ (depending on python version, env vars, etc.), let's + # ignore those as far as "clean clone" is 
concerned. + relevant_uf = [x for x in uf + if not x.startswith(b'__pycache__/git_filter_repo.')] + if len(relevant_uf) > 0: + abort(_("you have untracked changes")) + + # Avoid unpushed changes + for refname, rev in refs.items(): + if not refname.startswith(b'refs/heads/'): + continue + origin_ref = refname.replace(b'refs/heads/', b'refs/remotes/origin/') + if origin_ref not in refs: + abort(_('%s exists, but %s not found') % (decode(refname), + decode(origin_ref))) + if rev != refs[origin_ref]: + abort(_('%s does not match %s') % (decode(refname), + decode(origin_ref))) + + # Make sure there is only one worktree + output = subproc.check_output('git worktree list'.split()) + if len(output.splitlines()) > 1: + abort(_('you have multiple worktrees')) + + def cleanup(self, repo, repack, reset, + run_quietly=False, show_debuginfo=False): + ''' cleanup repo; if repack then expire reflogs and do a gc --prune=now. + if reset then do a reset --hard. Optionally also curb output if + run_quietly is True, or go the opposite direction and show extra + output if show_debuginfo is True. ''' + assert not (run_quietly and show_debuginfo) + + if (repack and not run_quietly and not show_debuginfo): + print(_("Repacking your repo and cleaning out old unneeded objects")) + quiet_flags = '--quiet' if run_quietly else '' + cleanup_cmds = [] + if repack: + cleanup_cmds = ['git reflog expire --expire=now --all'.split(), + 'git gc {} --prune=now'.format(quiet_flags).split()] + if reset: + cleanup_cmds.insert(0, 'git reset {} --hard'.format(quiet_flags).split()) + location_info = ' (in {})'.format(decode(repo)) if repo != b'.' else '' + for cmd in cleanup_cmds: + if show_debuginfo: + print("[DEBUG] Running{}: {}".format(location_info, ' '.join(cmd))) + ret = subproc.call(cmd, cwd=repo) + if ret != 0: + raise SystemExit("fatal: running '%s' failed!" % ' '.join(cmd)) + if cmd[0:3] == 'git reflog expire'.split(): + self._write_stash() + + def _get_rename(self, old_hash): + # If we already know the rename, just return it + new_hash = self._commit_renames.get(old_hash, None) + if new_hash: + return new_hash + + # If it's not in the remaining pending renames, we don't know it + if old_hash is not None and old_hash not in self._pending_renames: + return None + + # Read through the pending renames until we find it or we've read them all, + # and return whatever we might find + self._flush_renames(old_hash) + return self._commit_renames.get(old_hash, None) + + def _flush_renames(self, old_hash=None, limit=0): + # Parse through self._pending_renames until we have read enough. 
We have + # read enough if: + # self._pending_renames is empty + # old_hash != None and we found a rename for old_hash + # limit > 0 and len(self._pending_renames) started less than 2*limit + # limit > 0 and len(self._pending_renames) < limit + if limit and len(self._pending_renames) < 2 * limit: + return + fi_input, fi_output = self._import_pipes + while self._pending_renames: + orig_hash, new_fast_export_id = self._pending_renames.popitem(last=False) + new_hash = fi_output.readline().rstrip() + self._commit_renames[orig_hash] = new_hash + self._graph.record_hash(new_fast_export_id, new_hash) + if old_hash == orig_hash: + return + if limit and len(self._pending_renames) < limit: + return + + def _translate_commit_hash(self, matchobj_or_oldhash): + old_hash = matchobj_or_oldhash + if not isinstance(matchobj_or_oldhash, bytes): + old_hash = matchobj_or_oldhash.group(1) + orig_len = len(old_hash) + new_hash = self._get_rename(old_hash) + if new_hash is None: + if old_hash[0:7] not in self._commit_short_old_hashes: + self._commits_referenced_but_removed.add(old_hash) + return old_hash + possibilities = self._commit_short_old_hashes[old_hash[0:7]] + matches = [x for x in possibilities + if x[0:orig_len] == old_hash] + if len(matches) != 1: + self._commits_referenced_but_removed.add(old_hash) + return old_hash + old_hash = matches[0] + new_hash = self._get_rename(old_hash) + + assert new_hash is not None + return new_hash[0:orig_len] + + def _maybe_trim_extra_parents(self, orig_parents, parents): + '''Due to pruning of empty commits, some parents could be non-existent + (None) or otherwise redundant. Remove the non-existent parents, and + remove redundant parents ***SO LONG AS*** that doesn't transform a + merge commit into a non-merge commit. + + Returns a tuple: + (parents, new_first_parent_if_would_become_non_merge)''' + + always_prune = (self._args.prune_degenerate == 'always') + + # Pruning of empty commits means multiple things: + # * An original parent of this commit may have been pruned causing the + # need to rewrite the reported parent to the nearest ancestor. We + # want to know when we're dealing with such a parent. + # * Further, there may be no "nearest ancestor" if the entire history + # of that parent was also pruned. (Detectable by the parent being + # 'None') + # Remove all parents rewritten to None, and keep track of which parents + # were rewritten to an ancestor. + tmp = zip(parents, + orig_parents, + [(x in _SKIPPED_COMMITS or always_prune) for x in orig_parents]) + tmp2 = [x for x in tmp if x[0] is not None] + if not tmp2: + # All ancestors have been pruned; we have no parents. + return [], None + parents, orig_parents, is_rewritten = [list(x) for x in zip(*tmp2)] + + # We can't have redundant parents if we don't have at least 2 parents + if len(parents) < 2: + return parents, None + + # Don't remove redundant parents if user doesn't want us to + if self._args.prune_degenerate == 'never': + return parents, None + + # Remove duplicate parents (if both sides of history have lots of commits + # which become empty due to pruning, the most recent ancestor on both + # sides may be the same commit), except only remove parents that have + # been rewritten due to previous empty pruning. + seen = set() + seen_add = seen.add + # Deleting duplicate rewritten parents means keeping parents if either + # they have not been seen or they are ones that have not been rewritten. 
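+    # (For example, with parents [A, A, B] where the second A is merely a
+    # rewritten stand-in for a pruned commit, the filtering below keeps the
+    # first A and B and drops the duplicate.)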
+ parents_copy = parents + uniq = [[p, orig_parents[i], is_rewritten[i]] for i, p in enumerate(parents) + if not (p in seen or seen_add(p)) or not is_rewritten[i]] + parents, orig_parents, is_rewritten = [list(x) for x in zip(*uniq)] + if len(parents) < 2: + return parents_copy, parents[0] + + # Flatten unnecessary merges. (If one side of history is entirely + # empty commits that were pruned, we may end up attempting to + # merge a commit with its ancestor. Remove parents that are an + # ancestor of another parent.) + num_parents = len(parents) + to_remove = [] + for cur in range(num_parents): + if not is_rewritten[cur]: + continue + for other in range(num_parents): + if cur == other: + continue + if not self._graph.is_ancestor(parents[cur], parents[other]): + continue + # parents[cur] is an ancestor of parents[other], so parents[cur] + # seems redundant. However, if it was intentionally redundant + # (e.g. a no-ff merge) in the original, then we want to keep it. + if not always_prune and \ + self._orig_graph.is_ancestor(orig_parents[cur], + orig_parents[other]): + continue + # Some folks want their history to have all first parents be merge + # commits (except for any root commits), and always do a merge --no-ff. + # For such folks, don't remove the first parent even if it's an + # ancestor of other commits. + if self._args.no_ff and cur == 0: + continue + # Okay so the cur-th parent is an ancestor of the other-th parent, + # and it wasn't that way in the original repository; mark the + # cur-th parent as removable. + to_remove.append(cur) + break # cur removed, so skip rest of others -- i.e. check cur+=1 + for x in reversed(to_remove): + parents.pop(x) + if len(parents) < 2: + return parents_copy, parents[0] + + return parents, None + + def _prunable(self, commit, new_1st_parent, had_file_changes, orig_parents): + parents = commit.parents + + if self._args.prune_empty == 'never': + return False + always_prune = (self._args.prune_empty == 'always') + + # For merge commits, unless there are prunable (redundant) parents, we + # do not want to prune + if len(parents) >= 2 and not new_1st_parent: + return False + + if len(parents) < 2: + # Special logic for commits that started empty... + if not had_file_changes and not always_prune: + had_parents_pruned = (len(parents) < len(orig_parents) or + (len(orig_parents) == 1 and + orig_parents[0] in _SKIPPED_COMMITS)) + # If the commit remains empty and had parents which were pruned, + # then prune this commit; otherwise, retain it + return (not commit.file_changes and had_parents_pruned) + + # We can only get here if the commit didn't start empty, so if it's + # empty now, it obviously became empty + if not commit.file_changes: + return True + + # If there are no parents of this commit and we didn't match the case + # above, then this commit cannot be pruned. Since we have no parent(s) + # to compare to, abort now to prevent future checks from failing. + if not parents: + return False + + # Similarly, we cannot handle the hard cases if we don't have a pipe + # to communicate with fast-import + if not self._import_pipes: + return False + + # If there have not been renames/remappings of IDs (due to insertion of + # new blobs), then we can sometimes know things aren't prunable with a + # simple check + if not _IDS.has_renames(): + # non-merge commits can only be empty if blob/file-change editing caused + # all file changes in the commit to have the same file contents as + # the parent. 
+ changed_files = set(change.filename for change in commit.file_changes) + if len(orig_parents) < 2 and changed_files - self._files_tweaked: + return False + + # Finally, the hard case: due to either blob rewriting, or due to pruning + # of empty commits wiping out the first parent history back to the merge + # base, the list of file_changes we have may not actually differ from our + # (new) first parent's version of the files, i.e. this would actually be + # an empty commit. Check by comparing the contents of this commit to its + # (remaining) parent. + # + # NOTE on why this works, for the case of original first parent history + # having been pruned away due to being empty: + # The first parent history having been pruned away due to being + # empty implies the original first parent would have a tree (after + # filtering) that matched the merge base's tree. Since + # file_changes has the changes needed to go from what would have + # been the first parent to our new commit, and what would have been + # our first parent has a tree that matches the merge base, then if + # the new first parent has a tree matching the versions of files in + # file_changes, then this new commit is empty and thus prunable. + fi_input, fi_output = self._import_pipes + self._flush_renames() # Avoid fi_output having other stuff present + # Optimization note: we could have two loops over file_changes, the + # first doing all the self._output.write() calls, and the second doing + # the rest. But I'm worried about fast-import blocking on fi_output + # buffers filling up so I instead read from it as I go. + for change in commit.file_changes: + parent = new_1st_parent or commit.parents[0] # exists due to above checks + quoted_filename = PathQuoting.enquote(change.filename) + if isinstance(parent, int): + self._output.write(b"ls :%d %s\n" % (parent, quoted_filename)) + else: + self._output.write(b"ls %s %s\n" % (parent, quoted_filename)) + self._output.flush() + parent_version = fi_output.readline().split() + if change.type == b'D': + if parent_version != [b'missing', quoted_filename]: + return False + else: + blob_sha = change.blob_id + if isinstance(change.blob_id, int): + self._output.write(b"get-mark :%d\n" % change.blob_id) + self._output.flush() + blob_sha = fi_output.readline().rstrip() + if parent_version != [change.mode, b'blob', blob_sha, quoted_filename]: + return False + + return True + + def _record_remapping(self, commit, orig_parents): + new_id = None + # Record the mapping of old commit hash to new one + if commit.original_id and self._import_pipes: + fi_input, fi_output = self._import_pipes + self._output.write(b"get-mark :%d\n" % commit.id) + self._output.flush() + orig_id = commit.original_id + self._commit_short_old_hashes[orig_id[0:7]].add(orig_id) + # Note that we have queued up an id for later reading; flush a + # few of the older ones if we have too many queued up + self._pending_renames[orig_id] = commit.id + self._flush_renames(None, limit=40) + # Also, record if this was a merge commit that turned into a non-merge + # commit. 
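+    # (e.g. a merge whose other parents were all pruned away, leaving it
+    # with fewer than two parents)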
+ if len(orig_parents) >= 2 and len(commit.parents) < 2: + self._commits_no_longer_merges.append((commit.original_id, new_id)) + + def callback_metadata(self, extra_items = dict()): + return {'commit_rename_func': self._translate_commit_hash, + 'ancestry_graph': self._graph, + 'original_ancestry_graph': self._orig_graph, + **extra_items} + + def _tweak_blob(self, blob): + if self._args.max_blob_size and len(blob.data) > self._args.max_blob_size: + blob.skip() + + if blob.original_id in self._args.strip_blobs_with_ids: + blob.skip() + + if ( self._args.replace_text + and not self._file_info_callback + # not (if blob contains zero byte in the first 8Kb, that is, if blob is binary data) + and not b"\0" in blob.data[0:8192] + ): + for literal, replacement in self._args.replace_text['literals']: + blob.data = blob.data.replace(literal, replacement) + for regex, replacement in self._args.replace_text['regexes']: + blob.data = regex.sub(replacement, blob.data) + + if self._blob_callback: + self._blob_callback(blob, self.callback_metadata()) + + self._insert_into_stream(blob) + + def _filter_files(self, commit): + def filename_matches(path_expression, pathname): + ''' Returns whether path_expression matches pathname or a leading + directory thereof, allowing path_expression to not have a trailing + slash even if it is meant to match a leading directory. ''' + if path_expression == b'': + return True + n = len(path_expression) + if (pathname.startswith(path_expression) and + (path_expression[n-1:n] == b'/' or + len(pathname) == n or + pathname[n:n+1] == b'/')): + return True + return False + + def newname(path_changes, pathname, use_base_name, filtering_is_inclusive): + ''' Applies filtering and rename changes from path_changes to pathname, + returning any of None (file isn't wanted), original filename (file + is wanted with original name), or new filename. ''' + wanted = False + full_pathname = pathname + if use_base_name: + pathname = os.path.basename(pathname) + for (mod_type, match_type, path_exp) in path_changes: + if mod_type == 'filter' and not wanted: + assert match_type in ('match', 'glob', 'regex') + if match_type == 'match' and filename_matches(path_exp, pathname): + wanted = True + if match_type == 'glob' and fnmatch.fnmatch(pathname, path_exp): + wanted = True + if match_type == 'regex' and path_exp.search(pathname): + wanted = True + elif mod_type == 'rename': + match, repl = path_exp + assert match_type in ('match','regex') # glob was translated to regex + if match_type == 'match' and filename_matches(match, full_pathname): + full_pathname = full_pathname.replace(match, repl, 1) + pathname = full_pathname # rename incompatible with use_base_name + if match_type == 'regex': + full_pathname = match.sub(repl, full_pathname) + pathname = full_pathname # rename incompatible with use_base_name + return full_pathname if (wanted == filtering_is_inclusive) else None + + args = self._args + new_file_changes = {} # Assumes no renames or copies, otherwise collisions + for change in commit.file_changes: + # NEEDSWORK: _If_ we ever want to pass `--full-tree` to fast-export and + # parse that output, we'll need to modify this block; `--full-tree` + # issues a deleteall directive which has no filename, and thus this + # block would normally strip it. Of course, FileChange() and + # _parse_optional_filechange() would need updates too. 
+ if change.type == b'DELETEALL': + new_file_changes[b''] = change + continue + if change.filename in self._newnames: + change.filename = self._newnames[change.filename] + else: + original_filename = change.filename + change.filename = newname(args.path_changes, change.filename, + args.use_base_name, args.inclusive) + if self._filename_callback: + change.filename = self._filename_callback(change.filename) + self._newnames[original_filename] = change.filename + if not change.filename: + continue # Filtering criteria excluded this file; move on to next one + if change.filename in new_file_changes: + # Getting here means that path renaming is in effect, and caused one + # path to collide with another. That's usually bad, but can be okay + # under two circumstances: + # 1) Sometimes people have a file named OLDFILE in old revisions of + # history, and they rename to NEWFILE, and would like to rewrite + # history so that all revisions refer to it as NEWFILE. As such, + # we can allow a collision when (at least) one of the two paths + # is a deletion. Note that if OLDFILE and NEWFILE are unrelated + # this also allows the rewrite to continue, which makes sense + # since OLDFILE is no longer in the way. + # 2) If OLDFILE and NEWFILE are exactly equal, then writing them + # both to the same location poses no problem; we only need one + # file. (This could come up if someone copied a file in some + # commit, then later either deleted the file or kept it exactly + # in sync with the original with any changes, and then decides + # they want to rewrite history to only have one of the two files) + colliding_change = new_file_changes[change.filename] + if change.type == b'D': + # We can just throw this one away and keep the other + continue + elif change.type == b'M' and ( + change.mode == colliding_change.mode and + change.blob_id == colliding_change.blob_id): + # The two are identical, so we can throw this one away and keep other + continue + elif new_file_changes[change.filename].type != b'D': + raise SystemExit(_("File renaming caused colliding pathnames!\n") + + _(" Commit: {}\n").format(commit.original_id) + + _(" Filename: {}").format(change.filename)) + # Strip files that are too large + if self._args.max_blob_size and \ + self._unpacked_size.get(change.blob_id, 0) > self._args.max_blob_size: + continue + if self._args.strip_blobs_with_ids and \ + change.blob_id in self._args.strip_blobs_with_ids: + continue + # Otherwise, record the change + new_file_changes[change.filename] = change + commit.file_changes = [v for k,v in sorted(new_file_changes.items())] + + def _tweak_commit(self, commit, aux_info): + if self._args.replace_message: + for literal, replacement in self._args.replace_message['literals']: + commit.message = commit.message.replace(literal, replacement) + for regex, replacement in self._args.replace_message['regexes']: + commit.message = regex.sub(replacement, commit.message) + if self._message_callback: + commit.message = self._message_callback(commit.message) + + # Change the commit message according to callback + if not self._args.preserve_commit_hashes: + commit.message = self._hash_re.sub(self._translate_commit_hash, + commit.message) + + # Change the author & committer according to mailmap rules + args = self._args + if args.mailmap: + commit.author_name, commit.author_email = \ + args.mailmap.translate(commit.author_name, commit.author_email) + commit.committer_name, commit.committer_email = \ + args.mailmap.translate(commit.committer_name, commit.committer_email) + # Change 
author & committer according to callbacks + if self._name_callback: + commit.author_name = self._name_callback(commit.author_name) + commit.committer_name = self._name_callback(commit.committer_name) + if self._email_callback: + commit.author_email = self._email_callback(commit.author_email) + commit.committer_email = self._email_callback(commit.committer_email) + + # Sometimes the 'branch' given is a tag; if so, rename it as requested so + # we don't get any old tagnames + if self._args.tag_rename: + commit.branch = RepoFilter._do_tag_rename(args.tag_rename, commit.branch) + if self._refname_callback: + commit.branch = self._refname_callback(commit.branch) + + # Filter or rename the list of file changes + orig_file_changes = set(commit.file_changes) + self._filter_files(commit) + + # Record ancestry graph + parents, orig_parents = commit.parents, aux_info['orig_parents'] + if self._args.state_branch: + external_parents = parents + else: + external_parents = [p for p in parents if not isinstance(p, int)] + # The use of 'reversed' is intentional here; there is a risk that we have + # duplicates in parents, and we want to map from parents to the first + # entry we find in orig_parents in such cases. + parent_reverse_dict = dict(zip(reversed(parents), reversed(orig_parents))) + + self._graph.record_external_commits(external_parents) + self._orig_graph.record_external_commits(external_parents) + self._graph.add_commit_and_parents(commit.id, parents) # new githash unknown + self._orig_graph.add_commit_and_parents(commit.old_id, orig_parents, + commit.original_id) + + # Prune parents (due to pruning of empty commits) if relevant, note that + # new_1st_parent is None unless this was a merge commit that is becoming + # a non-merge + prev_1st_parent = parents[0] if parents else None + parents, new_1st_parent = self._maybe_trim_extra_parents(orig_parents, + parents) + commit.parents = parents + + # If parents were pruned, then we need our file changes to be relative + # to the new first parent + # + # Notes: + # * new_1st_parent and new_1st_parent != parents[0] uniquely happens for example when: + # working on merge, selecting subset of files and merge base still + # valid while first parent history doesn't touch any of those paths, + # but second parent history does. prev_1st_parent had already been + # rewritten to the non-None first ancestor and it remains valid. + # self._maybe_trim_extra_parents() avoids removing this first parent + # because it'd make the commit a non-merge. However, if there are + # no file_changes of note, we'll drop this commit and mark + # new_1st_parent as the new replacement. To correctly determine if + # there are no file_changes of note, we need to have the list of + # file_changes relative to new_1st_parent. + # (See t9390#3, "basic -> basic-ten using '--path ten'") + # * prev_1st_parent != parents[0] happens for example when: + # similar to above, but the merge base is no longer valid and was + # pruned away as well. Then parents started as e.g. [None, $num], + # and both prev_1st_parent and new_1st_parent are None, while parents + # after self._maybe_trim_extra_parents() becomes just [$num]. + # (See t9390#67, "degenerate merge with non-matching filename".) + # Since $num was originally a second parent, we need to rewrite + # file changes to be relative to parents[0]. + # * TODO: We should be getting the changes relative to the new first + # parent even if self._fep is None, BUT we can't. 
+    #     Our method of
+    #     getting the changes right now is an external git diff invocation,
+    #     which we can't do if we just have a fast export stream. We can't
+    #     really work around it by querying the fast-import stream either,
+    #     because the 'ls' directive only allows us to list info about
+    #     specific paths, but we need to find out which paths exist in two
+    #     commits and then query them. We could maybe force checkpointing in
+    #     fast-import, then doing a diff from what'll be the new first parent
+    #     back to prev_1st_parent (which may be None, i.e. empty tree), using
+    #     the fact that in A->{B,C}->D, where D is merge of B & C, the diff
+    #     from C->D == C->A + A->B + B->D, and in these cases A==B, so it
+    #     simplifies to C->D == C->A + B->D, and C is our new 1st parent
+    #     commit, A is prev_1st_commit, and B->D is commit.file_changes that
+    #     we already have. However, checkpointing the fast-import process
+    #     and figuring out how long to wait before we can run our diff just
+    #     seems excessive. For now, just punt and assume the merge wasn't
+    #     "evil" (i.e. that its remerge-diff is empty, as is true for most
+    #     merges). If the merge isn't evil, no further steps are necessary.
+    if parents and self._fep and (
+        prev_1st_parent != parents[0] or
+        new_1st_parent and new_1st_parent != parents[0]):
+      # Get the id from the original fast export stream corresponding to the
+      # new 1st parent. As noted above, that new 1st parent might be
+      # new_1st_parent, or if that is None, it'll be parents[0].
+      will_be_1st = new_1st_parent or parents[0]
+      old_id = parent_reverse_dict[will_be_1st]
+      # Now, translate that to a hash
+      will_be_1st_commit_hash = self._orig_graph.map_to_hash(old_id)
+      # Get the changes from what is going to be the new 1st parent to this
+      # merge commit. Note that since we are going from the new 1st parent
+      # to the merge commit, we can just replace the existing
+      # commit.file_changes rather than getting something we need to combine
+      # with the existing commit.file_changes. Also, we can just replace
+      # because prev_1st_parent is an ancestor of will_be_1st_commit_hash
+      # (or prev_1st_parent is None and first parent history is gone), so
+      # even if we retain prev_1st_parent and do not prune it, the changes
+      # will still work given the snapshot-based way fast-export/fast-import
+      # work.
+      commit.file_changes = GitUtils.get_file_changes(self._repo_working_dir,
+                                                      will_be_1st_commit_hash,
+                                                      commit.original_id)
+
+      # Save these and filter them
+      orig_file_changes = set(commit.file_changes)
+      self._filter_files(commit)
+
+    # Process the --file-info-callback
+    if self._file_info_callback:
+      if self._file_info_value is None:
+        source_working_dir = self._args.source or b'.'
+        self._file_info_value = FileInfoValueHelper(self._args.replace_text,
+                                                    self.insert,
+                                                    source_working_dir)
+      new_file_changes = []
+      for change in commit.file_changes:
+        if change.type != b'D':
+          assert(change.type == b'M')
+          (filename, mode, blob_id) = \
+            self._file_info_callback(change.filename,
+                                     change.mode,
+                                     change.blob_id,
+                                     self._file_info_value)
+          if mode is None:
+            # TODO: Should deletion of the file even be a feature? Might
+            # want to remove this branch of the if-elif-else.
+ assert(filename is not None) + assert(blob_id is not None) + new_change = FileChange(b'D', filename) + elif filename is None: + continue # Drop the FileChange from this commit + else: + new_change = FileChange(b'M', filename, blob_id, mode) + else: + new_change = change # use change as-is for deletions + new_file_changes.append(new_change) + commit.file_changes = new_file_changes + + # Call the user-defined callback, if any + if self._commit_callback: + self._commit_callback(commit, self.callback_metadata(aux_info)) + + # Find out which files were modified by the callbacks. Such paths could + # lead to subsequent commits being empty (e.g. if removing a line containing + # a password from every version of a file that had the password, and some + # later commit did nothing more than remove that line) + final_file_changes = set(commit.file_changes) + if self._args.replace_text or self._blob_callback: + differences = orig_file_changes.union(final_file_changes) + else: + differences = orig_file_changes.symmetric_difference(final_file_changes) + self._files_tweaked.update(x.filename for x in differences) + + # Now print the resulting commit, or if prunable skip it + if not commit.dumped: + if not self._prunable(commit, new_1st_parent, + aux_info['had_file_changes'], orig_parents): + self._insert_into_stream(commit) + self._record_remapping(commit, orig_parents) + else: + rewrite_to = new_1st_parent or commit.first_parent() + commit.skip(new_id = rewrite_to) + if self._args.state_branch: + alias = Alias(commit.old_id or commit.id, rewrite_to or deleted_hash) + self._insert_into_stream(alias) + if commit.branch.startswith(b'refs/') or commit.branch == b'HEAD': + # The special check above is because when direct revisions are passed + # along to fast-export (such as with stashes), there is a chance the + # revision is rewritten to nothing. In such cases, we don't want to + # point an invalid ref that just names a revision to some other point. + reset = Reset(commit.branch, rewrite_to or deleted_hash) + self._insert_into_stream(reset) + self._commit_renames[commit.original_id] = None + + # Show progress + self._num_commits += 1 + if not self._args.quiet: + self._progress_writer.show(self._parsed_message % self._num_commits) + + @staticmethod + def _do_tag_rename(rename_pair, tagname): + old, new = rename_pair.split(b':', 1) + old, new = b'refs/tags/'+old, b'refs/tags/'+new + if tagname.startswith(old): + return tagname.replace(old, new, 1) + return tagname + + def _tweak_tag(self, tag): + # Tweak the tag message according to callbacks + if self._args.replace_message: + for literal, replacement in self._args.replace_message['literals']: + tag.message = tag.message.replace(literal, replacement) + for regex, replacement in self._args.replace_message['regexes']: + tag.message = regex.sub(replacement, tag.message) + if self._message_callback: + tag.message = self._message_callback(tag.message) + + # Tweak the tag name according to tag-name-related callbacks + tag_prefix = b'refs/tags/' + fullref = tag_prefix+tag.ref + if self._args.tag_rename: + fullref = RepoFilter._do_tag_rename(self._args.tag_rename, fullref) + if self._refname_callback: + fullref = self._refname_callback(fullref) + if not fullref.startswith(tag_prefix): + msg = "Error: fast-import requires tags to be in refs/tags/ namespace." 
+ msg += "\n {} renamed to {}".format(tag_prefix+tag.ref, fullref) + raise SystemExit(msg) + tag.ref = fullref[len(tag_prefix):] + + # Tweak the tagger according to callbacks + if self._args.mailmap: + tag.tagger_name, tag.tagger_email = \ + self._args.mailmap.translate(tag.tagger_name, tag.tagger_email) + if self._name_callback: + tag.tagger_name = self._name_callback(tag.tagger_name) + if self._email_callback: + tag.tagger_email = self._email_callback(tag.tagger_email) + + # Call general purpose tag callback + if self._tag_callback: + self._tag_callback(tag, self.callback_metadata()) + + def _tweak_reset(self, reset): + if self._args.tag_rename: + reset.ref = RepoFilter._do_tag_rename(self._args.tag_rename, reset.ref) + if self._refname_callback: + reset.ref = self._refname_callback(reset.ref) + if self._reset_callback: + self._reset_callback(reset, self.callback_metadata()) + + def results_tmp_dir(self, create_if_missing=True): + target_working_dir = self._args.target or b'.' + git_dir = GitUtils.determine_git_dir(target_working_dir) + d = os.path.join(git_dir, b'filter-repo') + if create_if_missing and not os.path.isdir(d): + os.mkdir(d) + return d + + def _load_marks_file(self, marks_basename): + full_branch = 'refs/heads/{}'.format(self._args.state_branch) + marks_file = os.path.join(self.results_tmp_dir(), marks_basename) + working_dir = self._args.target or b'.' + cmd = ['git', '-C', working_dir, 'show-ref', full_branch] + contents = b'' + if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0: + cmd = ['git', '-C', working_dir, 'show', + '%s:%s' % (full_branch, decode(marks_basename))] + try: + contents = subproc.check_output(cmd) + except subprocess.CalledProcessError as e: # pragma: no cover + raise SystemExit(_("Failed loading %s from %s") % + (decode(marks_basename), full_branch)) + if contents: + biggest_id = max(int(x.split()[0][1:]) for x in contents.splitlines()) + _IDS._next_id = max(_IDS._next_id, biggest_id+1) + with open(marks_file, 'bw') as f: + f.write(contents) + return marks_file + + def _save_marks_files(self): + basenames = [b'source-marks', b'target-marks'] + working_dir = self._args.target or b'.' 
+ + # Check whether the branch exists + parent = [] + full_branch = 'refs/heads/{}'.format(self._args.state_branch) + cmd = ['git', '-C', working_dir, 'show-ref', full_branch] + if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0: + parent = ['-p', full_branch] + + # Run 'git hash-object $MARKS_FILE' for each marks file, save result + blob_hashes = {} + for marks_basename in basenames: + marks_file = os.path.join(self.results_tmp_dir(), marks_basename) + if not os.path.isfile(marks_file): # pragma: no cover + raise SystemExit(_("Failed to find %s to save to %s") + % (marks_file, self._args.state_branch)) + cmd = ['git', '-C', working_dir, 'hash-object', '-w', marks_file] + blob_hashes[marks_basename] = subproc.check_output(cmd).strip() + + # Run 'git mktree' to create a tree out of it + p = subproc.Popen(['git', '-C', working_dir, 'mktree'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE) + for b in basenames: + p.stdin.write(b'100644 blob %s\t%s\n' % (blob_hashes[b], b)) + p.stdin.close() + p.wait() + tree = p.stdout.read().strip() + + # Create the new commit + cmd = (['git', '-C', working_dir, 'commit-tree', '-m', 'New mark files', + tree] + parent) + commit = subproc.check_output(cmd).strip() + subproc.call(['git', '-C', working_dir, 'update-ref', full_branch, commit]) + + def importer_only(self): + self._run_sanity_checks() + self._setup_output() + + def set_output(self, outputRepoFilter): + assert outputRepoFilter._output + + # set_output implies this RepoFilter is doing exporting, though may not + # be the only one. + self._setup_input(use_done_feature = False) + + # Set our output management up to pipe to outputRepoFilter's locations + self._managed_output = False + self._output = outputRepoFilter._output + self._import_pipes = outputRepoFilter._import_pipes + + # Handle sanity checks, though currently none needed for export-only cases + self._run_sanity_checks() + + def _read_stash(self): + if self._stash: + return + if self._orig_refs and b'refs/stash' in self._orig_refs and \ + self._args.refs == ['--all']: + repo_working_dir = self._args.source or b'.' + git_dir = GitUtils.determine_git_dir(repo_working_dir) + stash = os.path.join(git_dir, b'logs', b'refs', b'stash') + if os.path.exists(stash): + self._stash = [] + with open(stash, 'br') as f: + for line in f: + (oldhash, newhash, rest) = line.split(None, 2) + self._stash.append((newhash, rest)) + self._args.refs.extend([x[0] for x in self._stash]) + + def _write_stash(self): + last = deleted_hash + if self._stash: + target_working_dir = self._args.target or b'.' 
+ git_dir = GitUtils.determine_git_dir(target_working_dir) + stash = os.path.join(git_dir, b'logs', b'refs', b'stash') + with open(stash, 'bw') as f: + for (hash, rest) in self._stash: + new_hash = self._get_rename(hash) + if new_hash is None: + continue + f.write(b' '.join([last, new_hash, rest]) + b'\n') + last = new_hash + print(_("Rewrote the stash.")) + + def _setup_input(self, use_done_feature): + if self._args.stdin: + self._input = sys.stdin.detach() + sys.stdin = None # Make sure no one tries to accidentally use it + self._fe_orig = None + else: + self._read_stash() + skip_blobs = (self._blob_callback is None and + (self._args.replace_text is None or + self._file_info_callback is not None) and + self._args.source == self._args.target) + extra_flags = [] + if skip_blobs: + extra_flags.append('--no-data') + if self._args.max_blob_size: + self._unpacked_size, packed_size = GitUtils.get_blob_sizes() + if use_done_feature: + extra_flags.append('--use-done-feature') + if write_marks: + extra_flags.append(b'--mark-tags') + if self._args.state_branch: + assert(write_marks) + source_marks_file = self._load_marks_file(b'source-marks') + extra_flags.extend([b'--export-marks='+source_marks_file, + b'--import-marks='+source_marks_file]) + if self._args.preserve_commit_encoding is not None: # pragma: no cover + reencode = 'no' if self._args.preserve_commit_encoding else 'yes' + extra_flags.append('--reencode='+reencode) + if self._args.date_order: + extra_flags.append('--date-order') + location = ['-C', self._args.source] if self._args.source else [] + fep_cmd = ['git'] + location + ['fast-export', '--show-original-ids', + '--signed-tags=strip', '--tag-of-filtered-object=rewrite', + '--fake-missing-tagger', '--reference-excluded-parents' + ] + extra_flags + self._args.refs + self._fep = subproc.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE) + self._input = self._fep.stdout + if self._args.dry_run or self._args.debug: + self._fe_orig = os.path.join(self.results_tmp_dir(), + b'fast-export.original') + output = open(self._fe_orig, 'bw') + self._input = InputFileBackup(self._input, output) + if self._args.debug: + tmp = [decode(x) if isinstance(x, bytes) else x for x in fep_cmd] + print("[DEBUG] Running: {}".format(' '.join(tmp))) + print(" (saving a copy of the output at {})" + .format(decode(self._fe_orig))) + + def _setup_output(self): + if not self._args.dry_run: + location = ['-C', self._args.target] if self._args.target else [] + fip_cmd = ['git'] + location + ['-c', 'core.ignorecase=false', + 'fast-import', '--force', '--quiet'] + if date_format_permissive: + fip_cmd.append('--date-format=raw-permissive') + if self._args.state_branch: + target_marks_file = self._load_marks_file(b'target-marks') + fip_cmd.extend([b'--export-marks='+target_marks_file, + b'--import-marks='+target_marks_file]) + self._fip = subproc.Popen(fip_cmd, bufsize=-1, + stdin=subprocess.PIPE, stdout=subprocess.PIPE) + self._import_pipes = (self._fip.stdin, self._fip.stdout) + if self._args.dry_run or self._args.debug: + self._fe_filt = os.path.join(self.results_tmp_dir(), + b'fast-export.filtered') + self._output = open(self._fe_filt, 'bw') + else: + self._output = self._fip.stdin + if self._args.debug and not self._args.dry_run: + self._output = DualFileWriter(self._fip.stdin, self._output) + tmp = [decode(x) if isinstance(x, bytes) else x for x in fip_cmd] + print("[DEBUG] Running: {}".format(' '.join(tmp))) + print(" (using the following file as input: {})" + .format(decode(self._fe_filt))) + + def 
_migrate_origin_to_heads(self):
+    source_working_dir = self._args.source or b'.'
+    target_working_dir = self._args.target or b'.'
+    refs_to_migrate = set(x for x in self._orig_refs
+                          if x.startswith(b'refs/remotes/origin/'))
+    refs_to_warn_about = set()
+    if refs_to_migrate:
+      if self._args.debug:
+        print("[DEBUG] Migrating refs/remotes/origin/* -> refs/heads/*")
+      p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
+                        stdin=subprocess.PIPE, cwd=source_working_dir)
+      for ref in refs_to_migrate:
+        if ref == b'refs/remotes/origin/HEAD':
+          p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
+          del self._orig_refs[ref]
+          continue
+        newref = ref.replace(b'refs/remotes/origin/', b'refs/heads/')
+        if newref not in self._orig_refs:
+          p.stdin.write(b'create %s %s\n' % (newref, self._orig_refs[ref]))
+          self._orig_refs[newref] = self._orig_refs[ref]
+        elif self._orig_refs[ref] != self._orig_refs[newref]:
+          refs_to_warn_about.add(newref)
+        p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
+        del self._orig_refs[ref]
+      p.stdin.close()
+      if p.wait(): # pragma: no cover
+        msg = _("git update-ref failed; see above")
+        raise SystemExit(msg)
+
+    if b'remote.origin.url' not in self._config_settings:
+      return
+
+    # For sensitive data removals, fetch ALL refs. Non-mirror clones normally
+    # only grab branches and tags, but other refs may hold on to the sensitive
+    # data as well.
+    if self._args.sensitive_data_removal and \
+       not self._args.no_fetch and \
+       not self._already_ran and \
+       self._config_settings.get(b'remote.origin.mirror', b'false') != b'true':
+
+      if refs_to_warn_about:
+        msg = ("Warning: You have refs modified from upstream:\n " +
+               "\n ".join([decode(x) for x in refs_to_warn_about]) +
+               "\n" +
+               " We want to forcibly fetch from upstream to ensure\n" +
+               " that all relevant refs are rewritten, but this will\n" +
+               " discard your local changes before starting the\n" +
+               " rewrite. 
Proceed with fetch (Y/N)?") + response = input(msg) + + if response.lower() != 'y': + self._args.no_fetch = True + # Don't do the fetch, and don't remove the origin remote + return + + cmd = 'git fetch -q --prune --update-head-ok --refmap "" origin +refs/*:refs/*' + m = _("NOTICE: Fetching all refs from origin to make sure we rewrite\n" + " all history that may reference the sensitive data, via\n" + " "+cmd) + print(m) + ret = subproc.call([arg if arg != '""' else '' for arg in cmd.split()], + cwd=source_working_dir) + if ret != 0: # pragma: no cover + m = _("Warning: Fetching all refs from origin failed") + print(m) + if self._args.sensitive_data_removal: + return + + # Now remove the origin remote + url = self._config_settings[b'remote.origin.url'].decode(errors='replace') + m = _("NOTICE: Removing 'origin' remote; see 'Why is my origin removed?'\n" + " in the manual if you want to push back there.\n" + " (was %s)") % url + print(m) + subproc.call('git remote rm origin'.split(), cwd=target_working_dir) + + def _final_commands(self): + self._finalize_handled = True + self._done_callback and self._done_callback() + + if self._file_info_value: + self._file_info_value.finalize() + if not self._args.quiet: + self._progress_writer.finish() + + def _ref_update(self, target_working_dir): + # Start the update-ref process + p = subproc.Popen('git update-ref --no-deref --stdin'.split(), + stdin=subprocess.PIPE, + cwd=target_working_dir) + + # Remove replace_refs from _orig_refs + replace_refs = {k:v for k, v in self._orig_refs.items() + if k.startswith(b'refs/replace/')} + reverse_replace_refs = collections.defaultdict(list) + for k,v in replace_refs.items(): + reverse_replace_refs[v].append(k) + all(map(self._orig_refs.pop, replace_refs)) + + # Remove unused refs + exported_refs, imported_refs = self.get_exported_and_imported_refs() + refs_to_nuke = exported_refs - imported_refs + # Because revisions can be passed to fast-export which handles them as + # though they were refs, we might have bad "refs" to nuke; strip them out. + refs_to_nuke = [x for x in refs_to_nuke + if x.startswith(b'refs/') or x == b'HEAD'] + if self._args.partial: + refs_to_nuke = set() + if refs_to_nuke and self._args.debug: + print("[DEBUG] Deleting the following refs:\n "+ + decode(b"\n ".join(sorted(refs_to_nuke)))) + p.stdin.write(b''.join([b"delete %s\n" % x + for x in refs_to_nuke])) + + # Delete or update and add replace_refs; note that fast-export automatically + # handles 'update-no-add', we only need to take action for the other four + # choices for replace_refs. 
+ self._flush_renames() + actual_renames = {k:v for k,v in self._commit_renames.items() if k != v} + if self._args.replace_refs in ['delete-no-add', 'delete-and-add']: + # Delete old replace refs, if unwanted + replace_refs_to_nuke = set(replace_refs) + if self._args.replace_refs == 'delete-and-add': + # git-update-ref won't allow us to update a ref twice, so be careful + # to avoid deleting refs we'll later update + replace_refs_to_nuke = replace_refs_to_nuke.difference( + [b'refs/replace/'+x for x in actual_renames]) + p.stdin.write(b''.join([b"delete %s\n" % x + for x in replace_refs_to_nuke])) + if self._args.replace_refs in ['delete-and-add', 'update-or-add', + 'update-and-add']: + # Add new replace refs + update_only = (self._args.replace_refs == 'update-or-add') + p.stdin.write(b''.join([b"update refs/replace/%s %s\n" % (old, new) + for old,new in actual_renames.items() + if new and not (update_only and + old in reverse_replace_refs)])) + + # Complete the update-ref process + p.stdin.close() + if p.wait(): + raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover + + def _remap_to(self, oldish_hash): + ''' + Given an oldish_hash (from the beginning of the current run), return: + IF oldish_hash is NOT pruned: + the hash of the rewrite of oldish_hash + otherwise: + the hash of the rewrite of the first unpruned ancestor of oldish_hash + ''' + old_id = self._orig_graph._hash_to_id[oldish_hash] + new_id = _IDS.translate(old_id) + new_hash = self._graph.git_hash[new_id] if new_id else deleted_hash + return new_hash + + def _compute_metadata(self, metadata_dir, orig_refs): + # + # First, handle commit_renames + # + old_commit_renames = dict() + if not self._already_ran: + commit_renames = {old: new + for old, new in self._commit_renames.items() + } + else: + # Read commit-map into old_commit_renames + with open(os.path.join(metadata_dir, b'commit-map'), 'br') as f: + f.readline() # Skip the header line + for line in f: + (old,new) = line.split() + old_commit_renames[old] = new + # Use A->B mappings in old_commit_renames, and B->C mappings in + # self._commit_renames to yield A->C mappings in commit_renames + commit_renames = {old: self._commit_renames.get(newish, newish) + for old, newish in old_commit_renames.items()} + # If there are any B->C mappings in self._commit_renames for which + # there was no A->B mapping in old_commit_renames, then add the + # B->C mapping to commit_renames too. + seen = set(old_commit_renames.values()) + commit_renames.update({old: new + for old, new in self._commit_renames.items() + if old not in seen}) + + # + # Second, handle ref_maps + # + exported_refs, imported_refs = self.get_exported_and_imported_refs() + + old_commit_unrenames = dict() + if not self._already_ran: + old_ref_map = dict((refname, (old_hash, deleted_hash)) + for refname, old_hash in orig_refs.items() + if refname in exported_refs) + else: + # old_commit_renames talk about how commits were renamed in the original + # run. Let's reverse it to find out how to get from the intermediate + # commit name, back to the original. Because everything in orig_refs + # right now refers to the intermediate commits after the first run(s), + # and we need to map them back to what they were before any changes. 
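+      # (e.g. if an earlier run renamed commit A to B, old_commit_unrenames
+      # maps B back to A)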
+ old_commit_unrenames = dict((v,k) for (k,v) in old_commit_renames.items()) + + old_ref_map = {} + # Populate old_ref_map from the 'ref-map' file + with open(os.path.join(metadata_dir, b'ref-map'), 'br') as f: + f.readline() # Skip the header line + for line in f: + (old,intermediate,ref) = line.split() + old_ref_map[ref] = (old, intermediate) + # Append to old_ref_map items from orig_refs that were exported, but + # get the actual original commit name + for refname, old_hash in orig_refs.items(): + if refname in old_ref_map: + continue + if refname not in exported_refs: + continue + # Compute older_hash + original_hash = old_commit_unrenames.get(old_hash, old_hash) + old_ref_map[refname] = (original_hash, deleted_hash) + + new_refs = {} + new_refs_initialized = False + ref_maps = {} + self._orig_graph._ensure_reverse_maps_populated() + for refname, pair in old_ref_map.items(): + old_hash, hash_ref_becomes_if_not_imported_in_this_run = pair + if refname not in imported_refs: + new_hash = hash_ref_becomes_if_not_imported_in_this_run + elif old_hash in commit_renames: + intermediate = old_commit_renames.get(old_hash,old_hash) + if intermediate in self._commit_renames: + new_hash = self._remap_to(intermediate) + else: + new_hash = intermediate + else: # Must be either an annotated tag, or a ref whose tip was pruned + if not new_refs_initialized: + target_working_dir = self._args.target or b'.' + new_refs = GitUtils.get_refs(target_working_dir) + new_refs_initialized = True + if refname in new_refs: + new_hash = new_refs[refname] + else: + new_hash = deleted_hash + ref_maps[refname] = (old_hash, new_hash) + if self._args.source or self._args.target: + if not new_refs_initialized: + target_working_dir = self._args.target or b'.' + new_refs = GitUtils.get_refs(target_working_dir) + new_refs_initialized = True + for ref, new_hash in new_refs.items(): + if ref not in orig_refs and not ref.startswith(b'refs/replace/'): + old_hash = b'0'*len(new_hash) + ref_maps[ref] = (old_hash, new_hash) + + # + # Third, handle first_changes + # + + old_first_changes = dict() + if self._already_ran: + # Read first_changes into old_first_changes + with open(os.path.join(metadata_dir, b'first-changed-commits'), 'br') as f: + for line in f: + changed_commit, undeleted_self_or_ancestor = line.strip().split() + old_first_changes[changed_commit] = undeleted_self_or_ancestor + # We need to find the commits that were modified whose parents were not. + # To be able to find parents, we need the commit names as of the beginning + # of this run, and then when we are done, we need to map them back to the + # name of the commits from before any git-filter-repo runs. + # + # We are excluding here any commits deleted in previous git-filter-repo + # runs + undo_old_commit_renames = dict((v,k) for (k,v) in old_commit_renames.items() + if v != deleted_hash) + # Get a list of all commits that were changed, as of the beginning of + # this latest run. 
+ changed_commits = {new + for (old,new) in old_commit_renames.items() + if old != new and new != deleted_hash} | \ + {old + for (old,new) in self._commit_renames.items() + if old != new} + special_changed_commits = {old + for (old,new) in old_commit_renames.items() + if new == deleted_hash} + first_changes = dict() + for (old,new) in self._commit_renames.items(): + if old == new: + # old wasn't modified, can't be first change if not even a change + continue + if old_commit_unrenames.get(old,old) != old: + # old was already modified in previous run; while it might represent + # something that is still a first change, we'll handle that as we + # loop over old_first_changes below + continue + if any(parent in changed_commits + for parent in self._orig_graph.get_parent_hashes(old)): + # a parent of old was modified, so old is not a first change + continue + # At this point, old IS a first change. We need to find out what new + # commit it maps to, or if it doesn't map to one, what new commit was + # its most recent ancestor that wasn't pruned. + if new is None: + new = self._remap_to(old) + first_changes[old] = (new if new is not None else deleted_hash) + for (old,undeleted_self_or_ancestor) in old_first_changes.items(): + if undeleted_self_or_ancestor == deleted_hash: + # old represents a commit that was pruned and whose entire ancestry + # was pruned. So, old is still a first change + first_changes[old] = undeleted_self_or_ancestor + continue + intermediate = old_commit_renames.get(old, old) + usoa = undeleted_self_or_ancestor + new_ancestor = self._commit_renames.get(usoa, usoa) + if intermediate == deleted_hash: + # old was pruned in previous rewrite + if usoa != new_ancestor: + # old's ancestor got rewritten in this filtering run; we can drop + # this one from first_changes. + continue + # Getting here means old was a first change and old was pruned in a + # previous run, and its ancestors that survived were non rewritten in + # this run, so old remains a first change + first_changes[old] = new_ancestor # or usoa, since new_ancestor == usoa + continue + assert(usoa == intermediate) # old wasn't pruned => usoa == intermediate + + # Check whether parents of intermediate were rewritten. Note that + # intermediate in self._commit_renames only means that intermediate was + # processed by the latest filtering (not necessarily that it changed), + # but we need to know that before we can check for parent hashes having + # changed. + if intermediate not in self._commit_renames: + # This commit was not processed by this run, so it remains a first + # change + first_changes[old] = usoa + continue + if any(parent in changed_commits + for parent in self._orig_graph.get_parent_hashes(intermediate)): + # An ancestor was modified by this run, so it is no longer a first + # change; continue to the next one. + continue + # This change is a first_change; find the new commit its usoa maps to + new = self._remap_to(intermediate) + assert(new is not None) + first_changes[old] = new + + return commit_renames, ref_maps, first_changes + + def _handle_lfs_metadata(self, metadata_dir): + if self._lfs_object_tracker is None: + print("NOTE: LFS object orphaning not checked (LFS not in use)") + return + + if self._args.partial: + target_working_dir = self._args.target or b'.' 
+ source = False + self._lfs_object_tracker.find_all_lfs_objects_in_repo(target_working_dir, + source) + + with open(os.path.join(metadata_dir, b'original_lfs_objects'), 'bw') as f: + for obj in sorted(self._lfs_object_tracker.source_objects.objects): + f.write(obj+b"\n") + + orphaned_lfs_path = os.path.join(metadata_dir, b'orphaned_lfs_objects') + msg = textwrap.dedent(_(f"""\ + NOTE: There were LFS Objects Orphaned by this rewrite recorded in + {decode(orphaned_lfs_path)}.""")) + with open(orphaned_lfs_path, 'bw') as f: + differences = self._lfs_object_tracker.source_objects.objects - \ + self._lfs_object_tracker.target_objects.objects + for obj in sorted(differences): + f.write(obj+b"\n") + if differences: + self._lfs_object_tracker.objects_orphaned = True + print(msg) + + def _record_metadata(self, metadata_dir, orig_refs): + self._flush_renames() + commit_renames, ref_maps, first_changes = \ + self._compute_metadata(metadata_dir, orig_refs) + + if self._args.sensitive_data_removal: + changed_commits = sum(k!=v for (k,v) in commit_renames.items()) + print(f"You rewrote {changed_commits} (of {len(commit_renames)}) commits.") + print("") # Add a blank line before important rewrite information + print(f"NOTE: First Changed Commit(s) is/are:\n " + + decode(b"\n ".join(x for x in first_changes))) + + with open(os.path.join(metadata_dir, b'sensitive_data_removal'), 'bw') as f: + pass # Write nothing; we only need the file created + + self._handle_lfs_metadata(metadata_dir) + print("") # Add a blank line after important rewrite information + + with open(os.path.join(metadata_dir, b'commit-map'), 'bw') as f: + f.write(("%-40s %s\n" % (_("old"), _("new"))).encode()) + for (old,new) in sorted(commit_renames.items()): + msg = b'%s %s\n' % (old, new if new != None else deleted_hash) + f.write(msg) + + with open(os.path.join(metadata_dir, b'ref-map'), 'bw') as f: + f.write(("%-40s %-40s %s\n" % (_("old"), _("new"), _("ref"))).encode()) + for refname, hash_pair in sorted(ref_maps.items()): + (old_hash, new_hash) = hash_pair + f.write(b'%s %s %s\n' % (old_hash, new_hash, refname)) + if old_hash != new_hash: + self._changed_refs.add(refname) + + with open(os.path.join(metadata_dir, b'changed-refs'), 'bw') as f: + for refname in sorted(self._changed_refs): + f.write(b'%s\n' % refname) + + with open(os.path.join(metadata_dir, b'first-changed-commits'), 'bw') as f: + for commit, undeleted_self_or_ancestor in sorted(first_changes.items()): + f.write(b'%s %s\n' % (commit, undeleted_self_or_ancestor)) + + with open(os.path.join(metadata_dir, b'suboptimal-issues'), 'bw') as f: + issues_found = False + if self._commits_no_longer_merges: + issues_found = True + + f.write(textwrap.dedent(_(''' + The following commits used to be merge commits but due to filtering + are now regular commits; they likely have suboptimal commit messages + (e.g. "Merge branch next into master"). Original commit hash on the + left, commit hash after filtering/rewriting on the right: + ''')[1:]).encode()) + for oldhash, newhash in self._commits_no_longer_merges: + f.write(' {} {}\n'.format(oldhash, newhash).encode()) + f.write(b'\n') + + if self._commits_referenced_but_removed: + issues_found = True + f.write(textwrap.dedent(_(''' + The following commits were filtered out, but referenced in another + commit message. 
The reference to the now-nonexistent commit hash + (or a substring thereof) was left as-is in any commit messages: + ''')[1:]).encode()) + for bad_commit_reference in self._commits_referenced_but_removed: + f.write(' {}\n'.format(bad_commit_reference).encode()) + f.write(b'\n') + + if not issues_found: + f.write(_("No filtering problems encountered.\n").encode()) + + with open(os.path.join(metadata_dir, b'already_ran'), 'bw') as f: + f.write(_("This file exists to allow you to filter again without --force,\n" + "and to specify that metadata files should be updated instead\n" + "of rewritten").encode()) + + def finish(self): + ''' Alternative to run() when there is no input of our own to parse, + meaning that run only really needs to close the handle to fast-import + and let it finish, thus making a call to "run" feel like a misnomer. ''' + assert not self._input + assert self._managed_output + self.run() + + def insert(self, obj, direct_insertion = False): + if not direct_insertion: + if type(obj) == Blob: + self._tweak_blob(obj) + elif type(obj) == Commit: + aux_info = {'orig_parents': obj.parents, + 'had_file_changes': bool(obj.file_changes)} + self._tweak_commit(obj, aux_info) + elif type(obj) == Reset: + self._tweak_reset(obj) + elif type(obj) == Tag: + self._tweak_tag(obj) + self._insert_into_stream(obj) + + def _insert_into_stream(self, obj): + if not obj.dumped: + if self._lfs_object_tracker: + self._lfs_object_tracker.check_output_object(obj) + if self._parser: + self._parser.insert(obj) + else: + obj.dump(self._output) + + def get_exported_and_imported_refs(self): + return self._parser.get_exported_and_imported_refs() + + def run(self): + start = time.time() + if not self._input and not self._output: + self._run_sanity_checks() + if not self._args.dry_run and not self._args.partial: + self._read_stash() + self._migrate_origin_to_heads() + self._setup_input(use_done_feature = True) + self._setup_output() + assert self._sanity_checks_handled + + if self._input: + # Create and run the filter + self._repo_working_dir = self._args.source or b'.' + self._parser = FastExportParser(blob_callback = self._tweak_blob, + commit_callback = self._tweak_commit, + tag_callback = self._tweak_tag, + reset_callback = self._tweak_reset, + done_callback = self._final_commands) + self._setup_lfs_orphaning_checks() + self._parser.run(self._input, self._output) + if not self._finalize_handled: + self._final_commands() + + # Make sure fast-export completed successfully + if not self._args.stdin and self._fep.wait(): + raise SystemExit(_("Error: fast-export failed; see above.")) # pragma: no cover + self._input.close() + + # If we're not the manager of self._output, we should avoid post-run cleanup + if not self._managed_output: + return + + # Close the output and ensure fast-import successfully completes + self._output.close() + if not self._args.dry_run and self._fip.wait(): + raise SystemExit(_("Error: fast-import failed; see above.")) # pragma: no cover + + # With fast-export and fast-import complete, update state if requested + if self._args.state_branch: + self._save_marks_files() + + # Notify user how long it took, before doing a gc and such + msg = "New history written in {:.2f} seconds..." + if self._args.repack: + msg = "New history written in {:.2f} seconds; now repacking/cleaning..." 
+ print(msg.format(time.time()-start)) + + # Exit early, if requested + if self._args.dry_run: + print(_("NOTE: Not running fast-import or cleaning up; --dry-run passed.")) + if self._fe_orig: + print(_(" Requested filtering can be seen by comparing:")) + print(" " + decode(self._fe_orig)) + else: + print(_(" Requested filtering can be seen at:")) + print(" " + decode(self._fe_filt)) + return + + target_working_dir = self._args.target or b'.' + if self._input: + self._ref_update(target_working_dir) + + # Write out data about run + self._record_metadata(self.results_tmp_dir(), self._orig_refs) + + # Final cleanup: + # If we need a repack, then nuke the reflogs and repack. + # If we need a reset, do a reset --hard + reset = not GitUtils.is_repository_bare(target_working_dir) + self.cleanup(target_working_dir, self._args.repack, reset, + run_quietly=self._args.quiet, + show_debuginfo=self._args.debug) + + # Let user know how long it took + print(_("Completely finished after {:.2f} seconds.") + .format(time.time()-start)) + + # Give post-rewrite instructions for cleaning up other copies for SDR + if self._args.sensitive_data_removal: + lfs_note = "" + if self._lfs_object_tracker and \ + self._lfs_object_tracker.objects_orphaned == True: + lfs_note = _(" and LFS Objects Orphaned") + push_command = "git push --force --mirror origin" + if self._args.no_fetch: + if self._args.partial: + push_command = "git push --force origin " + \ + " ".join(sorted([decode(x) for x in self._changed_refs])) + else: + push_command = "git push --all --tags origin" + print("") + print(sdr_next_steps % (push_command, lfs_note, lfs_note)) + +def main(): + setup_gettext() + args = FilteringOptions.parse_args(sys.argv[1:]) + if args.analyze: + RepoAnalyze.run(args) + else: + filter = RepoFilter(args) + filter.run() + +if __name__ == '__main__': + main()
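
Note on the entry point above: main() only parses the command-line arguments into a FilteringOptions object and hands them to RepoFilter, so the same two calls can be driven from a short script. A minimal sketch, assuming the file has been made importable as git_filter_repo (for example by copying or symlinking it to git_filter_repo.py somewhere on the Python path); the option values shown ("--path src", "--force") are purely illustrative:

    #!/usr/bin/env python3
    # Minimal sketch of driving the filter programmatically, mirroring main().
    # Assumes the script is importable as git_filter_repo; the options below
    # are illustrative, not a recommendation for this repository.
    import git_filter_repo as fr

    # Build the same FilteringOptions the CLI would produce for:
    #   git filter-repo --path src --force
    args = fr.FilteringOptions.parse_args(['--path', 'src', '--force'])

    # RepoFilter runs the fast-export | filter | fast-import pipeline,
    # then performs the ref updates, metadata recording, and cleanup
    # shown in run() above.
    repo_filter = fr.RepoFilter(args)
    repo_filter.run()

As in main(), run() takes care of sanity checks, the import/export processes, and the final repack/cleanup, so no further calls are needed after it returns.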