11 "github.com/git-lfs/git-lfs/errors"
12 "github.com/git-lfs/git-lfs/filepathfilter"
13 "github.com/git-lfs/git-lfs/git"
14 "github.com/git-lfs/git-lfs/tasklog"
15 "github.com/git-lfs/gitobj"
18 // Rewriter allows rewriting topologically equivalent Git histories
19 // between two revisions.
20 type Rewriter struct {
21 // mu guards entries and commits (see below)
23 // entries is a mapping of old tree entries to new (rewritten) ones.
24 // Since TreeEntry contains a []byte (and is therefore not a key-able
25 // type), a unique TreeEntry -> string function is used for map keys.
26 entries map[string]*gitobj.TreeEntry
27 // commits is a mapping of old commit SHAs to new ones, where the ASCII
28 // hex encodings of the SHA1 values are used as map keys.
29 commits map[string][]byte
30 // filter is an optional value used to specify which tree entries
31 // (blobs, subtrees) are modifiable given a BlobFn. If non-nil, this
32 // filter will cull out any unmodifiable subtrees and blobs.
33 filter *filepathfilter.Filter
34 // db is the *ObjectDatabase from which blobs, commits, and trees are
36 db *gitobj.ObjectDatabase
37 // l is the *tasklog.Logger to which updates are written.
41 // RewriteOptions is an options type given to the Rewrite() function.
42 type RewriteOptions struct {
43 // Include is the list of refs of which commits reachable by that ref
46 // Exclude is the list of refs of which commits reachable by that ref
50 // UpdateRefs specifies whether the Rewriter should move refs from the
51 // original graph onto the migrated one. If true, the refs will be
52 // moved, and a reflog entry will be created.
55 // Verbose mode prints migrated objects.
58 // ObjectMapFilePath is the path to the map of old sha1 to new sha1
60 ObjectMapFilePath string
62 // BlobFn specifies a function to rewrite blobs.
64 // It is called once per unique, unchanged path. That is to say, if
65 // /a/foo and /a/bar contain identical contents, the BlobFn will be
66 // called twice: once for /a/foo and once for /a/bar, but no more on
67 // each blob for subsequent revisions, so long as each entry remains
70 // TreePreCallbackFn specifies a function to be called before opening a
71 // tree for rewriting. It will be called on all trees throughout history
72 // in topological ordering through the tree, starting at the root.
73 TreePreCallbackFn TreePreCallbackFn
74 // TreeCallbackFn specifies a function to rewrite trees after they have
75 // been reassembled by calling the above BlobFn on all existing tree
77 TreeCallbackFn TreeCallbackFn
80 // blobFn returns a useable BlobRewriteFn, either the one that was given in the
81 // *RewriteOptions, or a noopBlobFn.
82 func (r *RewriteOptions) blobFn() BlobRewriteFn {
89 // treePreFn returns a useable TreePreCallbackFn, either the one that was given
90 // in the *RewriteOptions, or a noopTreePreFn.
91 func (r *RewriteOptions) treePreFn() TreePreCallbackFn {
92 if r.TreePreCallbackFn == nil {
95 return r.TreePreCallbackFn
98 // treeFn returns a useable TreeCallbackFn, either the one that was given in the
99 // *RewriteOptions, or a noopTreeFn.
100 func (r *RewriteOptions) treeFn() TreeCallbackFn {
101 if r.TreeCallbackFn == nil {
104 return r.TreeCallbackFn
107 // BlobRewriteFn is a mapping function that takes a given blob and returns a
108 // new, modified blob. If it returns an error, the new blob will not be written
109 // and instead the error will be returned from the Rewrite() function.
111 // Invocations of an instance of BlobRewriteFn are not expected to store the
112 // returned blobs in the *git/gitobj.ObjectDatabase.
114 // The path argument is given to be an absolute path to the tree entry being
115 // rewritten, where the repository root is the root of the path given. For
116 // instance, a file "b.txt" in directory "dir" would be given as "/dir/b.txt",
117 // where as a file "a.txt" in the root would be given as "/a.txt".
119 // As above, the path separators are OS specific, and equivalent to the result
120 // of filepath.Join(...) or os.PathSeparator.
121 type BlobRewriteFn func(path string, b *gitobj.Blob) (*gitobj.Blob, error)
123 // TreePreCallbackFn specifies a function to call upon opening a new tree for
126 // Unlike its sibling TreeCallbackFn, TreePreCallbackFn may not modify the given
129 // TreePreCallbackFn can be nil, and will therefore exhibit behavior equivalent
130 // to only calling the BlobFn on existing tree entries.
132 // If the TreePreCallbackFn returns an error, it will be returned from the
133 // Rewrite() invocation.
134 type TreePreCallbackFn func(path string, t *gitobj.Tree) error
136 // TreeCallbackFn specifies a function to call before writing a re-written tree
137 // to the object database. The TreeCallbackFn can return a modified tree to be
138 // written to the object database instead of one generated from calling BlobFn
139 // on all of the tree entries.
142 // TreeCallbackFn can be nil, and will therefore exhibit behavior equivalent to
143 // only calling the BlobFn on existing tree entries.
145 // If the TreeCallbackFn returns an error, it will be returned from the
146 // Rewrite() invocation.
147 type TreeCallbackFn func(path string, t *gitobj.Tree) (*gitobj.Tree, error)
149 type rewriterOption func(*Rewriter)
152 // WithFilter is an optional argument given to the NewRewriter
153 // constructor function to limit invocations of the BlobRewriteFn to
154 // only pathspecs that match the given *filepathfilter.Filter.
155 WithFilter = func(filter *filepathfilter.Filter) rewriterOption {
156 return func(r *Rewriter) {
161 // WithLoggerTo logs updates caused by the *git/githistory.Rewriter to
162 // the given io.Writer "sink".
163 WithLoggerTo = func(sink io.Writer) rewriterOption {
164 return WithLogger(tasklog.NewLogger(sink))
167 // WithLogger causes updates from the *git/githistory.Rewriter to be written
168 // to the provided logger, "l".
169 WithLogger = func(l *tasklog.Logger) rewriterOption {
170 return func(r *Rewriter) {
175 // noopBlobFn is a no-op implementation of the BlobRewriteFn. It returns
176 // the blob that it was given, and returns no error.
177 noopBlobFn = func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { return b, nil }
178 // noopTreePreFn is a no-op implementation of the TreePreCallbackFn. It
179 // ignores the tree that it was given, and returns no error.
180 noopTreePreFn = func(path string, t *gitobj.Tree) error { return nil }
181 // noopTreeFn is a no-op implementation of the TreeCallbackFn. It returns
182 // the tree that it was given, and returns no error.
183 noopTreeFn = func(path string, t *gitobj.Tree) (*gitobj.Tree, error) { return t, nil }
186 // NewRewriter constructs a *Rewriter from the given *ObjectDatabase instance.
187 func NewRewriter(db *gitobj.ObjectDatabase, opts ...rewriterOption) *Rewriter {
188 rewriter := &Rewriter{
190 entries: make(map[string]*gitobj.TreeEntry),
191 commits: make(map[string][]byte),
196 for _, opt := range opts {
202 // Rewrite rewrites the range of commits given by *RewriteOptions.{Include,Exclude}
203 // using the BlobRewriteFn to rewrite the individual blobs.
204 func (r *Rewriter) Rewrite(opt *RewriteOptions) ([]byte, error) {
205 // First, obtain a list of commits to rewrite.
206 commits, err := r.commitsToMigrate(opt)
211 var perc *tasklog.PercentageTask
213 perc = r.l.Percentage("migrate: Rewriting commits", uint64(len(commits)))
215 perc = r.l.Percentage("migrate: Examining commits", uint64(len(commits)))
218 var vPerc *tasklog.PercentageTask
223 var objectMapFile *os.File
224 if len(opt.ObjectMapFilePath) > 0 {
225 objectMapFile, err = os.OpenFile(opt.ObjectMapFilePath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666)
227 return nil, fmt.Errorf("Could not create object map file: %v", err)
229 defer objectMapFile.Close()
232 // Keep track of the last commit that we rewrote. Callers often want
233 // this so that they can perform a git-update-ref(1).
235 for _, oid := range commits {
236 // Load the original commit to access the data necessary in
237 // order to rewrite it.
238 original, err := r.db.Commit(oid)
243 // Rewrite the tree given at that commit.
244 rewrittenTree, err := r.rewriteTree(oid, original.TreeID, "", opt.blobFn(), opt.treePreFn(), opt.treeFn(), vPerc)
249 // Create a new list of parents from the original commit to
250 // point at the rewritten parents in order to create a
251 // topologically equivalent DAG.
253 // This operation is safe since we are visiting the commits in
254 // reverse topological order and therefore have seen all parents
255 // before children (in other words, r.uncacheCommit(...) will
256 // always return a value, if the prospective parent is a part of
258 rewrittenParents := make([][]byte, 0, len(original.ParentIDs))
259 for _, originalParent := range original.ParentIDs {
260 rewrittenParent, ok := r.uncacheCommit(originalParent)
262 // If we haven't seen the parent before, this
263 // means that we're doing a partial migration
264 // and the parent that we're looking for isn't
267 // Use the original parent to properly link
268 // history across the migration boundary.
269 rewrittenParent = originalParent
272 rewrittenParents = append(rewrittenParents, rewrittenParent)
275 // Construct a new commit using the original header information,
276 // but the rewritten set of parents as well as root tree.
277 rewrittenCommit := &gitobj.Commit{
278 Author: original.Author,
279 Committer: original.Committer,
280 ExtraHeaders: original.ExtraHeaders,
281 Message: original.Message,
283 ParentIDs: rewrittenParents,
284 TreeID: rewrittenTree,
289 if original.Equal(rewrittenCommit) {
290 newSha = make([]byte, len(oid))
293 newSha, err = r.db.WriteCommit(rewrittenCommit)
297 if objectMapFile != nil {
298 if _, err := fmt.Fprintf(objectMapFile, "%x,%x\n", oid, newSha); err != nil {
304 // Cache that commit so that we can reassign children of this
306 r.cacheCommit(oid, newSha)
308 // Increment the percentage displayed in the terminal.
311 // Move the tip forward.
316 refs, err := r.refsToMigrate()
318 return nil, errors.Wrap(err, "could not find refs to update")
321 root, _ := r.db.Root()
323 updater := &refUpdater{
324 CacheFn: r.uncacheCommit,
332 if err := updater.UpdateRefs(); err != nil {
333 return nil, errors.Wrap(err, "could not update refs")
340 // rewriteTree is a recursive function which rewrites a tree given by the ID
341 // "sha" and path "path". It uses the given BlobRewriteFn to rewrite all blobs
342 // within the tree, either calling that function or recurring down into subtrees
343 // by re-assigning the SHA.
345 // Once it is done assembling the entries in a given subtree, it then calls the
346 // TreeCallbackFn, "tfn" to perform a final traversal of the subtree before
347 // saving it to the object database.
349 // It returns the new SHA of the rewritten tree, or an error if the tree was
350 // unable to be rewritten.
351 func (r *Rewriter) rewriteTree(commitOID []byte, treeOID []byte, path string,
352 fn BlobRewriteFn, tpfn TreePreCallbackFn, tfn TreeCallbackFn,
353 perc *tasklog.PercentageTask) ([]byte, error) {
355 tree, err := r.db.Tree(treeOID)
360 if err := tpfn("/"+path, tree); err != nil {
364 entries := make([]*gitobj.TreeEntry, 0, len(tree.Entries))
365 for _, entry := range tree.Entries {
368 fullpath = strings.Join([]string{path, entry.Name}, "/")
370 fullpath = entry.Name
373 if !r.allows(entry.Type(), fullpath) {
374 entries = append(entries, copyEntry(entry))
378 // If this is a symlink, skip it
379 if entry.Filemode == 0120000 {
380 entries = append(entries, copyEntry(entry))
384 if cached := r.uncacheEntry(entry); cached != nil {
385 entries = append(entries, copyEntry(cached))
391 switch entry.Type() {
392 case gitobj.BlobObjectType:
393 oid, err = r.rewriteBlob(commitOID, entry.Oid, fullpath, fn, perc)
394 case gitobj.TreeObjectType:
395 oid, err = r.rewriteTree(commitOID, entry.Oid, fullpath, fn, tpfn, tfn, perc)
404 entries = append(entries, r.cacheEntry(entry, &gitobj.TreeEntry{
405 Filemode: entry.Filemode,
411 rewritten, err := tfn("/"+path, &gitobj.Tree{Entries: entries})
416 if tree.Equal(rewritten) {
419 return r.db.WriteTree(rewritten)
422 func copyEntry(e *gitobj.TreeEntry) *gitobj.TreeEntry {
427 oid := make([]byte, len(e.Oid))
430 return &gitobj.TreeEntry{
431 Filemode: e.Filemode,
437 func (r *Rewriter) allows(typ gitobj.ObjectType, abs string) bool {
439 case gitobj.BlobObjectType:
440 return r.Filter().Allows(strings.TrimPrefix(abs, "/"))
441 case gitobj.CommitObjectType, gitobj.TreeObjectType:
444 panic(fmt.Sprintf("git/githistory: unknown entry type: %s", typ))
448 // rewriteBlob calls the given BlobRewriteFn "fn" on a blob given in the object
449 // database by the SHA1 "from" []byte. It writes and returns the new blob SHA,
450 // or an error if either the BlobRewriteFn returned one, or if the object could
451 // not be loaded/saved.
452 func (r *Rewriter) rewriteBlob(commitOID, from []byte, path string, fn BlobRewriteFn, perc *tasklog.PercentageTask) ([]byte, error) {
453 blob, err := r.db.Blob(from)
458 b, err := fn(path, blob)
464 sha, err := r.db.WriteBlob(b)
469 // Close the source blob, so long as it is not equal to the
470 // rewritten blob. If the two are equal, as in the check above
471 // this comment, calling r.db.WriteBlob(b) will have already
472 // closed both "b" and "blob" since they are the same.
474 // Closing an *os.File twice causes an `os.ErrInvalid` to be
476 if err = blob.Close(); err != nil {
481 perc.Entry(fmt.Sprintf("migrate: commit %s: %s", hex.EncodeToString(commitOID), path))
487 // Close the source blob, since it is identical to the rewritten blob,
488 // but neither were written.
489 if err := blob.Close(); err != nil {
495 // commitsToMigrate returns an in-memory copy of a list of commits according to
496 // the output of git-rev-list(1) (given the *RewriteOptions), where each
497 // outputted commit is 20 bytes of raw SHA1.
499 // If any error was encountered, it will be returned.
500 func (r *Rewriter) commitsToMigrate(opt *RewriteOptions) ([][]byte, error) {
501 waiter := r.l.Waiter("migrate: Sorting commits")
502 defer waiter.Complete()
504 scanner, err := git.NewRevListScanner(
505 opt.Include, opt.Exclude, r.scannerOpts())
512 commits = append(commits, scanner.OID())
515 if err = scanner.Err(); err != nil {
518 if err = scanner.Close(); err != nil {
524 // refsToMigrate returns a list of references to migrate, or an error if loading
525 // those references failed.
526 func (r *Rewriter) refsToMigrate() ([]*git.Ref, error) {
530 if root, ok := r.db.Root(); ok {
531 refs, err = git.AllRefsIn(root)
533 refs, err = git.AllRefs()
541 for _, ref := range refs {
542 if ref.Type == git.RefTypeRemoteBranch || ref.Type == git.RefTypeRemoteTag {
546 local = append(local, ref)
552 // scannerOpts returns a *git.ScanRefsOptions instance to be given to the
553 // *git.RevListScanner.
555 // If the database this *Rewriter is operating on has a root (not in memory),
556 // it re-assigns the working directory to be there.
557 func (r *Rewriter) scannerOpts() *git.ScanRefsOptions {
558 opts := &git.ScanRefsOptions{
559 Mode: git.ScanRefsMode,
560 Order: git.TopoRevListOrder,
564 SkippedRefs: make([]string, 0),
565 Mutex: new(sync.Mutex),
566 Names: make(map[string]string),
569 if root, ok := r.db.Root(); ok {
570 opts.WorkingDir = root
575 // Filter returns the filter used by this *Rewriter to filter subtrees, blobs
577 func (r *Rewriter) Filter() *filepathfilter.Filter {
581 // cacheEntry caches the given "from" entry so that it is always rewritten as
582 // a *TreeEntry equivalent to "to".
583 func (r *Rewriter) cacheEntry(from, to *gitobj.TreeEntry) *gitobj.TreeEntry {
587 r.entries[r.entryKey(from)] = to
592 // uncacheEntry returns a *TreeEntry that is cached from the given *TreeEntry
593 // "from". That is to say, it returns the *TreeEntry that "from" should be
594 // rewritten to, or nil if none could be found.
595 func (r *Rewriter) uncacheEntry(from *gitobj.TreeEntry) *gitobj.TreeEntry {
599 return r.entries[r.entryKey(from)]
602 // entryKey returns a unique key for a given *TreeEntry "e".
603 func (r *Rewriter) entryKey(e *gitobj.TreeEntry) string {
604 return fmt.Sprintf("%s:%x", e.Name, e.Oid)
607 // cacheCommit caches the given "from" commit SHA so that it is always
608 // rewritten as the commit SHA given by "to".
609 func (r *Rewriter) cacheCommit(from, to []byte) {
613 r.commits[hex.EncodeToString(from)] = to
616 // uncacheCommit returns the rewritten commit SHA that is cached for the given
617 // original commit SHA "from". That is to say, it returns the SHA that
618 // "from" should be rewritten to and true, or nil and false if none could be
620 func (r *Rewriter) uncacheCommit(from []byte) ([]byte, bool) {
624 c, ok := r.commits[hex.EncodeToString(from)]