From: Stefan Sperling Subject: Re: Landry's firefox repository To: Christian Weisgerber Cc: gameoftrees@openbsd.org Date: Tue, 26 Mar 2024 14:53:22 +0100 On Wed, Jan 31, 2024 at 07:27:10PM +0100, Christian Weisgerber wrote: > Stefan Sperling: > > > > That merge commit de0fe38 prevents rebasing "master" on "origin/master". > > > > Does "master" above mean commit 9cd4661 (HEAD -> master)? > > Yes. Below is a series of 3 patches which fix this problem by teaching the commit graph about topological sort order. You can now run 'got rebase -t' and will forward the reference instead of attempting a bogus rebase which causes needless merge conflicts. However, there is a cost. The overhead I mention in the log message is quite significant on large repositories: $ pwd /git/linux.git $ time got log -l5 -b -s -t 2024-03-13 master Merge tag 'printk-for-6.9' of git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux 2024-03-13 f88c3fb mm, slab: remove last vestiges of SLAB_MEM_SPREAD 2024-02-07 7412dc6 dump_stack: Do not get cpu_sync for panic CPU 2024-03-13 0ea680e Merge tag 'slab-for-6.9' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab 2024-02-07 d988d9a panic: Flush kernel log buffer at the end 1m32.27s real 1m03.10s user 0m30.97s system $ time got log -l5 -b -s 2024-03-13 master Merge tag 'printk-for-6.9' of git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux 2024-03-13 f88c3fb mm, slab: remove last vestiges of SLAB_MEM_SPREAD 2024-02-07 7412dc6 dump_stack: Do not get cpu_sync for panic CPU 2024-03-13 0ea680e Merge tag 'slab-for-6.9' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab 2024-02-07 d988d9a panic: Flush kernel log buffer at the end 0m01.03s real 0m00.72s user 0m00.31s system $ Anyway, having this feature available is better than not having it. And there are ways to improve performance going forward. The 3 diffs should all apply with 'got patch' in one go. ok? ----------------------------------------------- add support for topological sorting to the commit graph The algorithm implemented here is based on a description I read on github's blog. See code comments for details. diff 78e82c8a2a2cd0fed316492b18264f5d8727f961 849e12c1d2dd36d497ec579cefa88b1b2dc117c8 commit - 78e82c8a2a2cd0fed316492b18264f5d8727f961 commit + 849e12c1d2dd36d497ec579cefa88b1b2dc117c8 blob - 1a1732a2994a8428496e498b70ebb81e222507aa blob + a776f0d370166a89c45161d249a492830e104ced --- got/got.c +++ got/got.c @@ -4481,8 +4481,13 @@ print_commits(struct got_object_id *root_id, struct go err = got_commit_graph_open(&graph, path, !log_branches); if (err) return err; - err = got_commit_graph_iter_start(graph, root_id, repo, - check_cancelled, NULL); + if (log_branches) { + err = got_commit_graph_toposort(graph, root_id, repo, + check_cancelled, NULL); + } else { + err = got_commit_graph_iter_start(graph, root_id, repo, + check_cancelled, NULL); + } if (err) goto done; for (;;) { blob - 9f0a8aa211fe8d5dd0defcc6fe994cafada284c3 blob + 35d7c7072f5ce67648f7f2c4a878396746ad3e45 --- include/got_commit_graph.h +++ include/got_commit_graph.h @@ -23,6 +23,8 @@ void got_commit_graph_close(struct got_commit_graph *) const struct got_error *got_commit_graph_iter_start( struct got_commit_graph *, struct got_object_id *, struct got_repository *, got_cancel_cb, void *); +const struct got_error *got_commit_graph_toposort(struct got_commit_graph *, + struct got_object_id *, struct got_repository *, got_cancel_cb, void *); const struct got_error *got_commit_graph_iter_next(struct got_object_id *, struct got_commit_graph *, struct got_repository *, got_cancel_cb, void *); const struct got_error *got_commit_graph_intersect(struct got_object_id **, blob - 514ea5bf403b3a7ac375edcf405ab96bcd94cb20 blob + 976b20aab20831ca6c66bb5793542bf438ec1688 --- lib/commit_graph.c +++ lib/commit_graph.c @@ -38,10 +38,21 @@ #include "got_lib_inflate.h" #include "got_lib_object.h" #include "got_lib_object_idset.h" +#include "got_lib_object_qid.h" +#ifndef nitems +#define nitems(_a) (sizeof((_a)) / sizeof((_a)[0])) +#endif + struct got_commit_graph_node { struct got_object_id id; + /* Used for topological sorting. */ + struct got_commit_graph_node *parents[2]; + struct got_commit_graph_node **more_parents; + int nparents; + int indegree; + /* Used only during iteration. */ time_t timestamp; TAILQ_ENTRY(got_commit_graph_node) entry; @@ -61,6 +72,7 @@ struct got_commit_graph { int flags; #define GOT_COMMIT_GRAPH_FIRST_PARENT_TRAVERSAL 0x01 +#define GOT_COMMIT_GRAPH_TOPOSORT 0x02 /* * A set of object IDs of known parent commits which we have not yet @@ -88,7 +100,8 @@ struct got_commit_graph { /* * Nodes which will be passed to the API user next, sorted by - * commit timestmap. + * commit timestamp. Sorted in topological order only if topological + * sorting was requested. */ struct got_commit_graph_iter_list iter_list; }; @@ -181,7 +194,8 @@ add_node(struct got_commit_graph_node **new_node, return got_error_from_errno("calloc"); memcpy(&node->id, commit_id, sizeof(node->id)); - err = got_object_idset_add(graph->node_ids, &node->id, NULL); + node->nparents = -1; + err = got_object_idset_add(graph->node_ids, &node->id, node); if (err) free(node); else @@ -547,6 +561,7 @@ got_commit_graph_close(struct got_commit_graph *graph) while ((node = TAILQ_FIRST(&graph->iter_list))) { TAILQ_REMOVE(&graph->iter_list, node, entry); + free(node->more_parents); free(node); } @@ -575,6 +590,9 @@ got_commit_graph_iter_start(struct got_commit_graph *g const struct got_error *err = NULL; struct got_commit_graph_node *node; + /* First-parent traversal is implicitly topological. */ + graph->flags &= ~GOT_COMMIT_GRAPH_TOPOSORT; + /* Clear left-over state from previous iteration attempts. */ while ((node = TAILQ_FIRST(&graph->iter_list))) TAILQ_REMOVE(&graph->iter_list, node, entry); @@ -615,7 +633,8 @@ got_commit_graph_iter_next(struct got_object_id *id, got_cancel_cb cancel_cb, void *cancel_arg) { const struct got_error *err = NULL; - struct got_commit_graph_node *node; + struct got_commit_graph_node *node, *pnode; + int i; node = TAILQ_FIRST(&graph->iter_list); if (node == NULL) { @@ -623,17 +642,33 @@ got_commit_graph_iter_next(struct got_object_id *id, return got_error(GOT_ERR_ITER_COMPLETED); } - while (TAILQ_NEXT(node, entry) == NULL && - got_object_idset_num_elements(graph->open_branches) > 0) { - err = fetch_commits_from_open_branches(graph, repo, - cancel_cb, cancel_arg); - if (err) - return err; + if (graph->flags & GOT_COMMIT_GRAPH_TOPOSORT) { + /* At least one node with in-degree zero must exist. */ + while (node->indegree != 0) + node = TAILQ_NEXT(node, entry); + } else { + while (TAILQ_NEXT(node, entry) == NULL && + got_object_idset_num_elements(graph->open_branches) > 0) { + err = fetch_commits_from_open_branches(graph, repo, + cancel_cb, cancel_arg); + if (err) + return err; + } } memcpy(id, &node->id, sizeof(*id)); TAILQ_REMOVE(&graph->iter_list, node, entry); + if (graph->flags & GOT_COMMIT_GRAPH_TOPOSORT) { + /* When visiting a commit decrement in-degree of all parents. */ + for (i = 0; i < node->nparents; i++) { + if (i < nitems(node->parents)) + pnode = node->parents[i]; + else + pnode = node->more_parents[i]; + pnode->indegree--; + } + } free(node); return NULL; } @@ -749,3 +784,164 @@ done: got_commit_graph_close(graph2); return err; } + +/* + * Sort the graph for traversal in topological order. + * + * This implementation is based on the description of topological sorting + * of git commits by Derrick Stolee at + * https://github.blog/2022-08-30-gits-database-internals-ii-commit-history-queries/#topological-sorting + * which reads as follows: + * + * The basic algorithm for topological sorting is Kahn’s algorithm which + * follows two big steps: + * 1. Walk all reachable commits, counting the number of times a commit appears + * as a parent of another commit. Call these numbers the in-degree of the + * commit, referencing the number of incoming edges. + * 2. Walk the reachable commits, but only visit a commit if its in-degree + * value is zero. When visiting a commit, decrement the in-degree value of + * each parent. + * + * This algorithm works because at least one of our starting points will + * have in-degree zero, and then decrementing the in-degree value is similar + * to deleting the commit from the graph, always having at least one commit + * with in-degree zero. + */ +const struct got_error * +got_commit_graph_toposort(struct got_commit_graph *graph, + struct got_object_id *id, struct got_repository *repo, + got_cancel_cb cancel_cb, void *cancel_arg) +{ + const struct got_error *err = NULL; + struct got_commit_graph_node *node = NULL, *pnode = NULL; + struct got_commit_object *commit = NULL; + struct got_object_id_queue commits; + const struct got_object_id_queue *parent_ids; + struct got_object_qid *qid = NULL, *pid; + int i; + + STAILQ_INIT(&commits); + + if (graph->flags & GOT_COMMIT_GRAPH_FIRST_PARENT_TRAVERSAL) + return got_commit_graph_iter_start(graph, id, repo, + cancel_cb, cancel_arg); + + /* Clear left-over state from previous iteration attempts. */ + while ((node = TAILQ_FIRST(&graph->iter_list))) + TAILQ_REMOVE(&graph->iter_list, node, entry); + err = got_object_idset_for_each(graph->open_branches, + remove_branch_tip, graph->open_branches); + if (err) + return err; + + graph->flags |= GOT_COMMIT_GRAPH_TOPOSORT; + + /* + * Sorting the commit graph in topological order requires visiting + * every reachable commit. This is very expensive but there are + * ways to speed this up significantly in the future: + * 1) Run this loop in got-read-pack if possible. + * 2) Use Git's commit-graph file to compute the result incrementally. + * See the blog post linked above for details. + */ + err = got_object_qid_alloc_partial(&qid); + if (err) + return err; + memcpy(&qid->id, id, sizeof(qid->id)); + STAILQ_INSERT_TAIL(&commits, qid, entry); + while (!STAILQ_EMPTY(&commits)) { + if (cancel_cb) { + err = (*cancel_cb)(cancel_arg); + if (err) + break; + } + + qid = STAILQ_FIRST(&commits); + STAILQ_REMOVE_HEAD(&commits, entry); + err = got_object_open_as_commit(&commit, repo, &qid->id); + if (err) + break; + + node = got_object_idset_get(graph->node_ids, &qid->id); + if (node == NULL) { + err = add_node(&node, graph, id, repo); + if (err) + break; + TAILQ_INSERT_TAIL(&graph->iter_list, node, entry); + } + + got_object_qid_free(qid); + qid = NULL; + + if (node->timestamp != 0) /* already traversed once */ + continue; + + if (node->nparents == -1) { + node->nparents = got_object_commit_get_nparents(commit); + if (node->nparents > nitems(node->parents)) { + node->more_parents = calloc(node->nparents, + sizeof(*node->more_parents)); + if (node->more_parents == NULL) { + err = got_error_from_errno("calloc"); + break; + } + } + + } + + node->timestamp = got_object_commit_get_committer_time(commit); + parent_ids = got_object_commit_get_parent_ids(commit); + i = 0; + STAILQ_FOREACH(pid, parent_ids, entry) { + if (cancel_cb) { + err = (*cancel_cb)(cancel_arg); + if (err) + goto done; + } + + /* + * Increment the in-degree counter every time a given + * commit appears as the parent of another commit. + */ + pnode = got_object_idset_get(graph->node_ids, &pid->id); + if (pnode == NULL) { + err = add_node(&pnode, graph, &pid->id, repo); + if (err) + goto done; + TAILQ_INSERT_TAIL(&graph->iter_list, pnode, + entry); + } + pnode->indegree++; + + /* + * Cache parent pointers on the node to make future + * in-degree updates easier. + */ + if (node->nparents <= nitems(node->parents)) { + node->parents[i] = pnode; + } else { + node->more_parents[i] = pnode; + if (i < nitems(node->parents)) + node->parents[i] = pnode; + } + i++; + + /* Keep traversing through all parent commits. */ + err = got_object_qid_alloc_partial(&qid); + if (err) + goto done; + memcpy(&qid->id, &pid->id, sizeof(qid->id)); + STAILQ_INSERT_TAIL(&commits, qid, entry); + qid = NULL; + } + + got_object_commit_close(commit); + commit = NULL; + } +done: + if (commit) + got_object_commit_close(commit); + got_object_qid_free(qid); + got_object_id_queue_free(&commits); + return err; +} blob - 4a83ec7819b07c9e4a689fc50177a7b979abbbf9 blob + aebdf49c02bb9cd38df6eb3530904c6237c5fb9c --- regress/cmdline/log.sh +++ regress/cmdline/log.sh @@ -1241,6 +1241,68 @@ test_log_commit_keywords() { test_done "$testroot" "$ret" } +test_log_toposort() { + local testroot=`test_init log_toposort` + local commit0=`git_show_head $testroot/repo` + local author_time0=`git_show_author_time $testroot/repo` + + got checkout $testroot/repo $testroot/wt > /dev/null + ret=$? + if [ $ret -ne 0 ]; then + test_done "$testroot" "$ret" + return 1 + fi + + echo aaa > $testroot/wt/alpha + (cd $testroot/wt && got commit -m 'change alpha' >/dev/null) + local commit1=`git_show_head $testroot/repo` + local author_time1=`git_show_author_time $testroot/repo` + + got branch -r $testroot/repo -c $commit0 newbranch + (cd $testroot/wt && got update -b newbranch > /dev/null) + echo ddd > $testroot/wt/gamma/delta + (cd $testroot/wt && got commit -m 'change delta' >/dev/null) + local commit2=`git_show_branch_head $testroot/repo newbranch` + local author_time2=`git_show_author_time $testroot/repo newbranch` + + echo zzz > $testroot/wt/epsilon/zeta + (cd $testroot/wt && got commit -m 'change zeta' >/dev/null) + local commit3=`git_show_head $testroot/repo` + local author_time3=`git_show_author_time $testroot/repo newbranch` + + (cd $testroot/wt && got update -b master > /dev/null) + (cd $testroot/wt && got merge newbranch > /dev/null) + local merge_commit=`git_show_head $testroot/repo` + local merge_time=`git_show_author_time $testroot/repo` + + local short_commit0=`trim_obj_id 33 $commit0` + local short_commit1=`trim_obj_id 33 $commit1` + local short_commit2=`trim_obj_id 33 $commit2` + local short_commit3=`trim_obj_id 33 $commit3` + + d_0=`date -u -r $author_time0 +"%G-%m-%d"` + d_1=`date -u -r $author_time1 +"%G-%m-%d"` + d_2=`date -u -r $author_time2 +"%G-%m-%d"` + d_3=`date -u -r $author_time3 +"%G-%m-%d"` + d_m=`date -u -r $merge_time +"%G-%m-%d"` + + got log -r $testroot/repo -s -b > $testroot/stdout + cat > $testroot/stdout.expected <code == GOT_ERR_ANCESTRY) { error = got_error_msg(GOT_ERR_ANCESTRY, @@ -13510,7 +13513,7 @@ cmd_merge(int argc, char *argv[]) } error = got_commit_graph_find_youngest_common_ancestor(&yca_id, - wt_branch_tip, branch_tip, 0, repo, + wt_branch_tip, branch_tip, 0, 0, repo, check_cancelled, NULL); if (error && error->code != GOT_ERR_ANCESTRY) goto done; blob - 35d7c7072f5ce67648f7f2c4a878396746ad3e45 blob + a425d8d817ab860cb50f4d849945ba8fe207c0fd --- include/got_commit_graph.h +++ include/got_commit_graph.h @@ -34,4 +34,4 @@ const struct got_error *got_commit_graph_intersect(str /* Find the youngest common ancestor of two commits. */ const struct got_error *got_commit_graph_find_youngest_common_ancestor( struct got_object_id **, struct got_object_id *, struct got_object_id *, - int, struct got_repository *, got_cancel_cb, void *); + int, int, struct got_repository *, got_cancel_cb, void *); blob - 976b20aab20831ca6c66bb5793542bf438ec1688 blob + 3748c660e7ad27e0b645825b852620a1a041ed71 --- lib/commit_graph.c +++ lib/commit_graph.c @@ -702,12 +702,14 @@ find_yca_add_id(struct got_object_id **yca_id, struct * common ancestors. * * If first_parent_traversal is nonzero, only linear history is considered. + * If toposort is set then sort commits in topological order before + * traversing them. */ const struct got_error * got_commit_graph_find_youngest_common_ancestor(struct got_object_id **yca_id, struct got_object_id *commit_id, struct got_object_id *commit_id2, - int first_parent_traversal, - struct got_repository *repo, got_cancel_cb cancel_cb, void *cancel_arg) + int first_parent_traversal, int toposort, struct got_repository *repo, + got_cancel_cb cancel_cb, void *cancel_arg) { const struct got_error *err = NULL; struct got_commit_graph *graph = NULL, *graph2 = NULL; @@ -728,16 +730,28 @@ got_commit_graph_find_youngest_common_ancestor(struct if (err) goto done; - err = got_commit_graph_iter_start(graph, commit_id, repo, - cancel_cb, cancel_arg); - if (err) - goto done; + if (toposort) { + err = got_commit_graph_toposort(graph, commit_id, repo, + cancel_cb, cancel_arg); + if (err) + goto done; - err = got_commit_graph_iter_start(graph2, commit_id2, repo, - cancel_cb, cancel_arg); - if (err) - goto done; + err = got_commit_graph_toposort(graph2, commit_id2, repo, + cancel_cb, cancel_arg); + if (err) + goto done; + } else { + err = got_commit_graph_iter_start(graph, commit_id, repo, + cancel_cb, cancel_arg); + if (err) + goto done; + err = got_commit_graph_iter_start(graph2, commit_id2, repo, + cancel_cb, cancel_arg); + if (err) + goto done; + } + for (;;) { if (cancel_cb) { err = (*cancel_cb)(cancel_arg); blob - 681905dc26b73d6a41382533ca6a87a306c8a240 blob + b6be87a769c4d7440f941e66aabb66e589e19a77 --- lib/send.c +++ lib/send.c @@ -226,7 +226,7 @@ check_common_ancestry(const char *refname, struct got_ "bad object type on server for %s", refname); err = got_commit_graph_find_youngest_common_ancestor(&yca_id, - my_id, their_id, 0, repo, cancel_cb, cancel_arg); + my_id, their_id, 0, 0, repo, cancel_cb, cancel_arg); if (err) return err; if (yca_id == NULL) blob - 46f11f7c161db17549ba3c6ba4055ffcc81b97ce blob + 1af0c70b3ab152b32f52bbe7f1d45ecf2b38daba --- regress/cmdline/rebase.sh +++ regress/cmdline/rebase.sh @@ -2236,8 +2236,7 @@ test_rebase_merged_history_traversal() { local commit1=`git_show_branch_head $testroot/repo master` (cd $testroot/wt && got branch newbranch2) >/dev/null - (cd $testroot/wt && got rebase newbranch1) > $testroot/stdout \ - 2> $testroot/stderr + (cd $testroot/wt && got rebase -t newbranch1) > $testroot/stdout echo "Forwarding refs/heads/newbranch1 to commit $commit1" > \ $testroot/stdout.expected @@ -2245,8 +2244,9 @@ test_rebase_merged_history_traversal() { >> $testroot/stdout.expected if ! cmp -s $testroot/stdout.expected $testroot/stdout; then - #diff -u $testroot/stdout.expected $testroot/stdout - ret="xfail ($(head -n1 $testroot/stdout))" + diff -u $testroot/stdout.expected $testroot/stdout + test_done "$testroot" "1" + return 1 fi test_done "$testroot" "$ret"