"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Omar Polo <op@omarpolo.com>
Subject:
teach gotadmin cleanup to remove redundant packfiles
To:
gameoftrees@openbsd.org
Date:
Fri, 16 Jun 2023 23:25:27 +0200

Download raw body.

Thread
This is an attempt at teaching `gotadmin cleanup' to remove redundant
pack files.  A packfile becomes "redundant" when another packfile
contains all its objects; this usually happens as more objects are
added to a repository and the user issues a `gotadmin pack' (or git
repack).

The idea is simple, as per stsp' explanation on irc a while ago: find
the biggest packfile by number of objects, throw everything in a
idset, then iterate the other pack files and see if they're redundant.

It's particularly useful as we're currently lacking a way to remove
unneded pack files, and rely on 'git gc' or 'git repack -d' to do so.

I'm not sure I got the .keep handling right.  git-repack(1) mentions
that files .keep are skipped when *creating* a packfile, but gotadmin
pack ignores this convention, and I don't see a reason for this
special case.  By deduction, such packfile is hardly redundant
(although it could still be) and so unlikely to be deleted.

unlink_pack_path() in git' packfile.c with force_delete=0 won't remove
the pack when it finds the .keep, otoh remove_redundant_pack() in git'
repack.c sets force_delete to one.

I decided to keep pack files with a matching .keep, partly because I'm
getting lost in this maze, and the name ".keep" suggest so.

diff /home/op/w/got
commit - 2f3ccc5ff648594ff13619e304c2a139a917f2d2
path + /home/op/w/got
blob - 5278c5830ad6b4af4c8d43dbbe466b6a4851dd25
file + gotadmin/gotadmin.1
--- gotadmin/gotadmin.1
+++ gotadmin/gotadmin.1
@@ -232,8 +232,9 @@ Purge unreferenced loose objects from the repository a
 .Op Fl r Ar repository-path
 .Xc
 .Dl Pq alias: Cm cl
-Purge unreferenced loose objects from the repository and display
-the amount of disk space which has been freed as a result.
+Purge unreferenced loose objects and redundant pack files from the
+repository and display the amount of disk space which has been freed
+as a result.
 .Pp
 Unreferenced objects are present in the repository but cannot be
 reached via any reference in the entire
@@ -246,12 +247,18 @@ Packed objects stored in pack files under
 spread across 256 sub-directories named after the 256 possible
 hexadecimal values of the first byte of an object identifier.
 .Pp
-Packed objects stored in pack files under
-.Pa objects/pack/
-will not be purged.
-However, if redundant copies of packed objects exist in loose form,
-such redundant copies will be purged.
+Packed objects are stored in pack files under
+.Pa objects/pack/ .
 .Pp
+If redundant copies of packed objects exist in loose form, such
+redundant copies will be purged.
+If all the objects of a pack file are present in another pack file,
+such redundant pack file will be purged.
+.Pp
+For compatibility with Git, if a matching file
+.Pa .keep
+exists for a given pack file, such pack file won't be removed.
+.Pp
 Objects will usually become unreferenced as a result of deleting
 branches or tags with
 .Cm got branch -d
@@ -261,6 +268,10 @@ In order to determine the set of objects which are ref
 .Cm got ref -d
 may also leave unreferenced objects behind.
 .Pp
+Pack files will usually become redundant as a result of creating
+bigger packs with
+.Cm gotadmin pack .
+.Pp
 In order to determine the set of objects which are referenced, search
 all references for commit objects and tag objects, and traverse the
 corresponding tree object hierarchies.
blob - dee750d952183c77a15074c748c77e5a2510a5e5
file + gotadmin/gotadmin.c
--- gotadmin/gotadmin.c
+++ gotadmin/gotadmin.c
@@ -1136,16 +1136,19 @@ struct got_cleanup_progress_arg {
 	int last_nloose;
 	int last_ncommits;
 	int last_npurged;
+	int last_nredundant;
 	int verbosity;
 	int printed_something;
 	int dry_run;
 };
 
 static const struct got_error *
-cleanup_progress(void *arg, int nloose, int ncommits, int npurged)
+cleanup_progress(void *arg, int nloose, int ncommits, int npurged,
+    int nredundant)
 {
 	struct got_cleanup_progress_arg *a = arg;
 	int print_loose = 0, print_commits = 0, print_purged = 0;
+	int print_redundant = 0;
 
 	if (a->last_nloose != nloose) {
 		print_loose = 1;
@@ -1162,11 +1165,15 @@ cleanup_progress(void *arg, int nloose, int ncommits, 
 		print_purged = 1;
 		a->last_npurged = npurged;
 	}
+	if (a->last_nredundant != nredundant) {
+		print_redundant = 1;
+		a->last_nredundant = nredundant;
+	}
 
 	if (a->verbosity < 0)
 		return NULL;
 
-	if (print_loose || print_commits || print_purged)
+	if (print_loose || print_commits || print_purged || print_redundant)
 		printf("\r");
 	if (print_loose)
 		printf("%d loose object%s", nloose, nloose == 1 ? "" : "s");
@@ -1182,7 +1189,16 @@ cleanup_progress(void *arg, int nloose, int ncommits, 
 			    npurged == 1 ? "" : "s");
 		}
 	}
-	if (print_loose || print_commits || print_purged) {
+	if (print_redundant) {
+		if (a->dry_run) {
+			printf("%d pack file%s could be purged", nredundant,
+			    nredundant == 1 ? "" : "s");
+		} else {
+			printf("%d pack file%s purged", nredundant,
+			    nredundant == 1 ? "" : "s");
+		}
+	}
+	if (print_loose || print_commits || print_purged || print_redundant) {
 		a->printed_something = 1;
 		fflush(stdout);
 	}
@@ -1222,10 +1238,14 @@ cmd_cleanup(int argc, char *argv[])
 	int remove_lonely_packidx = 0, ignore_mtime = 0;
 	struct got_cleanup_progress_arg cpa;
 	struct got_lonely_packidx_progress_arg lpa;
-	off_t size_before, size_after;
-	char scaled_before[FMT_SCALED_STRSIZE];
-	char scaled_after[FMT_SCALED_STRSIZE];
-	char scaled_diff[FMT_SCALED_STRSIZE];
+	off_t loose_before, loose_after;
+	off_t pack_before, pack_after;
+	off_t total_size;
+	char loose_before_scaled[FMT_SCALED_STRSIZE];
+	char loose_after_scaled[FMT_SCALED_STRSIZE];
+	char pack_before_scaled[FMT_SCALED_STRSIZE];
+	char pack_after_scaled[FMT_SCALED_STRSIZE];
+	char total_size_scaled[FMT_SCALED_STRSIZE];
 	int *pack_fds = NULL;
 
 #ifndef PROFILE
@@ -1299,37 +1319,67 @@ cmd_cleanup(int argc, char *argv[])
 	memset(&cpa, 0, sizeof(cpa));
 	cpa.last_ncommits = -1;
 	cpa.last_npurged = -1;
+	cpa.last_nredundant = -1;
 	cpa.dry_run = dry_run;
 	cpa.verbosity = verbosity;
+
 	error = got_repo_purge_unreferenced_loose_objects(repo,
-	    &size_before, &size_after, &npacked, dry_run, ignore_mtime,
+	    &loose_before, &loose_after, &npacked, dry_run, ignore_mtime,
 	    cleanup_progress, &cpa, check_cancelled, NULL);
 	if (cpa.printed_something)
 		printf("\n");
 	if (error)
 		goto done;
+
+	cpa.printed_something = 0;
+	cpa.last_ncommits = -1;
+	cpa.last_npurged = -1;
+	cpa.last_nloose = -1;
+	cpa.last_nredundant = -1;
+	error = got_repo_purge_redundant_packfiles(repo, &pack_before,
+	    &pack_after, dry_run, cleanup_progress, &cpa,
+	    check_cancelled, NULL);
+	if (error)
+		goto done;
+	if (cpa.printed_something)
+		printf("\n");
+
+	total_size = (loose_before - loose_after) + (pack_before - pack_after);
+
 	if (cpa.printed_something) {
-		if (fmt_scaled(size_before, scaled_before) == -1) {
+		if (fmt_scaled(loose_before, loose_before_scaled) == -1) {
 			error = got_error_from_errno("fmt_scaled");
 			goto done;
 		}
-		if (fmt_scaled(size_after, scaled_after) == -1) {
+		if (fmt_scaled(loose_after, loose_after_scaled) == -1) {
 			error = got_error_from_errno("fmt_scaled");
 			goto done;
 		}
-		if (fmt_scaled(size_before - size_after, scaled_diff) == -1) {
+		if (fmt_scaled(pack_before, pack_before_scaled) == -1) {
 			error = got_error_from_errno("fmt_scaled");
 			goto done;
 		}
-		printf("loose total size before: %s\n", scaled_before);
-		printf("loose total size after: %s\n", scaled_after);
+		if (fmt_scaled(pack_after, pack_after_scaled) == -1) {
+			error = got_error_from_errno("fmt_scaled");
+			goto done;
+		}
+		if (fmt_scaled(total_size, total_size_scaled) == -1) {
+			error = got_error_from_errno("fmt_scaled");
+			goto done;
+		}
+		printf("loose total size before: %s\n", loose_before_scaled);
+		printf("loose total size after: %s\n", loose_after_scaled);
+		printf("pack files total size before: %s\n",
+		    pack_before_scaled);
+		printf("pack files total size after: %s\n", pack_after_scaled);
 		if (dry_run) {
 			printf("disk space which would be freed: %s\n",
-			    scaled_diff);
+			    total_size_scaled);
 		} else
-			printf("disk space freed: %s\n", scaled_diff);
+			printf("disk space freed: %s\n", total_size_scaled);
 		printf("loose objects also found in pack files: %d\n", npacked);
 	}
+
 done:
 	if (repo)
 		got_repo_close(repo);
blob - 5ddd50191b8acfb903877b148bb856de99014b07
file + include/got_repository_admin.h
--- include/got_repository_admin.h
+++ include/got_repository_admin.h
@@ -70,7 +70,7 @@ typedef const struct got_error *(*got_cleanup_progress
 
 /* A callback function which gets invoked with cleanup information to print. */
 typedef const struct got_error *(*got_cleanup_progress_cb)(void *arg,
-    int nloose, int ncommits, int npurged);
+    int nloose, int ncommits, int npurged, int nredundant);
 
 /*
  * Walk objects reachable via references to determine whether any loose
@@ -88,6 +88,12 @@ got_repo_purge_unreferenced_loose_objects(struct got_r
     int ignore_mtime, got_cleanup_progress_cb progress_cb, void *progress_arg,
     got_cancel_cb cancel_cb, void *cancel_arg);
 
+const struct got_error *
+got_repo_purge_redundant_packfiles(struct got_repository *repo,
+    off_t *before, off_t *size_after, int dry_run,
+    got_cleanup_progress_cb progress_cb, void *progress_arg,
+    got_cancel_cb cancel_cb, void *cancel_arg);
+
 /* A callback function which gets invoked with cleanup information to print. */
 typedef const struct got_error *(*got_lonely_packidx_progress_cb)(void *arg,
     const char *path);
blob - 48bed8a15e5f885c25ca796741f37fb70f1b87e3
file + lib/repository_admin.c
--- lib/repository_admin.c
+++ lib/repository_admin.c
@@ -628,7 +628,7 @@ report_cleanup_progress(got_cleanup_progress_cb progre
 	if (err || !elapsed)
 		return err;
 
-	return progress_cb(progress_arg, nloose, ncommits, npurged);
+	return progress_cb(progress_arg, nloose, ncommits, npurged, -1);
 }
 
 static const struct got_error *
@@ -1153,7 +1153,7 @@ got_repo_purge_unreferenced_loose_objects(struct got_r
 	if (nloose == 0) {
 		got_object_idset_free(loose_ids);
 		if (progress_cb) {
-			err = progress_cb(progress_arg, 0, 0, 0);
+			err = progress_cb(progress_arg, 0, 0, 0, -1);
 			if (err)
 				return err;
 		}
@@ -1217,7 +1217,8 @@ got_repo_purge_unreferenced_loose_objects(struct got_r
 
 	/* Produce a final progress report. */
 	if (progress_cb) {
-		err = progress_cb(progress_arg, nloose, ncommits, arg.npurged);
+		err = progress_cb(progress_arg, nloose, ncommits, arg.npurged,
+		    -1);
 		if (err)
 			goto done;
 	}
@@ -1227,6 +1228,153 @@ static const struct got_error *
 	return err;
 }
 
+const struct got_error *
+got_repo_purge_redundant_packfiles(struct got_repository *repo,
+    off_t *size_before, off_t *size_after, int dry_run,
+    got_cleanup_progress_cb progress_cb, void *progress_arg,
+    got_cancel_cb cancel_cb, void *cancel_arg)
+{
+	static const char *ext[] = {".pack", ".idx", ".rev", ".bitmap",
+	    ".promisor", ".mtimes"};
+	const struct got_error *err;
+	struct got_packidx *best, *packidx;
+	struct got_object_idset *idset = NULL;
+	struct got_pathlist_entry *pe;
+	struct got_packidx_object_id *pid;
+	struct got_object_id id;
+	struct stat sb;
+	const char *packidx_path;
+	char *dot, path[PATH_MAX];
+	size_t i, nobjects;
+	int remove, packs = 0;
+
+	*size_before = 0;
+	*size_after = 0;
+
+	err = got_pack_find_pack_for_reuse(&best, repo);
+	if (err || best == NULL)
+		goto done;
+
+	idset = got_object_idset_alloc();
+	if (idset == NULL) {
+		err = got_error_from_errno("got_object_idset_alloc");
+		goto done;
+	}
+
+	nobjects = be32toh(best->hdr.fanout_table[0xff]);
+	for (i = 0; i < nobjects; ++i) {
+		pid = &best->hdr.sorted_ids[i];
+
+		memset(&id, 0, sizeof(id));
+		memcpy(&id.sha1, pid->sha1, sizeof(id.sha1));
+
+		err = got_object_idset_add(idset, &id, NULL);
+		if (err)
+			goto done;
+	}
+
+	TAILQ_FOREACH(pe, &repo->packidx_paths, entry) {
+		if (cancel_cb) {
+			err = (*cancel_cb)(cancel_arg);
+			if (err)
+				break;
+		}
+
+		packidx_path = pe->path;
+		err = got_repo_get_packidx(&packidx, packidx_path,
+		    repo);
+		if (err)
+			goto done;
+
+		if (packidx == best) {
+			remove = 0;
+		} else {
+			nobjects = be32toh(packidx->hdr.fanout_table[0xff]);
+			for (i = 0; i < nobjects; ++i) {
+				pid = &packidx->hdr.sorted_ids[i];
+
+				memset(&id, 0, sizeof(id));
+				memcpy(&id.sha1, pid->sha1, sizeof(id.sha1));
+
+				if (!got_object_idset_contains(idset, &id))
+					break;
+			}
+
+			remove = (i == nobjects);
+		}
+
+		if (strlcpy(path, packidx_path, sizeof(path)) >=
+		    sizeof(path)) {
+			err = got_error(GOT_ERR_NO_SPACE);
+			goto done;
+		}
+
+		/*
+		 * For compatibility with Git, if a matching .keep
+		 * file exist don't delete the packfile.
+		 */
+		dot = strrchr(path, '.');
+		*dot = '\0';
+		if (strlcat(path, ".keep", sizeof(path)) >= sizeof(path)) {
+			err = got_error(GOT_ERR_NO_SPACE);
+			goto done;
+		}
+		if (faccessat(got_repo_get_fd(repo), path, F_OK, 0) == 0)
+			remove = 0;
+
+		if (remove)
+			packs++;
+
+		for (i = 0; i < nitems(ext); ++i) {
+			*dot = '\0';
+
+			if (strlcat(path, ext[i], sizeof(path)) >=
+			    sizeof(path)) {
+				err = got_error(GOT_ERR_NO_SPACE);
+				goto done;
+			}
+
+			if (fstatat(got_repo_get_fd(repo), path, &sb, 0) ==
+			    -1) {
+				if (errno == ENOENT &&
+				    strcmp(ext[i], ".pack") != 0 &&
+				    strcmp(ext[i], ".idx") != 0)
+					continue;
+				err = got_error_from_errno2("fstatat", path);
+				goto done;
+			}
+
+			*size_before += sb.st_size;
+			if (!remove) {
+				*size_after += sb.st_size;
+				continue;
+			}
+			err = progress_cb(progress_arg, -1, -1, -1, packs);
+			if (err)
+				goto done;
+
+			if (dry_run)
+				continue;
+
+			if (unlinkat(got_repo_get_fd(repo), path, 0) == -1) {
+				if (errno == ENOENT &&
+				    strcmp(ext[i], ".pack") != 0 &&
+				    strcmp(ext[i], ".idx") != 0)
+					continue;
+				err = got_error_from_errno2("unlinkat",
+				    path);
+				goto done;
+			}
+		}
+	}
+
+	err = progress_cb(progress_arg, -1, -1, -1, packs);
+ done:
+	if (idset)
+		got_object_idset_free(idset);
+	return err;
+}
+
 static const struct got_error *
 remove_packidx(int dir_fd, const char *relpath)
 {
blob - 2381958704d73cb1ebaeb0f7af8d2bf01b7d434f
file + regress/cmdline/cleanup.sh
--- regress/cmdline/cleanup.sh
+++ regress/cmdline/cleanup.sh
@@ -237,6 +237,62 @@ test_cleanup_precious_objects() {
 	test_done "$testroot" "$ret"
 }
 
+test_cleanup_redundant_pack_files() {
+	local testroot=`test_init cleanup_redundant_pack_files`
+
+	# no pack files should exist yet
+	n=$(ls "$testroot/repo/.git/objects/pack" | wc -l | xargs)
+	if [ "$n" -ne 0 ]; then
+		echo "expected no pack file to exists, $n found" >&2
+		test_done "$testroot" 1
+		return 1
+	fi
+
+	# create a redundant pack with an associated .pack
+	hash=$(gotadmin pack -a -r "$testroot/repo" \
+		| awk '/^Indexed/{print $2}')
+	kpack="$testroot/repo/.git/objects/pack/pack-$hash"
+	touch "${kpack%.pack}.keep"
+
+	# create a few pack files, all with a different number of
+	# objects
+	for i in `jot 5`; do
+		gotadmin pack -a -r "$testroot/repo" >/dev/null
+		echo "alpha $i" > $testroot/repo/alpha
+		git_commit "$testroot/repo" -m "edit #$i"
+	done
+
+	# create a pack with all objects
+	gotadmin pack -a -r "$testroot/repo" >/dev/null
+
+	packs_before=$(gotadmin info -r "$testroot/repo" \
+		| awk '/^pack files/{print $3}')
+
+	gotadmin cleanup -r "$testroot/repo" | grep 'pack files? purged' \
+		| tail -1 > $testroot/stdout
+
+	echo "5 pack files purged" > $testroot/stdout.expected
+	if cmp -s "$testroot/stdout.expected" "$testroot/stdout"; then
+		diff -u "$testroot/stdout.expected" "$testroot/stdout"
+		test_done "$testroot" 1
+		return 1
+	fi
+
+	if [ ! -f "$kpack" ]; then
+		echo "$kpack disappeared unexpectedly" >&2
+		test_done "$testroot" 1
+		return 1
+	fi
+
+	if [ ! -f "${kpack%.pack}.keep" ]; then
+		echo "${kpack%.pack}.keep disappeared unexpectedly" >&2
+		test_done "$testroot" 1
+		return 1
+	fi
+
+	test_done "$testroot" 0
+}
+
 test_cleanup_precious_objects() {
 	local testroot=`test_init cleanup_precious_objects`
 
@@ -365,5 +421,6 @@ run_test test_cleanup_precious_objects
 test_parseargs "$@"
 run_test test_cleanup_unreferenced_loose_objects
 run_test test_cleanup_redundant_loose_objects
+run_test test_cleanup_redundant_pack_files
 run_test test_cleanup_precious_objects
 run_test test_cleanup_missing_pack_file