From: Omar Polo Subject: teach gotadmin cleanup to remove redundant packfiles To: gameoftrees@openbsd.org Date: Fri, 16 Jun 2023 23:25:27 +0200 This is an attempt at teaching `gotadmin cleanup' to remove redundant pack files. A packfile becomes "redundant" when another packfile contains all its objects; this usually happens as more objects are added to a repository and the user issues a `gotadmin pack' (or git repack). The idea is simple, as per stsp' explanation on irc a while ago: find the biggest packfile by number of objects, throw everything in a idset, then iterate the other pack files and see if they're redundant. It's particularly useful as we're currently lacking a way to remove unneded pack files, and rely on 'git gc' or 'git repack -d' to do so. I'm not sure I got the .keep handling right. git-repack(1) mentions that files .keep are skipped when *creating* a packfile, but gotadmin pack ignores this convention, and I don't see a reason for this special case. By deduction, such packfile is hardly redundant (although it could still be) and so unlikely to be deleted. unlink_pack_path() in git' packfile.c with force_delete=0 won't remove the pack when it finds the .keep, otoh remove_redundant_pack() in git' repack.c sets force_delete to one. I decided to keep pack files with a matching .keep, partly because I'm getting lost in this maze, and the name ".keep" suggest so. diff /home/op/w/got commit - 2f3ccc5ff648594ff13619e304c2a139a917f2d2 path + /home/op/w/got blob - 5278c5830ad6b4af4c8d43dbbe466b6a4851dd25 file + gotadmin/gotadmin.1 --- gotadmin/gotadmin.1 +++ gotadmin/gotadmin.1 @@ -232,8 +232,9 @@ Purge unreferenced loose objects from the repository a .Op Fl r Ar repository-path .Xc .Dl Pq alias: Cm cl -Purge unreferenced loose objects from the repository and display -the amount of disk space which has been freed as a result. +Purge unreferenced loose objects and redundant pack files from the +repository and display the amount of disk space which has been freed +as a result. .Pp Unreferenced objects are present in the repository but cannot be reached via any reference in the entire @@ -246,12 +247,18 @@ Packed objects stored in pack files under spread across 256 sub-directories named after the 256 possible hexadecimal values of the first byte of an object identifier. .Pp -Packed objects stored in pack files under -.Pa objects/pack/ -will not be purged. -However, if redundant copies of packed objects exist in loose form, -such redundant copies will be purged. +Packed objects are stored in pack files under +.Pa objects/pack/ . .Pp +If redundant copies of packed objects exist in loose form, such +redundant copies will be purged. +If all the objects of a pack file are present in another pack file, +such redundant pack file will be purged. +.Pp +For compatibility with Git, if a matching file +.Pa .keep +exists for a given pack file, such pack file won't be removed. +.Pp Objects will usually become unreferenced as a result of deleting branches or tags with .Cm got branch -d @@ -261,6 +268,10 @@ In order to determine the set of objects which are ref .Cm got ref -d may also leave unreferenced objects behind. .Pp +Pack files will usually become redundant as a result of creating +bigger packs with +.Cm gotadmin pack . +.Pp In order to determine the set of objects which are referenced, search all references for commit objects and tag objects, and traverse the corresponding tree object hierarchies. blob - dee750d952183c77a15074c748c77e5a2510a5e5 file + gotadmin/gotadmin.c --- gotadmin/gotadmin.c +++ gotadmin/gotadmin.c @@ -1136,16 +1136,19 @@ struct got_cleanup_progress_arg { int last_nloose; int last_ncommits; int last_npurged; + int last_nredundant; int verbosity; int printed_something; int dry_run; }; static const struct got_error * -cleanup_progress(void *arg, int nloose, int ncommits, int npurged) +cleanup_progress(void *arg, int nloose, int ncommits, int npurged, + int nredundant) { struct got_cleanup_progress_arg *a = arg; int print_loose = 0, print_commits = 0, print_purged = 0; + int print_redundant = 0; if (a->last_nloose != nloose) { print_loose = 1; @@ -1162,11 +1165,15 @@ cleanup_progress(void *arg, int nloose, int ncommits, print_purged = 1; a->last_npurged = npurged; } + if (a->last_nredundant != nredundant) { + print_redundant = 1; + a->last_nredundant = nredundant; + } if (a->verbosity < 0) return NULL; - if (print_loose || print_commits || print_purged) + if (print_loose || print_commits || print_purged || print_redundant) printf("\r"); if (print_loose) printf("%d loose object%s", nloose, nloose == 1 ? "" : "s"); @@ -1182,7 +1189,16 @@ cleanup_progress(void *arg, int nloose, int ncommits, npurged == 1 ? "" : "s"); } } - if (print_loose || print_commits || print_purged) { + if (print_redundant) { + if (a->dry_run) { + printf("%d pack file%s could be purged", nredundant, + nredundant == 1 ? "" : "s"); + } else { + printf("%d pack file%s purged", nredundant, + nredundant == 1 ? "" : "s"); + } + } + if (print_loose || print_commits || print_purged || print_redundant) { a->printed_something = 1; fflush(stdout); } @@ -1222,10 +1238,14 @@ cmd_cleanup(int argc, char *argv[]) int remove_lonely_packidx = 0, ignore_mtime = 0; struct got_cleanup_progress_arg cpa; struct got_lonely_packidx_progress_arg lpa; - off_t size_before, size_after; - char scaled_before[FMT_SCALED_STRSIZE]; - char scaled_after[FMT_SCALED_STRSIZE]; - char scaled_diff[FMT_SCALED_STRSIZE]; + off_t loose_before, loose_after; + off_t pack_before, pack_after; + off_t total_size; + char loose_before_scaled[FMT_SCALED_STRSIZE]; + char loose_after_scaled[FMT_SCALED_STRSIZE]; + char pack_before_scaled[FMT_SCALED_STRSIZE]; + char pack_after_scaled[FMT_SCALED_STRSIZE]; + char total_size_scaled[FMT_SCALED_STRSIZE]; int *pack_fds = NULL; #ifndef PROFILE @@ -1299,37 +1319,67 @@ cmd_cleanup(int argc, char *argv[]) memset(&cpa, 0, sizeof(cpa)); cpa.last_ncommits = -1; cpa.last_npurged = -1; + cpa.last_nredundant = -1; cpa.dry_run = dry_run; cpa.verbosity = verbosity; + error = got_repo_purge_unreferenced_loose_objects(repo, - &size_before, &size_after, &npacked, dry_run, ignore_mtime, + &loose_before, &loose_after, &npacked, dry_run, ignore_mtime, cleanup_progress, &cpa, check_cancelled, NULL); if (cpa.printed_something) printf("\n"); if (error) goto done; + + cpa.printed_something = 0; + cpa.last_ncommits = -1; + cpa.last_npurged = -1; + cpa.last_nloose = -1; + cpa.last_nredundant = -1; + error = got_repo_purge_redundant_packfiles(repo, &pack_before, + &pack_after, dry_run, cleanup_progress, &cpa, + check_cancelled, NULL); + if (error) + goto done; + if (cpa.printed_something) + printf("\n"); + + total_size = (loose_before - loose_after) + (pack_before - pack_after); + if (cpa.printed_something) { - if (fmt_scaled(size_before, scaled_before) == -1) { + if (fmt_scaled(loose_before, loose_before_scaled) == -1) { error = got_error_from_errno("fmt_scaled"); goto done; } - if (fmt_scaled(size_after, scaled_after) == -1) { + if (fmt_scaled(loose_after, loose_after_scaled) == -1) { error = got_error_from_errno("fmt_scaled"); goto done; } - if (fmt_scaled(size_before - size_after, scaled_diff) == -1) { + if (fmt_scaled(pack_before, pack_before_scaled) == -1) { error = got_error_from_errno("fmt_scaled"); goto done; } - printf("loose total size before: %s\n", scaled_before); - printf("loose total size after: %s\n", scaled_after); + if (fmt_scaled(pack_after, pack_after_scaled) == -1) { + error = got_error_from_errno("fmt_scaled"); + goto done; + } + if (fmt_scaled(total_size, total_size_scaled) == -1) { + error = got_error_from_errno("fmt_scaled"); + goto done; + } + printf("loose total size before: %s\n", loose_before_scaled); + printf("loose total size after: %s\n", loose_after_scaled); + printf("pack files total size before: %s\n", + pack_before_scaled); + printf("pack files total size after: %s\n", pack_after_scaled); if (dry_run) { printf("disk space which would be freed: %s\n", - scaled_diff); + total_size_scaled); } else - printf("disk space freed: %s\n", scaled_diff); + printf("disk space freed: %s\n", total_size_scaled); printf("loose objects also found in pack files: %d\n", npacked); } + done: if (repo) got_repo_close(repo); blob - 5ddd50191b8acfb903877b148bb856de99014b07 file + include/got_repository_admin.h --- include/got_repository_admin.h +++ include/got_repository_admin.h @@ -70,7 +70,7 @@ typedef const struct got_error *(*got_cleanup_progress /* A callback function which gets invoked with cleanup information to print. */ typedef const struct got_error *(*got_cleanup_progress_cb)(void *arg, - int nloose, int ncommits, int npurged); + int nloose, int ncommits, int npurged, int nredundant); /* * Walk objects reachable via references to determine whether any loose @@ -88,6 +88,12 @@ got_repo_purge_unreferenced_loose_objects(struct got_r int ignore_mtime, got_cleanup_progress_cb progress_cb, void *progress_arg, got_cancel_cb cancel_cb, void *cancel_arg); +const struct got_error * +got_repo_purge_redundant_packfiles(struct got_repository *repo, + off_t *before, off_t *size_after, int dry_run, + got_cleanup_progress_cb progress_cb, void *progress_arg, + got_cancel_cb cancel_cb, void *cancel_arg); + /* A callback function which gets invoked with cleanup information to print. */ typedef const struct got_error *(*got_lonely_packidx_progress_cb)(void *arg, const char *path); blob - 48bed8a15e5f885c25ca796741f37fb70f1b87e3 file + lib/repository_admin.c --- lib/repository_admin.c +++ lib/repository_admin.c @@ -628,7 +628,7 @@ report_cleanup_progress(got_cleanup_progress_cb progre if (err || !elapsed) return err; - return progress_cb(progress_arg, nloose, ncommits, npurged); + return progress_cb(progress_arg, nloose, ncommits, npurged, -1); } static const struct got_error * @@ -1153,7 +1153,7 @@ got_repo_purge_unreferenced_loose_objects(struct got_r if (nloose == 0) { got_object_idset_free(loose_ids); if (progress_cb) { - err = progress_cb(progress_arg, 0, 0, 0); + err = progress_cb(progress_arg, 0, 0, 0, -1); if (err) return err; } @@ -1217,7 +1217,8 @@ got_repo_purge_unreferenced_loose_objects(struct got_r /* Produce a final progress report. */ if (progress_cb) { - err = progress_cb(progress_arg, nloose, ncommits, arg.npurged); + err = progress_cb(progress_arg, nloose, ncommits, arg.npurged, + -1); if (err) goto done; } @@ -1227,6 +1228,153 @@ static const struct got_error * return err; } +const struct got_error * +got_repo_purge_redundant_packfiles(struct got_repository *repo, + off_t *size_before, off_t *size_after, int dry_run, + got_cleanup_progress_cb progress_cb, void *progress_arg, + got_cancel_cb cancel_cb, void *cancel_arg) +{ + static const char *ext[] = {".pack", ".idx", ".rev", ".bitmap", + ".promisor", ".mtimes"}; + const struct got_error *err; + struct got_packidx *best, *packidx; + struct got_object_idset *idset = NULL; + struct got_pathlist_entry *pe; + struct got_packidx_object_id *pid; + struct got_object_id id; + struct stat sb; + const char *packidx_path; + char *dot, path[PATH_MAX]; + size_t i, nobjects; + int remove, packs = 0; + + *size_before = 0; + *size_after = 0; + + err = got_pack_find_pack_for_reuse(&best, repo); + if (err || best == NULL) + goto done; + + idset = got_object_idset_alloc(); + if (idset == NULL) { + err = got_error_from_errno("got_object_idset_alloc"); + goto done; + } + + nobjects = be32toh(best->hdr.fanout_table[0xff]); + for (i = 0; i < nobjects; ++i) { + pid = &best->hdr.sorted_ids[i]; + + memset(&id, 0, sizeof(id)); + memcpy(&id.sha1, pid->sha1, sizeof(id.sha1)); + + err = got_object_idset_add(idset, &id, NULL); + if (err) + goto done; + } + + TAILQ_FOREACH(pe, &repo->packidx_paths, entry) { + if (cancel_cb) { + err = (*cancel_cb)(cancel_arg); + if (err) + break; + } + + packidx_path = pe->path; + err = got_repo_get_packidx(&packidx, packidx_path, + repo); + if (err) + goto done; + + if (packidx == best) { + remove = 0; + } else { + nobjects = be32toh(packidx->hdr.fanout_table[0xff]); + for (i = 0; i < nobjects; ++i) { + pid = &packidx->hdr.sorted_ids[i]; + + memset(&id, 0, sizeof(id)); + memcpy(&id.sha1, pid->sha1, sizeof(id.sha1)); + + if (!got_object_idset_contains(idset, &id)) + break; + } + + remove = (i == nobjects); + } + + if (strlcpy(path, packidx_path, sizeof(path)) >= + sizeof(path)) { + err = got_error(GOT_ERR_NO_SPACE); + goto done; + } + + /* + * For compatibility with Git, if a matching .keep + * file exist don't delete the packfile. + */ + dot = strrchr(path, '.'); + *dot = '\0'; + if (strlcat(path, ".keep", sizeof(path)) >= sizeof(path)) { + err = got_error(GOT_ERR_NO_SPACE); + goto done; + } + if (faccessat(got_repo_get_fd(repo), path, F_OK, 0) == 0) + remove = 0; + + if (remove) + packs++; + + for (i = 0; i < nitems(ext); ++i) { + *dot = '\0'; + + if (strlcat(path, ext[i], sizeof(path)) >= + sizeof(path)) { + err = got_error(GOT_ERR_NO_SPACE); + goto done; + } + + if (fstatat(got_repo_get_fd(repo), path, &sb, 0) == + -1) { + if (errno == ENOENT && + strcmp(ext[i], ".pack") != 0 && + strcmp(ext[i], ".idx") != 0) + continue; + err = got_error_from_errno2("fstatat", path); + goto done; + } + + *size_before += sb.st_size; + if (!remove) { + *size_after += sb.st_size; + continue; + } + err = progress_cb(progress_arg, -1, -1, -1, packs); + if (err) + goto done; + + if (dry_run) + continue; + + if (unlinkat(got_repo_get_fd(repo), path, 0) == -1) { + if (errno == ENOENT && + strcmp(ext[i], ".pack") != 0 && + strcmp(ext[i], ".idx") != 0) + continue; + err = got_error_from_errno2("unlinkat", + path); + goto done; + } + } + } + + err = progress_cb(progress_arg, -1, -1, -1, packs); + done: + if (idset) + got_object_idset_free(idset); + return err; +} + static const struct got_error * remove_packidx(int dir_fd, const char *relpath) { blob - 2381958704d73cb1ebaeb0f7af8d2bf01b7d434f file + regress/cmdline/cleanup.sh --- regress/cmdline/cleanup.sh +++ regress/cmdline/cleanup.sh @@ -237,6 +237,62 @@ test_cleanup_precious_objects() { test_done "$testroot" "$ret" } +test_cleanup_redundant_pack_files() { + local testroot=`test_init cleanup_redundant_pack_files` + + # no pack files should exist yet + n=$(ls "$testroot/repo/.git/objects/pack" | wc -l | xargs) + if [ "$n" -ne 0 ]; then + echo "expected no pack file to exists, $n found" >&2 + test_done "$testroot" 1 + return 1 + fi + + # create a redundant pack with an associated .pack + hash=$(gotadmin pack -a -r "$testroot/repo" \ + | awk '/^Indexed/{print $2}') + kpack="$testroot/repo/.git/objects/pack/pack-$hash" + touch "${kpack%.pack}.keep" + + # create a few pack files, all with a different number of + # objects + for i in `jot 5`; do + gotadmin pack -a -r "$testroot/repo" >/dev/null + echo "alpha $i" > $testroot/repo/alpha + git_commit "$testroot/repo" -m "edit #$i" + done + + # create a pack with all objects + gotadmin pack -a -r "$testroot/repo" >/dev/null + + packs_before=$(gotadmin info -r "$testroot/repo" \ + | awk '/^pack files/{print $3}') + + gotadmin cleanup -r "$testroot/repo" | grep 'pack files? purged' \ + | tail -1 > $testroot/stdout + + echo "5 pack files purged" > $testroot/stdout.expected + if cmp -s "$testroot/stdout.expected" "$testroot/stdout"; then + diff -u "$testroot/stdout.expected" "$testroot/stdout" + test_done "$testroot" 1 + return 1 + fi + + if [ ! -f "$kpack" ]; then + echo "$kpack disappeared unexpectedly" >&2 + test_done "$testroot" 1 + return 1 + fi + + if [ ! -f "${kpack%.pack}.keep" ]; then + echo "${kpack%.pack}.keep disappeared unexpectedly" >&2 + test_done "$testroot" 1 + return 1 + fi + + test_done "$testroot" 0 +} + test_cleanup_precious_objects() { local testroot=`test_init cleanup_precious_objects` @@ -365,5 +421,6 @@ run_test test_cleanup_precious_objects test_parseargs "$@" run_test test_cleanup_unreferenced_loose_objects run_test test_cleanup_redundant_loose_objects +run_test test_cleanup_redundant_pack_files run_test test_cleanup_precious_objects run_test test_cleanup_missing_pack_file