Download raw body.
teach gotadmin cleanup to remove redundant packfiles
This is an attempt at teaching `gotadmin cleanup' to remove redundant
pack files. A packfile becomes "redundant" when another packfile
contains all its objects; this usually happens as more objects are
added to a repository and the user issues a `gotadmin pack' (or git
repack).
The idea is simple, as per stsp' explanation on irc a while ago: find
the biggest packfile by number of objects, throw everything in a
idset, then iterate the other pack files and see if they're redundant.
It's particularly useful as we're currently lacking a way to remove
unneded pack files, and rely on 'git gc' or 'git repack -d' to do so.
I'm not sure I got the .keep handling right. git-repack(1) mentions
that files .keep are skipped when *creating* a packfile, but gotadmin
pack ignores this convention, and I don't see a reason for this
special case. By deduction, such packfile is hardly redundant
(although it could still be) and so unlikely to be deleted.
unlink_pack_path() in git' packfile.c with force_delete=0 won't remove
the pack when it finds the .keep, otoh remove_redundant_pack() in git'
repack.c sets force_delete to one.
I decided to keep pack files with a matching .keep, partly because I'm
getting lost in this maze, and the name ".keep" suggest so.
diff /home/op/w/got
commit - 2f3ccc5ff648594ff13619e304c2a139a917f2d2
path + /home/op/w/got
blob - 5278c5830ad6b4af4c8d43dbbe466b6a4851dd25
file + gotadmin/gotadmin.1
--- gotadmin/gotadmin.1
+++ gotadmin/gotadmin.1
@@ -232,8 +232,9 @@ Purge unreferenced loose objects from the repository a
.Op Fl r Ar repository-path
.Xc
.Dl Pq alias: Cm cl
-Purge unreferenced loose objects from the repository and display
-the amount of disk space which has been freed as a result.
+Purge unreferenced loose objects and redundant pack files from the
+repository and display the amount of disk space which has been freed
+as a result.
.Pp
Unreferenced objects are present in the repository but cannot be
reached via any reference in the entire
@@ -246,12 +247,18 @@ Packed objects stored in pack files under
spread across 256 sub-directories named after the 256 possible
hexadecimal values of the first byte of an object identifier.
.Pp
-Packed objects stored in pack files under
-.Pa objects/pack/
-will not be purged.
-However, if redundant copies of packed objects exist in loose form,
-such redundant copies will be purged.
+Packed objects are stored in pack files under
+.Pa objects/pack/ .
.Pp
+If redundant copies of packed objects exist in loose form, such
+redundant copies will be purged.
+If all the objects of a pack file are present in another pack file,
+such redundant pack file will be purged.
+.Pp
+For compatibility with Git, if a matching file
+.Pa .keep
+exists for a given pack file, such pack file won't be removed.
+.Pp
Objects will usually become unreferenced as a result of deleting
branches or tags with
.Cm got branch -d
@@ -261,6 +268,10 @@ In order to determine the set of objects which are ref
.Cm got ref -d
may also leave unreferenced objects behind.
.Pp
+Pack files will usually become redundant as a result of creating
+bigger packs with
+.Cm gotadmin pack .
+.Pp
In order to determine the set of objects which are referenced, search
all references for commit objects and tag objects, and traverse the
corresponding tree object hierarchies.
blob - dee750d952183c77a15074c748c77e5a2510a5e5
file + gotadmin/gotadmin.c
--- gotadmin/gotadmin.c
+++ gotadmin/gotadmin.c
@@ -1136,16 +1136,19 @@ struct got_cleanup_progress_arg {
int last_nloose;
int last_ncommits;
int last_npurged;
+ int last_nredundant;
int verbosity;
int printed_something;
int dry_run;
};
static const struct got_error *
-cleanup_progress(void *arg, int nloose, int ncommits, int npurged)
+cleanup_progress(void *arg, int nloose, int ncommits, int npurged,
+ int nredundant)
{
struct got_cleanup_progress_arg *a = arg;
int print_loose = 0, print_commits = 0, print_purged = 0;
+ int print_redundant = 0;
if (a->last_nloose != nloose) {
print_loose = 1;
@@ -1162,11 +1165,15 @@ cleanup_progress(void *arg, int nloose, int ncommits,
print_purged = 1;
a->last_npurged = npurged;
}
+ if (a->last_nredundant != nredundant) {
+ print_redundant = 1;
+ a->last_nredundant = nredundant;
+ }
if (a->verbosity < 0)
return NULL;
- if (print_loose || print_commits || print_purged)
+ if (print_loose || print_commits || print_purged || print_redundant)
printf("\r");
if (print_loose)
printf("%d loose object%s", nloose, nloose == 1 ? "" : "s");
@@ -1182,7 +1189,16 @@ cleanup_progress(void *arg, int nloose, int ncommits,
npurged == 1 ? "" : "s");
}
}
- if (print_loose || print_commits || print_purged) {
+ if (print_redundant) {
+ if (a->dry_run) {
+ printf("%d pack file%s could be purged", nredundant,
+ nredundant == 1 ? "" : "s");
+ } else {
+ printf("%d pack file%s purged", nredundant,
+ nredundant == 1 ? "" : "s");
+ }
+ }
+ if (print_loose || print_commits || print_purged || print_redundant) {
a->printed_something = 1;
fflush(stdout);
}
@@ -1222,10 +1238,14 @@ cmd_cleanup(int argc, char *argv[])
int remove_lonely_packidx = 0, ignore_mtime = 0;
struct got_cleanup_progress_arg cpa;
struct got_lonely_packidx_progress_arg lpa;
- off_t size_before, size_after;
- char scaled_before[FMT_SCALED_STRSIZE];
- char scaled_after[FMT_SCALED_STRSIZE];
- char scaled_diff[FMT_SCALED_STRSIZE];
+ off_t loose_before, loose_after;
+ off_t pack_before, pack_after;
+ off_t total_size;
+ char loose_before_scaled[FMT_SCALED_STRSIZE];
+ char loose_after_scaled[FMT_SCALED_STRSIZE];
+ char pack_before_scaled[FMT_SCALED_STRSIZE];
+ char pack_after_scaled[FMT_SCALED_STRSIZE];
+ char total_size_scaled[FMT_SCALED_STRSIZE];
int *pack_fds = NULL;
#ifndef PROFILE
@@ -1299,37 +1319,67 @@ cmd_cleanup(int argc, char *argv[])
memset(&cpa, 0, sizeof(cpa));
cpa.last_ncommits = -1;
cpa.last_npurged = -1;
+ cpa.last_nredundant = -1;
cpa.dry_run = dry_run;
cpa.verbosity = verbosity;
+
error = got_repo_purge_unreferenced_loose_objects(repo,
- &size_before, &size_after, &npacked, dry_run, ignore_mtime,
+ &loose_before, &loose_after, &npacked, dry_run, ignore_mtime,
cleanup_progress, &cpa, check_cancelled, NULL);
if (cpa.printed_something)
printf("\n");
if (error)
goto done;
+
+ cpa.printed_something = 0;
+ cpa.last_ncommits = -1;
+ cpa.last_npurged = -1;
+ cpa.last_nloose = -1;
+ cpa.last_nredundant = -1;
+ error = got_repo_purge_redundant_packfiles(repo, &pack_before,
+ &pack_after, dry_run, cleanup_progress, &cpa,
+ check_cancelled, NULL);
+ if (error)
+ goto done;
+ if (cpa.printed_something)
+ printf("\n");
+
+ total_size = (loose_before - loose_after) + (pack_before - pack_after);
+
if (cpa.printed_something) {
- if (fmt_scaled(size_before, scaled_before) == -1) {
+ if (fmt_scaled(loose_before, loose_before_scaled) == -1) {
error = got_error_from_errno("fmt_scaled");
goto done;
}
- if (fmt_scaled(size_after, scaled_after) == -1) {
+ if (fmt_scaled(loose_after, loose_after_scaled) == -1) {
error = got_error_from_errno("fmt_scaled");
goto done;
}
- if (fmt_scaled(size_before - size_after, scaled_diff) == -1) {
+ if (fmt_scaled(pack_before, pack_before_scaled) == -1) {
error = got_error_from_errno("fmt_scaled");
goto done;
}
- printf("loose total size before: %s\n", scaled_before);
- printf("loose total size after: %s\n", scaled_after);
+ if (fmt_scaled(pack_after, pack_after_scaled) == -1) {
+ error = got_error_from_errno("fmt_scaled");
+ goto done;
+ }
+ if (fmt_scaled(total_size, total_size_scaled) == -1) {
+ error = got_error_from_errno("fmt_scaled");
+ goto done;
+ }
+ printf("loose total size before: %s\n", loose_before_scaled);
+ printf("loose total size after: %s\n", loose_after_scaled);
+ printf("pack files total size before: %s\n",
+ pack_before_scaled);
+ printf("pack files total size after: %s\n", pack_after_scaled);
if (dry_run) {
printf("disk space which would be freed: %s\n",
- scaled_diff);
+ total_size_scaled);
} else
- printf("disk space freed: %s\n", scaled_diff);
+ printf("disk space freed: %s\n", total_size_scaled);
printf("loose objects also found in pack files: %d\n", npacked);
}
+
done:
if (repo)
got_repo_close(repo);
blob - 5ddd50191b8acfb903877b148bb856de99014b07
file + include/got_repository_admin.h
--- include/got_repository_admin.h
+++ include/got_repository_admin.h
@@ -70,7 +70,7 @@ typedef const struct got_error *(*got_cleanup_progress
/* A callback function which gets invoked with cleanup information to print. */
typedef const struct got_error *(*got_cleanup_progress_cb)(void *arg,
- int nloose, int ncommits, int npurged);
+ int nloose, int ncommits, int npurged, int nredundant);
/*
* Walk objects reachable via references to determine whether any loose
@@ -88,6 +88,12 @@ got_repo_purge_unreferenced_loose_objects(struct got_r
int ignore_mtime, got_cleanup_progress_cb progress_cb, void *progress_arg,
got_cancel_cb cancel_cb, void *cancel_arg);
+const struct got_error *
+got_repo_purge_redundant_packfiles(struct got_repository *repo,
+ off_t *before, off_t *size_after, int dry_run,
+ got_cleanup_progress_cb progress_cb, void *progress_arg,
+ got_cancel_cb cancel_cb, void *cancel_arg);
+
/* A callback function which gets invoked with cleanup information to print. */
typedef const struct got_error *(*got_lonely_packidx_progress_cb)(void *arg,
const char *path);
blob - 48bed8a15e5f885c25ca796741f37fb70f1b87e3
file + lib/repository_admin.c
--- lib/repository_admin.c
+++ lib/repository_admin.c
@@ -628,7 +628,7 @@ report_cleanup_progress(got_cleanup_progress_cb progre
if (err || !elapsed)
return err;
- return progress_cb(progress_arg, nloose, ncommits, npurged);
+ return progress_cb(progress_arg, nloose, ncommits, npurged, -1);
}
static const struct got_error *
@@ -1153,7 +1153,7 @@ got_repo_purge_unreferenced_loose_objects(struct got_r
if (nloose == 0) {
got_object_idset_free(loose_ids);
if (progress_cb) {
- err = progress_cb(progress_arg, 0, 0, 0);
+ err = progress_cb(progress_arg, 0, 0, 0, -1);
if (err)
return err;
}
@@ -1217,7 +1217,8 @@ got_repo_purge_unreferenced_loose_objects(struct got_r
/* Produce a final progress report. */
if (progress_cb) {
- err = progress_cb(progress_arg, nloose, ncommits, arg.npurged);
+ err = progress_cb(progress_arg, nloose, ncommits, arg.npurged,
+ -1);
if (err)
goto done;
}
@@ -1227,6 +1228,153 @@ static const struct got_error *
return err;
}
+const struct got_error *
+got_repo_purge_redundant_packfiles(struct got_repository *repo,
+ off_t *size_before, off_t *size_after, int dry_run,
+ got_cleanup_progress_cb progress_cb, void *progress_arg,
+ got_cancel_cb cancel_cb, void *cancel_arg)
+{
+ static const char *ext[] = {".pack", ".idx", ".rev", ".bitmap",
+ ".promisor", ".mtimes"};
+ const struct got_error *err;
+ struct got_packidx *best, *packidx;
+ struct got_object_idset *idset = NULL;
+ struct got_pathlist_entry *pe;
+ struct got_packidx_object_id *pid;
+ struct got_object_id id;
+ struct stat sb;
+ const char *packidx_path;
+ char *dot, path[PATH_MAX];
+ size_t i, nobjects;
+ int remove, packs = 0;
+
+ *size_before = 0;
+ *size_after = 0;
+
+ err = got_pack_find_pack_for_reuse(&best, repo);
+ if (err || best == NULL)
+ goto done;
+
+ idset = got_object_idset_alloc();
+ if (idset == NULL) {
+ err = got_error_from_errno("got_object_idset_alloc");
+ goto done;
+ }
+
+ nobjects = be32toh(best->hdr.fanout_table[0xff]);
+ for (i = 0; i < nobjects; ++i) {
+ pid = &best->hdr.sorted_ids[i];
+
+ memset(&id, 0, sizeof(id));
+ memcpy(&id.sha1, pid->sha1, sizeof(id.sha1));
+
+ err = got_object_idset_add(idset, &id, NULL);
+ if (err)
+ goto done;
+ }
+
+ TAILQ_FOREACH(pe, &repo->packidx_paths, entry) {
+ if (cancel_cb) {
+ err = (*cancel_cb)(cancel_arg);
+ if (err)
+ break;
+ }
+
+ packidx_path = pe->path;
+ err = got_repo_get_packidx(&packidx, packidx_path,
+ repo);
+ if (err)
+ goto done;
+
+ if (packidx == best) {
+ remove = 0;
+ } else {
+ nobjects = be32toh(packidx->hdr.fanout_table[0xff]);
+ for (i = 0; i < nobjects; ++i) {
+ pid = &packidx->hdr.sorted_ids[i];
+
+ memset(&id, 0, sizeof(id));
+ memcpy(&id.sha1, pid->sha1, sizeof(id.sha1));
+
+ if (!got_object_idset_contains(idset, &id))
+ break;
+ }
+
+ remove = (i == nobjects);
+ }
+
+ if (strlcpy(path, packidx_path, sizeof(path)) >=
+ sizeof(path)) {
+ err = got_error(GOT_ERR_NO_SPACE);
+ goto done;
+ }
+
+ /*
+ * For compatibility with Git, if a matching .keep
+ * file exist don't delete the packfile.
+ */
+ dot = strrchr(path, '.');
+ *dot = '\0';
+ if (strlcat(path, ".keep", sizeof(path)) >= sizeof(path)) {
+ err = got_error(GOT_ERR_NO_SPACE);
+ goto done;
+ }
+ if (faccessat(got_repo_get_fd(repo), path, F_OK, 0) == 0)
+ remove = 0;
+
+ if (remove)
+ packs++;
+
+ for (i = 0; i < nitems(ext); ++i) {
+ *dot = '\0';
+
+ if (strlcat(path, ext[i], sizeof(path)) >=
+ sizeof(path)) {
+ err = got_error(GOT_ERR_NO_SPACE);
+ goto done;
+ }
+
+ if (fstatat(got_repo_get_fd(repo), path, &sb, 0) ==
+ -1) {
+ if (errno == ENOENT &&
+ strcmp(ext[i], ".pack") != 0 &&
+ strcmp(ext[i], ".idx") != 0)
+ continue;
+ err = got_error_from_errno2("fstatat", path);
+ goto done;
+ }
+
+ *size_before += sb.st_size;
+ if (!remove) {
+ *size_after += sb.st_size;
+ continue;
+ }
+ err = progress_cb(progress_arg, -1, -1, -1, packs);
+ if (err)
+ goto done;
+
+ if (dry_run)
+ continue;
+
+ if (unlinkat(got_repo_get_fd(repo), path, 0) == -1) {
+ if (errno == ENOENT &&
+ strcmp(ext[i], ".pack") != 0 &&
+ strcmp(ext[i], ".idx") != 0)
+ continue;
+ err = got_error_from_errno2("unlinkat",
+ path);
+ goto done;
+ }
+ }
+ }
+
+ err = progress_cb(progress_arg, -1, -1, -1, packs);
+ done:
+ if (idset)
+ got_object_idset_free(idset);
+ return err;
+}
+
static const struct got_error *
remove_packidx(int dir_fd, const char *relpath)
{
blob - 2381958704d73cb1ebaeb0f7af8d2bf01b7d434f
file + regress/cmdline/cleanup.sh
--- regress/cmdline/cleanup.sh
+++ regress/cmdline/cleanup.sh
@@ -237,6 +237,62 @@ test_cleanup_precious_objects() {
test_done "$testroot" "$ret"
}
+test_cleanup_redundant_pack_files() {
+ local testroot=`test_init cleanup_redundant_pack_files`
+
+ # no pack files should exist yet
+ n=$(ls "$testroot/repo/.git/objects/pack" | wc -l | xargs)
+ if [ "$n" -ne 0 ]; then
+ echo "expected no pack file to exists, $n found" >&2
+ test_done "$testroot" 1
+ return 1
+ fi
+
+ # create a redundant pack with an associated .pack
+ hash=$(gotadmin pack -a -r "$testroot/repo" \
+ | awk '/^Indexed/{print $2}')
+ kpack="$testroot/repo/.git/objects/pack/pack-$hash"
+ touch "${kpack%.pack}.keep"
+
+ # create a few pack files, all with a different number of
+ # objects
+ for i in `jot 5`; do
+ gotadmin pack -a -r "$testroot/repo" >/dev/null
+ echo "alpha $i" > $testroot/repo/alpha
+ git_commit "$testroot/repo" -m "edit #$i"
+ done
+
+ # create a pack with all objects
+ gotadmin pack -a -r "$testroot/repo" >/dev/null
+
+ packs_before=$(gotadmin info -r "$testroot/repo" \
+ | awk '/^pack files/{print $3}')
+
+ gotadmin cleanup -r "$testroot/repo" | grep 'pack files? purged' \
+ | tail -1 > $testroot/stdout
+
+ echo "5 pack files purged" > $testroot/stdout.expected
+ if cmp -s "$testroot/stdout.expected" "$testroot/stdout"; then
+ diff -u "$testroot/stdout.expected" "$testroot/stdout"
+ test_done "$testroot" 1
+ return 1
+ fi
+
+ if [ ! -f "$kpack" ]; then
+ echo "$kpack disappeared unexpectedly" >&2
+ test_done "$testroot" 1
+ return 1
+ fi
+
+ if [ ! -f "${kpack%.pack}.keep" ]; then
+ echo "${kpack%.pack}.keep disappeared unexpectedly" >&2
+ test_done "$testroot" 1
+ return 1
+ fi
+
+ test_done "$testroot" 0
+}
+
test_cleanup_precious_objects() {
local testroot=`test_init cleanup_precious_objects`
@@ -365,5 +421,6 @@ run_test test_cleanup_precious_objects
test_parseargs "$@"
run_test test_cleanup_unreferenced_loose_objects
run_test test_cleanup_redundant_loose_objects
+run_test test_cleanup_redundant_pack_files
run_test test_cleanup_precious_objects
run_test test_cleanup_missing_pack_file
teach gotadmin cleanup to remove redundant packfiles