"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Omar Polo <op@omarpolo.com>
Subject:
better packfiles cleanup
To:
gameoftrees@openbsd.org
Date:
Tue, 11 Jul 2023 15:35:06 +0200

Download raw body.

Thread
The gotadmin cleanup algorithm for pack files is inadequate because it
only works in terms of unique objects contained in pack files, not
about reachability.

Let's say you have a pack file which holds objects for a few temporary
branches that are long gone.  Those objects are unreachable, but since
the objects in that pack file are not present in any other pack (are
'unique') then gotadmin cleanup won't delete that pack.

Here's an attempt at reworking this by taking into consideration if
the objects are also reachable or not.  I've refactored a bit how we
cleanup since now I need to walk the objects unconditionally and
there's some churn due to swapping the order of the arguments in the
reporting function.

cleanup.sh contains also some changes to address previous mistakes.
cmp(1) returns nonzero when the files differ and since the output was
changed we can't grep it like that anymore, so use -q and check the
number of pack files later with gotadmin info.

one more thing that should be done but will be addressed as follow-up
is to avoid removing pack files that are 'too new', much like we do
with loose object, and for the same reason.

ok?


diffstat refs/heads/main refs/heads/cl2
 M  gotadmin/gotadmin.c             |   14+   29-
 M  include/got_repository_admin.h  |    9+   26-
 M  lib/repository_admin.c          |  186+  167-
 M  regress/cmdline/cleanup.sh      |   10+   28-

4 files changed, 219 insertions(+), 250 deletions(-)

diff refs/heads/main refs/heads/cl2
commit - 02a7e21b3f3a4da1e7a12935ca467231cc145f46
commit + dab83a6e3622a6e4f818c9487b066404f6a0ccc7
blob - 3d82b9f6c3907dadf515414ac9882e54f076f4ff
blob + 9480ca036e0727eb650ea94c3379acdcb64a26b1
--- gotadmin/gotadmin.c
+++ gotadmin/gotadmin.c
@@ -1154,31 +1154,31 @@ cleanup_progress(void *arg, int nloose, int ncommits, 
 };
 
 static const struct got_error *
-cleanup_progress(void *arg, int nloose, int ncommits, int npurged,
+cleanup_progress(void *arg, int ncommits, int nloose, int npurged,
     int nredundant)
 {
 	struct got_cleanup_progress_arg *a = arg;
 	int print_loose = 0, print_commits = 0, print_purged = 0;
 	int print_redundant = 0;
 
+	if (a->last_ncommits != ncommits) {
+		print_commits = 1;
+		a->last_ncommits = ncommits;
+	}
 	if (a->last_nloose != nloose) {
+		print_commits = 1;
 		print_loose = 1;
 		a->last_nloose = nloose;
 	}
-	if (a->last_ncommits != ncommits) {
-		print_loose = 1;
-		print_commits = 1;
-		a->last_ncommits = ncommits;
-	}
 	if (a->last_npurged != npurged) {
-		print_loose = 1;
 		print_commits = 1;
+		print_loose = 1;
 		print_purged = 1;
 		a->last_npurged = npurged;
 	}
 	if (a->last_nredundant != nredundant) {
-		print_loose = 1;
 		print_commits = 1;
+		print_loose = 1;
 		print_purged = 1;
 		print_redundant = 1;
 		a->last_nredundant = nredundant;
@@ -1189,11 +1189,11 @@ cleanup_progress(void *arg, int nloose, int ncommits, 
 
 	if (print_loose || print_commits || print_purged || print_redundant)
 		printf("\r");
-	if (print_loose)
-		printf("%d loose object%s", nloose, nloose == 1 ? "" : "s");
 	if (print_commits)
-		printf("; %d commit%s scanned", ncommits,
+		printf("%d commit%s scanned", ncommits,
 		    ncommits == 1 ? "" : "s");
+	if (print_loose)
+		printf("; %d loose object%s", nloose, nloose == 1 ? "" : "s");
 	if (print_purged || print_redundant) {
 		if (a->dry_run) {
 			printf("; could purge %d object%s", npurged,
@@ -1251,7 +1251,6 @@ cmd_cleanup(int argc, char *argv[])
 	int ch, dry_run = 0, verbosity = 0;
 	int ncommits = 0, nloose = 0, npacked = 0;
 	int remove_lonely_packidx = 0, ignore_mtime = 0;
-	struct got_lockfile *lock = NULL;
 	struct got_cleanup_progress_arg cpa;
 	struct got_lonely_packidx_progress_arg lpa;
 	off_t loose_before, loose_after;
@@ -1323,10 +1322,6 @@ cmd_cleanup(int argc, char *argv[])
 		goto done;
 	}
 
-	error = got_repo_cleanup_prepare(repo, &lock);
-	if (error)
-		goto done;
-
 	if (remove_lonely_packidx) {
 		memset(&lpa, 0, sizeof(lpa));
 		lpa.dry_run = dry_run;
@@ -1337,25 +1332,16 @@ cmd_cleanup(int argc, char *argv[])
 	}
 
 	memset(&cpa, 0, sizeof(cpa));
-	cpa.last_ncommits = -1;
+	cpa.last_nloose = -1;
 	cpa.last_npurged = -1;
 	cpa.last_nredundant = -1;
 	cpa.dry_run = dry_run;
 	cpa.verbosity = verbosity;
 
-	error = got_repo_purge_unreferenced_loose_objects(repo,
-	    &loose_before, &loose_after, &ncommits, &nloose, &npacked,
+	error = got_repo_cleanup(repo, &loose_before, &loose_after,
+	    &pack_before, &pack_after, &ncommits, &nloose, &npacked,
 	    dry_run, ignore_mtime, cleanup_progress, &cpa,
 	    check_cancelled, NULL);
-	if (error) {
-		if (cpa.printed_something)
-			printf("\n");
-		goto done;
-	}
-
-	error = got_repo_purge_redundant_packfiles(repo, &pack_before,
-	    &pack_after, dry_run, ncommits, nloose, npacked,
-	    cleanup_progress, &cpa, check_cancelled, NULL);
 	if (cpa.printed_something)
 		printf("\n");
 	if (error)
@@ -1398,7 +1384,6 @@ done:
 	}
 
 done:
-	got_repo_cleanup_complete(repo, lock);
 	if (repo)
 		got_repo_close(repo);
 	if (pack_fds) {
blob - ea8cf37270d81a00cc006814c1e95430045f8277
blob + 7175f86e3c1f00b9b4d870af1638dc3f1068d951
--- include/got_repository_admin.h
+++ include/got_repository_admin.h
@@ -14,8 +14,6 @@ struct got_lockfile;
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
-struct got_lockfile;
-
 /* A callback function which gets invoked with progress information to print. */
 typedef const struct got_error *(*got_pack_progress_cb)(void *arg,
     int ncolored, int nfound, int ntrees, off_t packfile_size, int ncommits,
@@ -70,21 +68,9 @@ got_repo_list_pack(FILE *packfile, struct got_object_i
     struct got_repository *repo, got_pack_list_cb list_cb, void *list_arg,
     got_cancel_cb cancel_cb, void *cancel_arg);
 
-/*
- * Prepare for removing loose objects or redundant packfiles.
- *
- * These functions do the necessary locking in order to avoid
- * concurrent operation to irremediably damage the repository.
- */
-const struct got_error *
-got_repo_cleanup_prepare(struct got_repository *, struct got_lockfile **);
-
-const struct got_error *
-got_repo_cleanup_complete(struct got_repository *, struct got_lockfile *);
-
 /* A callback function which gets invoked with cleanup information to print. */
 typedef const struct got_error *(*got_cleanup_progress_cb)(void *arg,
-    int nloose, int ncommits, int npurged, int nredundant);
+    int ncommits, int nloose, int npurged, int nredundant);
 
 /*
  * Walk objects reachable via references to determine whether any loose
@@ -92,24 +78,21 @@ typedef const struct got_error *(*got_cleanup_progress
  * unless the dry_run parameter is set.
  * Do not remove objects with a modification timestamp above an
  * implementation-defined timestamp threshold, unless ignore_mtime is set.
- * Return the disk space size occupied by loose objects before and after
- * the operation.
+ * Remove packfiles which objects are either unreachable or provided
+ * by biggest pack files.
+ * Return the disk space size occupied by loose objects and pack files
+ * before and after the operation.
  * Return the number of loose objects which are also stored in a pack file.
  */
 const struct got_error *
-got_repo_purge_unreferenced_loose_objects(struct got_repository *repo,
-    off_t *size_before, off_t *size_after, int *ncommits, int *nloose,
+got_repo_cleanup(struct got_repository *repo,
+    off_t *loose_before, off_t *loose_after,
+    off_t *pack_before, off_t *pack_after,
+    int *ncommits, int *nloose,
     int *npacked, int dry_run, int ignore_mtime,
     got_cleanup_progress_cb progress_cb, void *progress_arg,
     got_cancel_cb cancel_cb, void *cancel_arg);
 
-const struct got_error *
-got_repo_purge_redundant_packfiles(struct got_repository *repo,
-    off_t *before, off_t *size_after, int dry_run,
-    int nloose, int ncommits, int npurged,
-    got_cleanup_progress_cb progress_cb, void *progress_arg,
-    got_cancel_cb cancel_cb, void *cancel_arg);
-
 /* A callback function which gets invoked with cleanup information to print. */
 typedef const struct got_error *(*got_lonely_packidx_progress_cb)(void *arg,
     const char *path);
blob - 19d5908c4cf536ad152ec319e3493ffdd5a1183d
blob + 1230b643fa320fa603fe0a8ed03d19e53a6b57e1
--- lib/repository_admin.c
+++ lib/repository_admin.c
@@ -613,9 +613,8 @@ const struct got_error *
 	return err;
 }
 
-const struct got_error *
-got_repo_cleanup_prepare(struct got_repository *repo,
-    struct got_lockfile **lk)
+static const struct got_error *
+repo_cleanup_lock(struct got_repository *repo, struct got_lockfile **lk)
 {
 	const struct got_error *err;
 	char myname[_POSIX_HOST_NAME_MAX + 1];
@@ -639,20 +638,10 @@ const struct got_error *
 	return NULL;
 }
 
-const struct got_error *
-got_repo_cleanup_complete(struct got_repository *repo,
-    struct got_lockfile *lk)
-{
-	if (lk == NULL)
-		return NULL;
-
-	return got_lockfile_unlock(lk, got_repo_get_fd(repo));
-}
-
 static const struct got_error *
 report_cleanup_progress(got_cleanup_progress_cb progress_cb,
     void *progress_arg, struct got_ratelimit *rl,
-    int nloose, int ncommits, int npurged, int nredundant)
+    int ncommits, int nloose, int npurged, int nredundant)
 {
 	const struct got_error *err;
 	int elapsed;
@@ -664,11 +653,13 @@ report_cleanup_progress(got_cleanup_progress_cb progre
 	if (err || !elapsed)
 		return err;
 
-	return progress_cb(progress_arg, nloose, ncommits, npurged, nredundant);
+	return progress_cb(progress_arg, ncommits, nloose, npurged,
+	    nredundant);
 }
 
 static const struct got_error *
-get_loose_object_ids(struct got_object_idset **loose_ids, off_t *ondisk_size,
+get_loose_object_ids(struct got_object_idset **loose_ids,
+    off_t *ondisk_size, int ncommits,
     got_cleanup_progress_cb progress_cb, void *progress_arg,
     struct got_ratelimit *rl, struct got_repository *repo)
 {
@@ -758,9 +749,9 @@ get_loose_object_ids(struct got_object_idset **loose_i
 			if (err)
 				goto done;
 			err = report_cleanup_progress(progress_cb,
-			    progress_arg, rl,
+			    progress_arg, rl, ncommits,
 			    got_object_idset_num_elements(*loose_ids),
-			    -1, -1, -1);
+			    -1, -1);
 			if (err)
 				goto done;
 		}
@@ -791,44 +782,9 @@ preserve_loose_object(struct got_object_idset *loose_i
 }
 
 static const struct got_error *
-preserve_loose_object(struct got_object_idset *loose_ids,
-    struct got_object_id *id, struct got_repository *repo, int *npacked)
-{
-	const struct got_error *err = NULL;
-	struct got_object *obj;
-
-	if (!got_object_idset_contains(loose_ids, id))
-		return NULL;
-
-	/*
-	 * Try to open this object from a pack file. This ensures that
-	 * we do in fact have a valid packed copy of the object. Otherwise
-	 * we should not delete the loose representation of this object.
-	 */
-	err = got_object_open_packed(&obj, id, repo);
-	if (err == NULL) {
-		got_object_close(obj);
-		/*
-		 * The object is referenced and packed.
-		 * We can purge the redundantly stored loose object.
-		 */
-		(*npacked)++;
-		return NULL;
-	} else if (err->code != GOT_ERR_NO_OBJ)
-		return err;
-
-	/*
-	 * This object is referenced and not packed.
-	 * Remove it from our purge set.
-	 */
-	return got_object_idset_remove(NULL, loose_ids, id);
-}
-
-static const struct got_error *
 load_tree_entries(struct got_object_id_queue *ids,
-    struct got_object_idset *loose_ids,
     struct got_object_idset *traversed_ids, struct got_object_id *tree_id,
-    const char *dpath, struct got_repository *repo, int *npacked,
+    const char *dpath, struct got_repository *repo,
     got_cancel_cb cancel_cb, void *cancel_arg)
 {
 	const struct got_error *err;
@@ -870,10 +826,6 @@ load_tree_entries(struct got_object_id_queue *ids,
 			STAILQ_INSERT_TAIL(ids, qid, entry);
 		} else if (S_ISREG(mode)) {
 			/* This blob is referenced. */
-			err = preserve_loose_object(loose_ids, id, repo,
-			    npacked);
-			if (err)
-				break;
 			err = got_object_idset_add(traversed_ids, id, NULL);
 			if (err)
 				break;
@@ -888,9 +840,9 @@ load_tree(struct got_object_idset *loose_ids,
 }
 
 static const struct got_error *
-load_tree(struct got_object_idset *loose_ids,
-    struct got_object_idset *traversed_ids, struct got_object_id *tree_id,
-    const char *dpath, struct got_repository *repo, int *npacked,
+load_tree(struct got_object_idset *traversed_ids,
+    struct got_object_id *tree_id,
+    const char *dpath, struct got_repository *repo,
     got_cancel_cb cancel_cb, void *cancel_arg)
 {
 	const struct got_error *err = NULL;
@@ -925,13 +877,8 @@ load_tree(struct got_object_idset *loose_ids,
 			break;
 		}
 
-		/* This tree is referenced. */
-		err = preserve_loose_object(loose_ids, &qid->id, repo, npacked);
-		if (err)
-			break;
-
-		err = load_tree_entries(&tree_ids, loose_ids, traversed_ids,
-		    &qid->id, dpath, repo, npacked, cancel_cb, cancel_arg);
+		err = load_tree_entries(&tree_ids, traversed_ids,
+		    &qid->id, dpath, repo, cancel_cb, cancel_arg);
 		got_object_qid_free(qid);
 		if (err)
 			break;
@@ -942,12 +889,10 @@ load_commit_or_tag(struct got_object_idset *loose_ids,
 }
 
 static const struct got_error *
-load_commit_or_tag(struct got_object_idset *loose_ids, int *ncommits,
-    int *npacked, struct got_object_idset *traversed_ids,
+load_commit_or_tag(int *ncommits, struct got_object_idset *traversed_ids,
     struct got_object_id *id, struct got_repository *repo,
     got_cleanup_progress_cb progress_cb, void *progress_arg,
-    struct got_ratelimit *rl, int nloose,
-    got_cancel_cb cancel_cb, void *cancel_arg)
+    struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg)
 {
 	const struct got_error *err;
 	struct got_commit_object *commit = NULL;
@@ -984,11 +929,6 @@ load_commit_or_tag(struct got_object_idset *loose_ids,
 		if (err)
 			break;
 
-		/* This commit or tag is referenced. */
-		err = preserve_loose_object(loose_ids, &qid->id, repo, npacked);
-		if (err)
-			break;
-
 		err = got_object_get_type(&obj_type, repo, &qid->id);
 		if (err)
 			break;
@@ -1030,24 +970,22 @@ load_commit_or_tag(struct got_object_idset *loose_ids,
 				/*
 				 * Tag points at something other than a
 				 * commit or tree. Leave this weird tag object
-				 * and the object it points to on disk.
+				 * and the object it points to.
 				 */
-				err = got_object_idset_remove(NULL, loose_ids,
-				    &qid->id);
-				if (err && err->code != GOT_ERR_NO_OBJ)
+				if (got_object_idset_contains(traversed_ids,
+				    got_object_tag_get_object_id(tag)))
+					break;
+				err = got_object_idset_add(traversed_ids,
+				    got_object_tag_get_object_id(tag), NULL);
+				if (err)
 					goto done;
-				err = got_object_idset_remove(NULL, loose_ids,
-				    got_object_tag_get_object_id(tag));
-				if (err && err->code != GOT_ERR_NO_OBJ)
-					goto done;
-				err = NULL;
 				break;
 			}
 		}
 
 		if (tree_id) {
-			err = load_tree(loose_ids, traversed_ids, tree_id, "",
-			    repo, npacked, cancel_cb, cancel_arg);
+			err = load_tree(traversed_ids, tree_id, "",
+			    repo, cancel_cb, cancel_arg);
 			if (err)
 				break;
 		}
@@ -1056,7 +994,7 @@ load_commit_or_tag(struct got_object_idset *loose_ids,
 			(*ncommits)++; /* scanned tags are counted as commits */
 
 		err = report_cleanup_progress(progress_cb, progress_arg, rl,
-		    nloose, *ncommits, -1, -1);
+		    *ncommits, -1, -1, -1);
 		if (err)
 			break;
 
@@ -1088,13 +1026,35 @@ struct purge_loose_object_arg {
 	return err;
 }
 
+static const struct got_error *
+is_object_packed(int *packed, struct got_repository *repo,
+    struct got_object_id *id)
+{
+	const struct got_error *err;
+	struct got_object *obj;
+
+	*packed = 0;
+
+	err = got_object_open_packed(&obj, id, repo);
+	if (err) {
+		if (err->code == GOT_ERR_NO_OBJ)
+			err = NULL;
+		return err;
+	}
+	got_object_close(obj);
+	*packed = 1;
+	return NULL;
+}
+
 struct purge_loose_object_arg {
 	struct got_repository *repo;
 	got_cleanup_progress_cb progress_cb;
 	void *progress_arg;
 	struct got_ratelimit *rl;
+	struct got_object_idset *traversed_ids;
 	int nloose;
 	int ncommits;
+	int npacked;
 	int npurged;
 	off_t size_purged;
 	int dry_run;
@@ -1108,10 +1068,20 @@ purge_loose_object(struct got_object_id *id, void *dat
 	struct purge_loose_object_arg *a = arg;
 	const struct got_error *err, *unlock_err = NULL;
 	char *path = NULL;
-	int fd = -1;
+	int packed, fd = -1;
 	struct stat sb;
 	struct got_lockfile *lf = NULL;
 
+	err = is_object_packed(&packed, a->repo, id);
+	if (err)
+		return err;
+
+	if (!packed && got_object_idset_contains(a->traversed_ids, id))
+		return NULL;
+
+	if (packed)
+		a->npacked++;
+
 	err = got_object_get_path(&path, id, a->repo);
 	if (err)
 		return err;
@@ -1145,7 +1115,7 @@ purge_loose_object(struct got_object_id *id, void *dat
 		a->npurged++;
 		a->size_purged += sb.st_size;
 		err = report_cleanup_progress(a->progress_cb, a->progress_arg,
-		    a->rl, a->nloose, a->ncommits, a->npurged, -1);
+		    a->rl, a->ncommits, a->nloose, a->npurged, -1);
 		if (err)
 			goto done;
 	}
@@ -1158,33 +1128,21 @@ const struct got_error *
 	return err ? err : unlock_err;
 }
 
-const struct got_error *
-got_repo_purge_unreferenced_loose_objects(struct got_repository *repo,
-    off_t *size_before, off_t *size_after, int *ncommits, int *nloose,
-    int *npacked, int dry_run, int ignore_mtime,
+static const struct got_error *
+repo_purge_unreferenced_loose_objects(struct got_repository *repo,
+    struct got_object_idset *traversed_ids,
+    off_t *size_before, off_t *size_after, int ncommits, int *nloose,
+    int *npacked, int *npurged, int dry_run, int ignore_mtime,
+    time_t max_mtime, struct got_ratelimit *rl,
     got_cleanup_progress_cb progress_cb, void *progress_arg,
     got_cancel_cb cancel_cb, void *cancel_arg)
 {
 	const struct got_error *err;
 	struct got_object_idset *loose_ids;
-	struct got_object_idset *traversed_ids;
-	struct got_object_id **referenced_ids;
-	int i, nreferenced;
-	struct got_reflist_head refs;
-	struct got_reflist_entry *re;
 	struct purge_loose_object_arg arg;
-	time_t max_mtime = 0;
-	struct got_ratelimit rl;
 
-	TAILQ_INIT(&refs);
-	got_ratelimit_init(&rl, 0, 500);
-
-	*size_before = 0;
-	*size_after = 0;
-	*npacked = 0;
-
-	err = get_loose_object_ids(&loose_ids, size_before,
-	    progress_cb, progress_arg, &rl, repo);
+	err = get_loose_object_ids(&loose_ids, size_before, ncommits,
+	    progress_cb, progress_arg, rl, repo);
 	if (err)
 		return err;
 	*nloose = got_object_idset_num_elements(loose_ids);
@@ -1198,71 +1156,36 @@ got_repo_purge_unreferenced_loose_objects(struct got_r
 		return NULL;
 	}
 
-	traversed_ids = got_object_idset_alloc();
-	if (traversed_ids == NULL) {
-		err = got_error_from_errno("got_object_idset_alloc");
-		goto done;
-	}
-
-	err = got_ref_list(&refs, repo, "", got_ref_cmp_by_name, NULL);
-	if (err)
-		goto done;
-	if (!ignore_mtime) {
-		TAILQ_FOREACH(re, &refs, entry) {
-			time_t mtime = got_ref_get_mtime(re->ref);
-			if (mtime > max_mtime)
-				max_mtime = mtime;
-		}
-		/*
-		 * For safety, keep objects created within 10 minutes
-		 * before the youngest reference was created.
-		 */
-		if (max_mtime >= 600)
-			max_mtime -= 600;
-	}
-
-	err = get_reflist_object_ids(&referenced_ids, &nreferenced,
-	    (1 << GOT_OBJ_TYPE_COMMIT) | (1 << GOT_OBJ_TYPE_TAG),
-	    &refs, repo, cancel_cb, cancel_arg);
-	if (err)
-		goto done;
-
-	for (i = 0; i < nreferenced; i++) {
-		struct got_object_id *id = referenced_ids[i];
-		err = load_commit_or_tag(loose_ids, ncommits, npacked,
-		    traversed_ids, id, repo, progress_cb, progress_arg, &rl,
-		    *nloose, cancel_cb, cancel_arg);
-		if (err)
-			goto done;
-	}
-
-	/* Any remaining loose objects are unreferenced and can be purged. */
+	memset(&arg, 0, sizeof(arg));
 	arg.repo = repo;
 	arg.progress_arg = progress_arg;
 	arg.progress_cb = progress_cb;
-	arg.rl = &rl;
+	arg.rl = rl;
+	arg.traversed_ids = traversed_ids;
 	arg.nloose = *nloose;
+	arg.npacked = 0;
 	arg.npurged = 0;
 	arg.size_purged = 0;
-	arg.ncommits = *ncommits;
 	arg.dry_run = dry_run;
 	arg.max_mtime = max_mtime;
 	arg.ignore_mtime = ignore_mtime;
 	err = got_object_idset_for_each(loose_ids, purge_loose_object, &arg);
 	if (err)
 		goto done;
+
 	*size_after = *size_before - arg.size_purged;
+	*npacked = arg.npacked;
+	*npurged = arg.npurged;
 
 	/* Produce a final progress report. */
 	if (progress_cb) {
-		err = progress_cb(progress_arg, *nloose, *ncommits, arg.npurged,
-		    -1);
+		err = progress_cb(progress_arg, ncommits, *nloose,
+		    arg.npurged, -1);
 		if (err)
 			goto done;
 	}
 done:
 	got_object_idset_free(loose_ids);
-	got_object_idset_free(traversed_ids);
 	return err;
 }
 
@@ -1328,6 +1251,7 @@ pack_is_redundant(int *redundant, struct got_repositor
 
 static const struct got_error *
 pack_is_redundant(int *redundant, struct got_repository *repo,
+    struct got_object_idset *traversed_ids,
     const char *packidx_path, struct got_object_idset *idset)
 {
 	const struct got_error *err;
@@ -1352,6 +1276,9 @@ pack_is_redundant(int *redundant, struct got_repositor
 		if (got_object_idset_contains(idset, &id))
 			continue;
 
+		if (!got_object_idset_contains(traversed_ids, &id))
+			continue;
+
 		*redundant = 0;
 		err = got_object_idset_add(idset, &id, NULL);
 		if (err)
@@ -1380,10 +1307,11 @@ const struct got_error *
 	return 1;
 }
 
-const struct got_error *
-got_repo_purge_redundant_packfiles(struct got_repository *repo,
+static const struct got_error *
+repo_purge_redundant_packfiles(struct got_repository *repo,
+    struct got_object_idset *traversed_ids,
     off_t *size_before, off_t *size_after, int dry_run,
-    int nloose, int ncommits, int npurged,
+    int nloose, int ncommits, int npurged, struct got_ratelimit *rl,
     got_cleanup_progress_cb progress_cb, void *progress_arg,
     got_cancel_cb cancel_cb, void *cancel_arg)
 {
@@ -1394,13 +1322,7 @@ got_repo_purge_redundant_packfiles(struct got_reposito
 	struct got_pathlist_entry *pe;
 	size_t i, npacks;
 	int remove, redundant_packs = 0;
-	struct got_ratelimit rl;
 
-	got_ratelimit_init(&rl, 0, 500);
-
-	*size_before = 0;
-	*size_after = 0;
-
 	npacks = 0;
 	TAILQ_FOREACH(pe, &repo->packidx_paths, entry)
 		npacks++;
@@ -1437,7 +1359,8 @@ got_repo_purge_redundant_packfiles(struct got_reposito
 				break;
 		}
 
-		err = pack_is_redundant(&remove, repo, sorted[i].path, idset);
+		err = pack_is_redundant(&remove, repo, traversed_ids,
+		    sorted[i].path, idset);
 		if (err)
 			goto done;
 		err = purge_redundant_pack(repo, sorted[i].path, dry_run,
@@ -1447,14 +1370,14 @@ got_repo_purge_redundant_packfiles(struct got_reposito
 		if (!remove)
 			continue;
 		err = report_cleanup_progress(progress_cb, progress_arg,
-		    &rl, nloose, ncommits, npurged, ++redundant_packs);
+		    rl, ncommits, nloose, npurged, ++redundant_packs);
 		if (err)
 			goto done;
 	}
 
 	/* Produce a final progress report. */
 	if (progress_cb) {
-		err = progress_cb(progress_arg, nloose, ncommits, npurged,
+		err = progress_cb(progress_arg, ncommits, nloose, npurged,
 		    redundant_packs);
 		if (err)
 			goto done;
@@ -1467,6 +1390,102 @@ got_repo_remove_lonely_packidx(struct got_repository *
 }
 
 const struct got_error *
+got_repo_cleanup(struct got_repository *repo,
+    off_t *loose_before, off_t *loose_after,
+    off_t *pack_before, off_t *pack_after,
+    int *ncommits, int *nloose, int *npacked, int dry_run, int ignore_mtime,
+    got_cleanup_progress_cb progress_cb, void *progress_arg,
+    got_cancel_cb cancel_cb, void *cancel_arg)
+{
+	const struct got_error *unlock_err, *err = NULL;
+	struct got_lockfile *lk = NULL;
+	struct got_ratelimit rl;
+	struct got_reflist_head refs;
+	struct got_object_idset *traversed_ids = NULL;
+	struct got_reflist_entry *re;
+	struct got_object_id **referenced_ids;
+	int i, nreferenced;
+	int npurged = 0;
+	time_t max_mtime = 0;
+
+	TAILQ_INIT(&refs);
+	got_ratelimit_init(&rl, 0, 500);
+
+	*loose_before = 0;
+	*loose_after = 0;
+	*pack_before = 0;
+	*pack_after = 0;
+	*ncommits = 0;
+	*nloose = 0;
+	*npacked = 0;
+
+	err = repo_cleanup_lock(repo, &lk);
+	if (err)
+		return err;
+
+	traversed_ids = got_object_idset_alloc();
+	if (traversed_ids == NULL) {
+		err = got_error_from_errno("got_object_idset_alloc");
+		goto done;
+	}
+
+	err = got_ref_list(&refs, repo, "", got_ref_cmp_by_name, NULL);
+	if (err)
+		goto done;
+	if (!ignore_mtime) {
+		TAILQ_FOREACH(re, &refs, entry) {
+			time_t mtime = got_ref_get_mtime(re->ref);
+			if (mtime > max_mtime)
+				max_mtime = mtime;
+		}
+		/*
+		 * For safety, keep objects created within 10 minutes
+		 * before the youngest reference was created.
+		 */
+		if (max_mtime >= 600)
+			max_mtime -= 600;
+	}
+
+	err = get_reflist_object_ids(&referenced_ids, &nreferenced,
+	    (1 << GOT_OBJ_TYPE_COMMIT) | (1 << GOT_OBJ_TYPE_TAG),
+	    &refs, repo, cancel_cb, cancel_arg);
+	if (err)
+		goto done;
+
+	for (i = 0; i < nreferenced; i++) {
+		struct got_object_id *id = referenced_ids[i];
+		err = load_commit_or_tag(ncommits, traversed_ids,
+		    id, repo, progress_cb, progress_arg, &rl,
+		    cancel_cb, cancel_arg);
+		if (err)
+			goto done;
+	}
+
+	err = repo_purge_unreferenced_loose_objects(repo, traversed_ids,
+	    loose_before, loose_after, *ncommits, nloose, npacked, &npurged,
+	    dry_run, ignore_mtime, max_mtime, &rl, progress_cb, progress_arg,
+	    cancel_cb, cancel_arg);
+	if (err)
+		goto done;
+
+	err = repo_purge_redundant_packfiles(repo, traversed_ids,
+	    pack_before, pack_after, dry_run, *nloose, *ncommits, npurged,
+	    &rl, progress_cb, progress_arg, cancel_cb, cancel_arg);
+	if (err)
+		goto done;
+
+ done:
+	if (lk) {
+		unlock_err = got_lockfile_unlock(lk, got_repo_get_fd(repo));
+		if (err == NULL)
+			err = unlock_err;
+	}
+	if (traversed_ids)
+		got_object_idset_free(traversed_ids);
+	return err;
+}
+
+const struct got_error *
 got_repo_remove_lonely_packidx(struct got_repository *repo, int dry_run,
     got_lonely_packidx_progress_cb progress_cb, void *progress_arg,
     got_cancel_cb cancel_cb, void *cancel_arg)
blob - f6ddbb9657a29cfb443245457d038aee5b06d364
blob + 757a755adad5a2bca886449ea29bc283017dd221
--- regress/cmdline/cleanup.sh
+++ regress/cmdline/cleanup.sh
@@ -266,16 +266,15 @@ test_cleanup_redundant_pack_files() {
 	gotadmin pack -a -r "$testroot/repo" >/dev/null
 	gotadmin pack -a -r "$testroot/repo" >/dev/null
 
-	gotadmin cleanup -r "$testroot/repo" | grep 'pack files? purged' \
-		| tail -1 > $testroot/stdout
+	# create another one with unreachable objects
+	(cd "$testroot/repo" && git checkout -q -b tempbranch)
+	echo "modified alpha on tempbranch" >$testroot/repo/alpha
+	git_commit "$testroot/repo" -m "edit alpha on tempbranch"
+	gotadmin pack -a -r "$testroot/repo" >/dev/null
+	(cd "$testroot/repo" && git checkout -q master)
+	(cd "$testroot/repo" && got branch -d tempbranch) >/dev/null
 
-	echo "5 pack files purged" > $testroot/stdout.expected
-	if cmp -s "$testroot/stdout.expected" "$testroot/stdout"; then
-		diff -u "$testroot/stdout.expected" "$testroot/stdout"
-		test_done "$testroot" 1
-		return 1
-	fi
-
+	gotadmin cleanup -q -r "$testroot/repo"
 	n=$(gotadmin info -r "$testroot/repo" | awk '/^pack files/{print $3}')
 	if [ "$n" -ne 2 ]; then
 		echo "expected 2 pack files left, $n found instead" >&2
@@ -302,16 +301,8 @@ test_cleanup_redundant_pack_files() {
 	done
 	gotadmin pack -r "$testroot/repo" >/dev/null
 
-	gotadmin cleanup -r "$testroot/repo" | grep 'pack files? purged' \
-	    | tail -1 > $testroot/stdout
+	gotadmin cleanup -q -r "$testroot/repo"
 
-	echo "0 pack files purged" > $testroot/stdout.expected
-	if cmp -s "$testroot/stdout.expected" "$testroot/stdout"; then
-		diff -u "$testroot/stdout.expected" "$testroot/stdout"
-		test_done "$testroot" 1
-		return 1
-	fi
-
 	n=$(gotadmin info -r "$testroot/repo" | awk '/^pack files/{print $3}')
 	if [ "$n" -ne 3 ]; then
 		echo "expected 3 pack files left, $n found instead" >&2
@@ -333,16 +324,7 @@ test_cleanup_redundant_pack_files() {
 
 	gotadmin pack -a -x master -r "$testroot/repo" >/dev/null
 
-	gotadmin cleanup -r "$testroot/repo" | grep 'pack files? purged' \
-		| tail -1 > $testroot/stdout
-
-	echo "6 pack files purged" > $testroot/stdout.expected
-	if cmp -s "$testroot/stdout.expected" "$testroot/stdout"; then
-		diff -u "$testroot/stdout.expected" "$testroot/stdout"
-		test_done "$testroot" 1
-		return 1
-	fi
-
+	gotadmin cleanup -q -r "$testroot/repo"
 	n=$(gotadmin info -r "$testroot/repo" | awk '/^pack files/{print $3}')
 	if [ "$n" -ne 3 ]; then
 		echo "expected 3 pack files left, $n found instead" >&2