"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
reuse delta-base objects during pack file creation
To:
gameoftrees@openbsd.org
Date:
Fri, 20 Feb 2026 22:49:31 +0100

Download raw body.

Thread
  • Stefan Sperling:

    reuse delta-base objects during pack file creation

When cloning large repositories from gotd (e.g. ports.git) there is a
noticeable tail end during the deltification phase where progress becomes
slower. Time spent deltifying the last, say, 10% of objects may even
exceed the time spent on the first 90% of objects.

The reason is that we reuse existing deltas until we hit that 90% mark and
then run deltification on the remaining objects. In this deltification phase
we do not copy delta-bases as they are, but we deltify them.
This takes time, and we gain very little from doing this work. Someone else
has already spent effort finding optimal delta bases while the pack file
which stores the delta-base object was created. And by deltifying delta-bases
we make delta chains longer, which can make unpacking slower.

Git reuses delta-base objects directly, too, so this is a proven approach.

We need two changes for this optimization, both combined in the diff below.

1) Add flags to the raw object data structure which tell us whether a raw
object was found in a pack file and, if so, whether it is stored as a delta
or as a delta-base (i.e. a verbatim copy of the object).

2) Skip the deltification loop for packed delta-bases.
We must still initialize m->dtab for these objects since they might be
used as a base during deltification of other objects. But we can skip the
expensive hunt for delta-base candidates. The code which writes out the
pack file will see that no delta has been calculated for these objects and
simply copy them to the generated pack file as-is.

Loose objects and non-reused deltified objects are still (re-)deltified
as before. To speed things up further in the future we could look into
reusing deltas from multiple pack files, though I am not sure if the
additional complexity would be worth it. Well-maintained repositories
have one large pack file and a handful of small ones.

While copying objects between packs we have decompression/recompression
overhead, which I believe Git manages to avoid. This might be another
path to improving performance in the future.

ok?

M  lib/got_lib_object.h                       |   4+   0-
M  lib/got_lib_privsep.h                      |   3+   2-
M  lib/object_open_io.c                       |   8+   3-
M  lib/object_open_privsep.c                  |  22+  18-
M  lib/pack_create.c                          |  19+   0-
M  lib/privsep.c                              |   4+   2-
M  libexec/got-read-object/got-read-object.c  |   1+   1-
M  libexec/got-read-pack/got-read-pack.c      |   4+   1-

8 files changed, 65 insertions(+), 27 deletions(-)

commit - 9456c7974d487ec39d90e4fd16887cf464d3841e
commit + 14e4379835382ffd9432c3ee46ddb4f6e3c87a9d
blob - 0272a5dfbad3568d5ae28e7d1c782d936770da89
blob + 71d67d7703ccd903eb9164f8d467b0f268344c6b
--- lib/got_lib_object.h
+++ lib/got_lib_object.h
@@ -45,6 +45,10 @@ struct got_raw_object {
 	size_t hdrlen;
 	int refcnt;		/* > 0 if open and/or cached */
 
+	int flags;
+#define GOT_RAW_OBJ_FLAG_PACKED		0x01
+#define GOT_RAW_OBJ_FLAG_DELTIFIED	0x02
+
 	got_object_raw_close_cb *close_cb;
 	void *close_arg;
 };
blob - bee307699142e88ce12f0d346090e26d9b6238d3
blob + 5d45acef7f774410e32a0b1a920e76ff356930e8
--- lib/got_lib_privsep.h
+++ lib/got_lib_privsep.h
@@ -301,6 +301,7 @@ struct got_imsg_blob {
 struct got_imsg_raw_obj {
 	off_t size;
 	size_t hdrlen;
+	int flags;
 
 	/*
 	 * If size <= GOT_PRIVSEP_INLINE_OBJECT_DATA_MAX, object data follows
@@ -740,9 +741,9 @@ const struct got_error *got_privsep_get_imsg_obj(struc
 const struct got_error *got_privsep_recv_obj(struct got_object **,
     struct imsgbuf *);
 const struct got_error *got_privsep_send_raw_obj(struct imsgbuf *, off_t,
-    size_t, uint8_t *);
+    size_t, int, uint8_t *);
 const struct got_error *got_privsep_recv_raw_obj(uint8_t **, off_t *, size_t *,
-    struct imsgbuf *);
+    int *, struct imsgbuf *);
 const struct got_error *got_privsep_send_commit(struct imsgbuf *,
     struct got_commit_object *);
 const struct got_error *got_privsep_recv_commit(struct got_commit_object **,
blob - 7833a0beab9c4e9d117d69bf91ac068c17f523a9
blob + 1e71d08fb2e3518f6bf8f80909e64202b17a3252
--- lib/object_open_io.c
+++ lib/object_open_io.c
@@ -191,8 +191,8 @@ wrap_fd(FILE **f, int wrapped_fd)
 
 static const struct got_error *
 read_packed_object_raw(uint8_t **outbuf, off_t *size, size_t *hdrlen,
-    int outfd, struct got_pack *pack, struct got_packidx *packidx, int idx,
-    struct got_object_id *id)
+    int *flags, int outfd, struct got_pack *pack,
+    struct got_packidx *packidx, int idx, struct got_object_id *id)
 {
 	const struct got_error *err = NULL;
 	uint64_t raw_size = 0;
@@ -202,6 +202,7 @@ read_packed_object_raw(uint8_t **outbuf, off_t *size, 
 	*outbuf = NULL;
 	*size = 0;
 	*hdrlen = 0;
+	*flags = GOT_RAW_OBJ_FLAG_PACKED;
 
 	err = got_packfile_open_object(&obj, pack, packidx, idx, id);
 	if (err)
@@ -211,6 +212,7 @@ read_packed_object_raw(uint8_t **outbuf, off_t *size, 
 		err = got_pack_get_max_delta_object_size(&raw_size, obj, pack);
 		if (err)
 			goto done;
+		*flags |= GOT_RAW_OBJ_FLAG_DELTIFIED;
 	} else
 		raw_size = obj->size;
 
@@ -276,6 +278,7 @@ got_object_raw_open(struct got_raw_object **obj, int *
 	off_t size = 0;
 	size_t hdrlen = 0;
 	char *path_packfile = NULL;
+	int flags = 0;
 
 	*obj = got_repo_get_cached_raw_object(repo, id);
 	if (*obj != NULL) {
@@ -303,7 +306,7 @@ got_object_raw_open(struct got_raw_object **obj, int *
 			if (err)
 				goto done;
 		}
-		err = read_packed_object_raw(&outbuf, &size, &hdrlen,
+		err = read_packed_object_raw(&outbuf, &size, &hdrlen, &flags,
 		    tempfd, pack, packidx, idx, id);
 		if (err)
 			goto done;
@@ -342,6 +345,8 @@ got_object_raw_open(struct got_raw_object **obj, int *
 	    GOT_DELTA_RESULT_SIZE_CACHED_MAX, hdrlen, size);
 	if (err)
 		goto done;
+	
+	(*obj)->flags = flags;
 
 	err = got_repo_cache_raw_object(repo, id, *obj);
 	if (err) {
blob - dbfa06210469a59bc5a940787ba3ace5381315f0
blob + 5aab464c1c4b38c29510f1a0ddd2604f84f49ee7
--- lib/object_open_privsep.c
+++ lib/object_open_privsep.c
@@ -112,7 +112,8 @@ done:
 
 static const struct got_error *
 request_packed_object_raw(uint8_t **outbuf, off_t *size, size_t *hdrlen,
-    int outfd, struct got_pack *pack, int idx, struct got_object_id *id)
+    int *flags, int outfd, struct got_pack *pack, int idx,
+    struct got_object_id *id)
 {
 	const struct got_error *err = NULL;
 	struct imsgbuf *ibuf = pack->privsep_child->ibuf;
@@ -136,7 +137,7 @@ request_packed_object_raw(uint8_t **outbuf, off_t *siz
 	if (err)
 		return err;
 
-	err = got_privsep_recv_raw_obj(outbuf, size, hdrlen, ibuf);
+	err = got_privsep_recv_raw_obj(outbuf, size, hdrlen, flags, ibuf);
 	if (err)
 		return err;
 
@@ -161,8 +162,8 @@ read_packed_object_privsep(struct got_object **obj,
 
 static const struct got_error *
 read_packed_object_raw_privsep(uint8_t **outbuf, off_t *size, size_t *hdrlen,
-    int outfd, struct got_pack *pack, struct got_packidx *packidx, int idx,
-    struct got_object_id *id)
+    int *flags, int outfd, struct got_pack *pack, struct got_packidx *packidx,
+    int idx, struct got_object_id *id)
 {
 	const struct got_error *err = NULL;
 
@@ -172,8 +173,8 @@ read_packed_object_raw_privsep(uint8_t **outbuf, off_t
 			return err;
 	}
 
-	return request_packed_object_raw(outbuf, size, hdrlen, outfd, pack,
-	    idx, id);
+	return request_packed_object_raw(outbuf, size, hdrlen, flags,
+	    outfd, pack, idx, id);
 }
 
 const struct got_error *
@@ -294,8 +295,8 @@ request_object(struct got_object **obj, struct got_obj
 }
 
 static const struct got_error *
-request_raw_object(uint8_t **outbuf, off_t *size, size_t *hdrlen, int outfd,
-    struct got_object_id *id, struct got_repository *repo, int infd)
+request_raw_object(uint8_t **outbuf, off_t *size, size_t *hdrlen, int *flags,
+    int outfd, struct got_object_id *id, struct got_repository *repo, int infd)
 {
 	const struct got_error *err = NULL;
 	struct imsgbuf *ibuf;
@@ -315,7 +316,7 @@ request_raw_object(uint8_t **outbuf, off_t *size, size
 	if (err)
 		return err;
 
-	return got_privsep_recv_raw_obj(outbuf, size, hdrlen, ibuf);
+	return got_privsep_recv_raw_obj(outbuf, size, hdrlen, flags, ibuf);
 }
 
 static const struct got_error *
@@ -410,21 +411,21 @@ got_object_read_header_privsep(struct got_object **obj
 
 static const struct got_error *
 read_object_raw_privsep(uint8_t **outbuf, off_t *size, size_t *hdrlen,
-    int outfd, struct got_object_id *id, struct got_repository *repo,
-    int obj_fd)
+    int *flags, int outfd, struct got_object_id *id,
+    struct got_repository *repo, int obj_fd)
 {
 	const struct got_error *err;
 
 	if (repo->privsep_children[GOT_REPO_PRIVSEP_CHILD_OBJECT].imsg_fd != -1)
-		return request_raw_object(outbuf, size, hdrlen, outfd, id,
-		    repo, obj_fd);
+		return request_raw_object(outbuf, size, hdrlen, flags,
+		    outfd, id, repo, obj_fd);
 
 	err = start_child(repo, GOT_REPO_PRIVSEP_CHILD_OBJECT);
 	if (err)
 		return err;
 
-	return request_raw_object(outbuf, size, hdrlen, outfd, id, repo,
-	    obj_fd);
+	return request_raw_object(outbuf, size, hdrlen, flags,
+	    outfd, id, repo, obj_fd);
 }
 
 const struct got_error *
@@ -474,6 +475,7 @@ got_object_raw_open(struct got_raw_object **obj, int *
 	off_t size = 0;
 	size_t hdrlen = 0;
 	char *path_packfile = NULL;
+	int flags = 0;
 
 	*obj = got_repo_get_cached_raw_object(repo, id);
 	if (*obj != NULL) {
@@ -504,7 +506,7 @@ got_object_raw_open(struct got_raw_object **obj, int *
 				goto done;
 		}
 		err = read_packed_object_raw_privsep(&outbuf, &size, &hdrlen,
-		    *outfd, pack, packidx, idx, id);
+		    &flags, *outfd, pack, packidx, idx, id);
 		if (err)
 			goto done;
 	} else if (err->code == GOT_ERR_NO_OBJ) {
@@ -513,8 +515,8 @@ got_object_raw_open(struct got_raw_object **obj, int *
 		err = got_object_open_loose_fd(&fd, id, repo);
 		if (err)
 			goto done;
-		err = read_object_raw_privsep(&outbuf, &size, &hdrlen, *outfd,
-		    id, repo, fd);
+		err = read_object_raw_privsep(&outbuf, &size, &hdrlen, &flags,
+		    *outfd, id, repo, fd);
 		if (err)
 			goto done;
 	}
@@ -524,6 +526,8 @@ got_object_raw_open(struct got_raw_object **obj, int *
 	if (err)
 		goto done;
 
+	(*obj)->flags = flags;
+
 	err = got_repo_cache_raw_object(repo, id, *obj);
 done:
 	free(path_packfile);
blob - b0aa5bc15b2f5adb37972b7c68f53df5bbe8485d
blob + 88ae6a7f7857b14079689f7c4e8f26ef6cebdfce
--- lib/pack_create.c
+++ lib/pack_create.c
@@ -598,6 +598,25 @@ pick_deltas(struct got_pack_meta **meta, int nmeta, in
 			n->dtab = NULL;
 		}
 
+		/*
+		 * Do not deltify objects which are already packed and not
+		 * deltified. The object is likely a delta-base for deltas
+		 * we are reusing, and deltifying bases is a waste of time.
+		 * We can just copy them to our new pack file directly.
+		 * In the worst case we could end up using a bit more disk
+		 * space. But our newly created pack will be done sooner.
+		 *
+		 * We cannot skip deltified objects here because we cannot
+		 * tell whether their base object will be included in the
+		 * pack file we are generating.
+		 */
+		if ((raw->flags & GOT_RAW_OBJ_FLAG_PACKED) &&
+		    (raw->flags & GOT_RAW_OBJ_FLAG_DELTIFIED) == 0) {
+			got_object_raw_close(raw);
+			raw = NULL;
+			continue;
+		}
+
 		best_size = raw->size;
 		best_ndeltas = 0;
 		for (j = MAX(0, i - max_base_candidates); j < i; j++) {
blob - e580803b5171ab0806bd69d4818151fa6fdd24af
blob + ef88b5607ee86b311e7fe35357fa3aa76758968f
--- lib/privsep.c
+++ lib/privsep.c
@@ -275,7 +275,7 @@ got_privsep_send_raw_obj_outfd(struct imsgbuf *ibuf, i
 
 const struct got_error *
 got_privsep_send_raw_obj(struct imsgbuf *ibuf, off_t size, size_t hdrlen,
-    uint8_t *data)
+    int flags, uint8_t *data)
 {
 	struct got_imsg_raw_obj iobj;
 	size_t len = sizeof(iobj);
@@ -284,6 +284,7 @@ got_privsep_send_raw_obj(struct imsgbuf *ibuf, off_t s
 	memset(&iobj, 0, sizeof(iobj));
 	iobj.hdrlen = hdrlen;
 	iobj.size = size;
+	iobj.flags = flags;
 
 	if (data && size + hdrlen <= GOT_PRIVSEP_INLINE_OBJECT_DATA_MAX)
 		len += (size_t)size + hdrlen;
@@ -306,7 +307,7 @@ got_privsep_send_raw_obj(struct imsgbuf *ibuf, off_t s
 
 const struct got_error *
 got_privsep_recv_raw_obj(uint8_t **outbuf, off_t *size, size_t *hdrlen,
-    struct imsgbuf *ibuf)
+    int *flags, struct imsgbuf *ibuf)
 {
 	const struct got_error *err = NULL;
 	struct imsg imsg;
@@ -330,6 +331,7 @@ got_privsep_recv_raw_obj(uint8_t **outbuf, off_t *size
 		iobj = imsg.data;
 		*size = iobj->size;
 		*hdrlen = iobj->hdrlen;
+		*flags = iobj->flags;
 
 		if (datalen == sizeof(*iobj)) {
 			/* Data has been written to file descriptor. */
blob - b4ee8e364fcf7b360c15dc42312997cb86f45e78
blob + 6563535c8a757aae6d4e5dea7b875f0fedb1f7d1
--- libexec/got-read-object/got-read-object.c
+++ libexec/got-read-object/got-read-object.c
@@ -79,7 +79,7 @@ send_raw_obj(struct imsgbuf *ibuf, struct got_object *
 	if (err)
 		goto done;
 
-	err = got_privsep_send_raw_obj(ibuf, size, hdrlen, data);
+	err = got_privsep_send_raw_obj(ibuf, size, hdrlen, 0, data);
 done:
 	free(data);
 	if (close(fd) == -1 && err == NULL)
blob - 4e65a3f8939e6541fe568b471a16f36581536485
blob + d4a1d16e1ec13dbad6e5f94ef9499f3a3748bad4
--- libexec/got-read-pack/got-read-pack.c
+++ libexec/got-read-pack/got-read-pack.c
@@ -833,6 +833,7 @@ raw_object_request(struct imsg *imsg, struct imsgbuf *
 	struct got_object *obj;
 	struct got_object_id id;
 	size_t datalen;
+	int flags = GOT_RAW_OBJ_FLAG_PACKED;
 
 	datalen = imsg->hdr.len - IMSG_HEADER_SIZE;
 	if (datalen != sizeof(iobj))
@@ -858,6 +859,7 @@ raw_object_request(struct imsg *imsg, struct imsgbuf *
 		err = got_pack_get_max_delta_object_size(&size, obj, pack);
 		if (err)
 			goto done;
+		flags |= GOT_RAW_OBJ_FLAG_DELTIFIED;
 	} else
 		size = obj->size;
 
@@ -870,7 +872,8 @@ raw_object_request(struct imsg *imsg, struct imsgbuf *
 	if (err)
 		goto done;
 
-	err = got_privsep_send_raw_obj(ibuf, obj->size, obj->hdrlen, buf);
+	err = got_privsep_send_raw_obj(ibuf, obj->size, obj->hdrlen,
+	    flags, buf);
 done:
 	free(buf);
 	if (outfile && fclose(outfile) == EOF && err == NULL)