From: Stefan Sperling Subject: reuse delta-base objects during pack file creation To: gameoftrees@openbsd.org Date: Fri, 20 Feb 2026 22:49:31 +0100 When cloning large repositories from gotd (e.g. ports.git), there is a noticeable tail end during the deltification phase where progress becomes slower. Time spent deltifying the last, say, 10% of objects may even exceed the time spent on the first 90% of objects. The reason is that we reuse deltas until we hit 90% and then run deltification on the remaining objects. In this deltification phase we do not copy delta-bases as they are; instead, we deltify them. This takes time, and we gain very little from doing this work. Someone else has already spent effort finding optimal delta-bases when the pack file which stores the delta-base object was created. And by deltifying delta-bases we make delta chains longer, which can make unpacking slower. Git reuses delta-base objects directly, too, so this is a proven approach. We need two changes for this optimization, both combined in the diff below. 1) Add flags to the raw object data structure which tell us whether a raw object was found in a pack file and, if so, whether it is stored as a delta or as a delta-base (i.e. a verbatim copy of the object). 2) Skip the deltification loop for packed delta-bases. We must still initialize m->dtab for these objects since they might be used as a base during deltification of other objects. But we can skip the expensive hunt for delta-base candidates. The code which writes out the pack file will see that no delta has been calculated for these objects and simply copy them to the generated pack file as-is. Loose objects and non-reused deltified objects are still (re-)deltified as before. To speed things up further in the future, we could look into reusing deltas from multiple pack files, though I am not sure whether the additional complexity would be worth it. Well-maintained repositories have one large pack file and a handful of small ones. 
While copying objects between packs we have decompression/recompression overhead which i believe Git manages to avoid. This might be another path to improving performance in the future. ok? M lib/got_lib_object.h | 4+ 0- M lib/got_lib_privsep.h | 3+ 2- M lib/object_open_io.c | 8+ 3- M lib/object_open_privsep.c | 22+ 18- M lib/pack_create.c | 19+ 0- M lib/privsep.c | 4+ 2- M libexec/got-read-object/got-read-object.c | 1+ 1- M libexec/got-read-pack/got-read-pack.c | 4+ 1- 8 files changed, 65 insertions(+), 27 deletions(-) commit - 9456c7974d487ec39d90e4fd16887cf464d3841e commit + 14e4379835382ffd9432c3ee46ddb4f6e3c87a9d blob - 0272a5dfbad3568d5ae28e7d1c782d936770da89 blob + 71d67d7703ccd903eb9164f8d467b0f268344c6b --- lib/got_lib_object.h +++ lib/got_lib_object.h @@ -45,6 +45,10 @@ struct got_raw_object { size_t hdrlen; int refcnt; /* > 0 if open and/or cached */ + int flags; +#define GOT_RAW_OBJ_FLAG_PACKED 0x01 +#define GOT_RAW_OBJ_FLAG_DELTIFIED 0x02 + got_object_raw_close_cb *close_cb; void *close_arg; }; blob - bee307699142e88ce12f0d346090e26d9b6238d3 blob + 5d45acef7f774410e32a0b1a920e76ff356930e8 --- lib/got_lib_privsep.h +++ lib/got_lib_privsep.h @@ -301,6 +301,7 @@ struct got_imsg_blob { struct got_imsg_raw_obj { off_t size; size_t hdrlen; + int flags; /* * If size <= GOT_PRIVSEP_INLINE_OBJECT_DATA_MAX, object data follows @@ -740,9 +741,9 @@ const struct got_error *got_privsep_get_imsg_obj(struc const struct got_error *got_privsep_recv_obj(struct got_object **, struct imsgbuf *); const struct got_error *got_privsep_send_raw_obj(struct imsgbuf *, off_t, - size_t, uint8_t *); + size_t, int, uint8_t *); const struct got_error *got_privsep_recv_raw_obj(uint8_t **, off_t *, size_t *, - struct imsgbuf *); + int *, struct imsgbuf *); const struct got_error *got_privsep_send_commit(struct imsgbuf *, struct got_commit_object *); const struct got_error *got_privsep_recv_commit(struct got_commit_object **, blob - 7833a0beab9c4e9d117d69bf91ac068c17f523a9 blob + 
1e71d08fb2e3518f6bf8f80909e64202b17a3252 --- lib/object_open_io.c +++ lib/object_open_io.c @@ -191,8 +191,8 @@ wrap_fd(FILE **f, int wrapped_fd) static const struct got_error * read_packed_object_raw(uint8_t **outbuf, off_t *size, size_t *hdrlen, - int outfd, struct got_pack *pack, struct got_packidx *packidx, int idx, - struct got_object_id *id) + int *flags, int outfd, struct got_pack *pack, + struct got_packidx *packidx, int idx, struct got_object_id *id) { const struct got_error *err = NULL; uint64_t raw_size = 0; @@ -202,6 +202,7 @@ read_packed_object_raw(uint8_t **outbuf, off_t *size, *outbuf = NULL; *size = 0; *hdrlen = 0; + *flags = GOT_RAW_OBJ_FLAG_PACKED; err = got_packfile_open_object(&obj, pack, packidx, idx, id); if (err) @@ -211,6 +212,7 @@ read_packed_object_raw(uint8_t **outbuf, off_t *size, err = got_pack_get_max_delta_object_size(&raw_size, obj, pack); if (err) goto done; + *flags |= GOT_RAW_OBJ_FLAG_DELTIFIED; } else raw_size = obj->size; @@ -276,6 +278,7 @@ got_object_raw_open(struct got_raw_object **obj, int * off_t size = 0; size_t hdrlen = 0; char *path_packfile = NULL; + int flags = 0; *obj = got_repo_get_cached_raw_object(repo, id); if (*obj != NULL) { @@ -303,7 +306,7 @@ got_object_raw_open(struct got_raw_object **obj, int * if (err) goto done; } - err = read_packed_object_raw(&outbuf, &size, &hdrlen, + err = read_packed_object_raw(&outbuf, &size, &hdrlen, &flags, tempfd, pack, packidx, idx, id); if (err) goto done; @@ -342,6 +345,8 @@ got_object_raw_open(struct got_raw_object **obj, int * GOT_DELTA_RESULT_SIZE_CACHED_MAX, hdrlen, size); if (err) goto done; + + (*obj)->flags = flags; err = got_repo_cache_raw_object(repo, id, *obj); if (err) { blob - dbfa06210469a59bc5a940787ba3ace5381315f0 blob + 5aab464c1c4b38c29510f1a0ddd2604f84f49ee7 --- lib/object_open_privsep.c +++ lib/object_open_privsep.c @@ -112,7 +112,8 @@ done: static const struct got_error * request_packed_object_raw(uint8_t **outbuf, off_t *size, size_t *hdrlen, - int outfd, 
struct got_pack *pack, int idx, struct got_object_id *id) + int *flags, int outfd, struct got_pack *pack, int idx, + struct got_object_id *id) { const struct got_error *err = NULL; struct imsgbuf *ibuf = pack->privsep_child->ibuf; @@ -136,7 +137,7 @@ request_packed_object_raw(uint8_t **outbuf, off_t *siz if (err) return err; - err = got_privsep_recv_raw_obj(outbuf, size, hdrlen, ibuf); + err = got_privsep_recv_raw_obj(outbuf, size, hdrlen, flags, ibuf); if (err) return err; @@ -161,8 +162,8 @@ read_packed_object_privsep(struct got_object **obj, static const struct got_error * read_packed_object_raw_privsep(uint8_t **outbuf, off_t *size, size_t *hdrlen, - int outfd, struct got_pack *pack, struct got_packidx *packidx, int idx, - struct got_object_id *id) + int *flags, int outfd, struct got_pack *pack, struct got_packidx *packidx, + int idx, struct got_object_id *id) { const struct got_error *err = NULL; @@ -172,8 +173,8 @@ read_packed_object_raw_privsep(uint8_t **outbuf, off_t return err; } - return request_packed_object_raw(outbuf, size, hdrlen, outfd, pack, - idx, id); + return request_packed_object_raw(outbuf, size, hdrlen, flags, + outfd, pack, idx, id); } const struct got_error * @@ -294,8 +295,8 @@ request_object(struct got_object **obj, struct got_obj } static const struct got_error * -request_raw_object(uint8_t **outbuf, off_t *size, size_t *hdrlen, int outfd, - struct got_object_id *id, struct got_repository *repo, int infd) +request_raw_object(uint8_t **outbuf, off_t *size, size_t *hdrlen, int *flags, + int outfd, struct got_object_id *id, struct got_repository *repo, int infd) { const struct got_error *err = NULL; struct imsgbuf *ibuf; @@ -315,7 +316,7 @@ request_raw_object(uint8_t **outbuf, off_t *size, size if (err) return err; - return got_privsep_recv_raw_obj(outbuf, size, hdrlen, ibuf); + return got_privsep_recv_raw_obj(outbuf, size, hdrlen, flags, ibuf); } static const struct got_error * @@ -410,21 +411,21 @@ got_object_read_header_privsep(struct 
got_object **obj static const struct got_error * read_object_raw_privsep(uint8_t **outbuf, off_t *size, size_t *hdrlen, - int outfd, struct got_object_id *id, struct got_repository *repo, - int obj_fd) + int *flags, int outfd, struct got_object_id *id, + struct got_repository *repo, int obj_fd) { const struct got_error *err; if (repo->privsep_children[GOT_REPO_PRIVSEP_CHILD_OBJECT].imsg_fd != -1) - return request_raw_object(outbuf, size, hdrlen, outfd, id, - repo, obj_fd); + return request_raw_object(outbuf, size, hdrlen, flags, + outfd, id, repo, obj_fd); err = start_child(repo, GOT_REPO_PRIVSEP_CHILD_OBJECT); if (err) return err; - return request_raw_object(outbuf, size, hdrlen, outfd, id, repo, - obj_fd); + return request_raw_object(outbuf, size, hdrlen, flags, + outfd, id, repo, obj_fd); } const struct got_error * @@ -474,6 +475,7 @@ got_object_raw_open(struct got_raw_object **obj, int * off_t size = 0; size_t hdrlen = 0; char *path_packfile = NULL; + int flags = 0; *obj = got_repo_get_cached_raw_object(repo, id); if (*obj != NULL) { @@ -504,7 +506,7 @@ got_object_raw_open(struct got_raw_object **obj, int * goto done; } err = read_packed_object_raw_privsep(&outbuf, &size, &hdrlen, - *outfd, pack, packidx, idx, id); + &flags, *outfd, pack, packidx, idx, id); if (err) goto done; } else if (err->code == GOT_ERR_NO_OBJ) { @@ -513,8 +515,8 @@ got_object_raw_open(struct got_raw_object **obj, int * err = got_object_open_loose_fd(&fd, id, repo); if (err) goto done; - err = read_object_raw_privsep(&outbuf, &size, &hdrlen, *outfd, - id, repo, fd); + err = read_object_raw_privsep(&outbuf, &size, &hdrlen, &flags, + *outfd, id, repo, fd); if (err) goto done; } @@ -524,6 +526,8 @@ got_object_raw_open(struct got_raw_object **obj, int * if (err) goto done; + (*obj)->flags = flags; + err = got_repo_cache_raw_object(repo, id, *obj); done: free(path_packfile); blob - b0aa5bc15b2f5adb37972b7c68f53df5bbe8485d blob + 88ae6a7f7857b14079689f7c4e8f26ef6cebdfce --- lib/pack_create.c +++ 
lib/pack_create.c @@ -598,6 +598,25 @@ pick_deltas(struct got_pack_meta **meta, int nmeta, in n->dtab = NULL; } + /* + * Do not deltify objects which are already packed and not + * deltified. The object is likely a delta-base for deltas + * we are reusing, and deltifying bases is a waste of time. + * We can just copy them to our new pack file directly. + * In the worst case we could end up using a bit more disk + * space. But our newly created pack will be done sooner. + * + * We cannot skip deltified objects here because we cannot + * tell whether their base object will be included in the + * pack file we are generating. + */ + if ((raw->flags & GOT_RAW_OBJ_FLAG_PACKED) && + (raw->flags & GOT_RAW_OBJ_FLAG_DELTIFIED) == 0) { + got_object_raw_close(raw); + raw = NULL; + continue; + } + best_size = raw->size; best_ndeltas = 0; for (j = MAX(0, i - max_base_candidates); j < i; j++) { blob - e580803b5171ab0806bd69d4818151fa6fdd24af blob + ef88b5607ee86b311e7fe35357fa3aa76758968f --- lib/privsep.c +++ lib/privsep.c @@ -275,7 +275,7 @@ got_privsep_send_raw_obj_outfd(struct imsgbuf *ibuf, i const struct got_error * got_privsep_send_raw_obj(struct imsgbuf *ibuf, off_t size, size_t hdrlen, - uint8_t *data) + int flags, uint8_t *data) { struct got_imsg_raw_obj iobj; size_t len = sizeof(iobj); @@ -284,6 +284,7 @@ got_privsep_send_raw_obj(struct imsgbuf *ibuf, off_t s memset(&iobj, 0, sizeof(iobj)); iobj.hdrlen = hdrlen; iobj.size = size; + iobj.flags = flags; if (data && size + hdrlen <= GOT_PRIVSEP_INLINE_OBJECT_DATA_MAX) len += (size_t)size + hdrlen; @@ -306,7 +307,7 @@ got_privsep_send_raw_obj(struct imsgbuf *ibuf, off_t s const struct got_error * got_privsep_recv_raw_obj(uint8_t **outbuf, off_t *size, size_t *hdrlen, - struct imsgbuf *ibuf) + int *flags, struct imsgbuf *ibuf) { const struct got_error *err = NULL; struct imsg imsg; @@ -330,6 +331,7 @@ got_privsep_recv_raw_obj(uint8_t **outbuf, off_t *size iobj = imsg.data; *size = iobj->size; *hdrlen = iobj->hdrlen; + 
*flags = iobj->flags; if (datalen == sizeof(*iobj)) { /* Data has been written to file descriptor. */ blob - b4ee8e364fcf7b360c15dc42312997cb86f45e78 blob + 6563535c8a757aae6d4e5dea7b875f0fedb1f7d1 --- libexec/got-read-object/got-read-object.c +++ libexec/got-read-object/got-read-object.c @@ -79,7 +79,7 @@ send_raw_obj(struct imsgbuf *ibuf, struct got_object * if (err) goto done; - err = got_privsep_send_raw_obj(ibuf, size, hdrlen, data); + err = got_privsep_send_raw_obj(ibuf, size, hdrlen, 0, data); done: free(data); if (close(fd) == -1 && err == NULL) blob - 4e65a3f8939e6541fe568b471a16f36581536485 blob + d4a1d16e1ec13dbad6e5f94ef9499f3a3748bad4 --- libexec/got-read-pack/got-read-pack.c +++ libexec/got-read-pack/got-read-pack.c @@ -833,6 +833,7 @@ raw_object_request(struct imsg *imsg, struct imsgbuf * struct got_object *obj; struct got_object_id id; size_t datalen; + int flags = GOT_RAW_OBJ_FLAG_PACKED; datalen = imsg->hdr.len - IMSG_HEADER_SIZE; if (datalen != sizeof(iobj)) @@ -858,6 +859,7 @@ raw_object_request(struct imsg *imsg, struct imsgbuf * err = got_pack_get_max_delta_object_size(&size, obj, pack); if (err) goto done; + flags |= GOT_RAW_OBJ_FLAG_DELTIFIED; } else size = obj->size; @@ -870,7 +872,8 @@ raw_object_request(struct imsg *imsg, struct imsgbuf * if (err) goto done; - err = got_privsep_send_raw_obj(ibuf, obj->size, obj->hdrlen, buf); + err = got_privsep_send_raw_obj(ibuf, obj->size, obj->hdrlen, + flags, buf); done: free(buf); if (outfile && fclose(outfile) == EOF && err == NULL)