"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
faster reuse of deltas
To:
gameoftrees@openbsd.org
Date:
Sat, 3 Dec 2022 12:29:54 +0100

Download raw body.

Thread
At present, 'gotadmin pack' copies reused deltas to the delta cache file,
and then copies deltas from this cache into the generated pack file.
The extra copy operation is redundant. It was implemented because, at
some earlier point in development history, the code which copied data
to the pack file was hard-wired to compress data while writing it out.
So we needed to store reused deltas in decompressed form somewhere.

Nowadays, the cache is storing deltas in compressed form, and the extra
copy operation just slows us down for no good reason. Deltas are being
parsed and provided by got-read-pack either way, so this helper remains
in full control over the data which the main process ends up copying (but
not parsing).

With the patch below, got-read-pack tells the main process only about the
locations of reused deltas in the original pack file, and the main process
reads deltas directly from that file. This speeds things up and will make
it easier to implement reuse of non-deltified packed objects later.

ok?

 avoid copying reused deltas to delta cache file; copy from pack file instead
 
diff 270c41a2b8c0d37d0ea9710a656369efa551dfcd fb97eae9491a2282ef08307ac719dfce32afcfee
commit - 270c41a2b8c0d37d0ea9710a656369efa551dfcd
commit + fb97eae9491a2282ef08307ac719dfce32afcfee
blob - 00518379da8d1ea10fa247bbd516547bf00019e1
blob + 188c28e9e67bdc37146176e911d6226b223c40d2
--- lib/got_lib_pack.h
+++ lib/got_lib_pack.h
@@ -222,5 +222,5 @@ const struct got_error *got_packfile_extract_raw_delta
 const struct got_error *got_packfile_extract_object_to_mem(uint8_t **, size_t *,
     struct got_object *, struct got_pack *);
 const struct got_error *got_packfile_extract_raw_delta(uint8_t **, size_t *,
-    size_t *, off_t *, off_t *, struct got_object_id *, uint64_t *, uint64_t *,
-    struct got_pack *, struct got_packidx *, int);
+    size_t *, off_t *, off_t *, off_t *, struct got_object_id *, uint64_t *,
+    uint64_t *, struct got_pack *, struct got_packidx *, int);
blob - 7ad544d7095d79e959a67e5c9d4d84fd6da2c0f4
blob + 7a3c6579c0c3c5ab813f5c5cdb1093a791ac08b4
--- lib/got_lib_pack_create.h
+++ lib/got_lib_pack_create.h
@@ -95,8 +95,8 @@ got_pack_search_deltas(struct got_pack_metavec *v,
     struct got_pack_metavec *v);
 
 const struct got_error *
-got_pack_search_deltas(struct got_pack_metavec *v,
-    struct got_object_idset *idset, int delta_cache_fd,
+got_pack_search_deltas(struct got_packidx **packidx, struct got_pack **pack,
+    struct got_pack_metavec *v, struct got_object_idset *idset,
     int ncolored, int nfound, int ntrees, int ncommits,
     struct got_repository *repo,
     got_pack_progress_cb progress_cb, void *progress_arg,
blob - 37d537ab48623fa50dd909cf2725e58151958e35
blob + 5010cb878bd556b6b5255748ec43c5bda5a060da
--- lib/got_lib_privsep.h
+++ lib/got_lib_privsep.h
@@ -327,12 +327,6 @@ struct got_imsg_reused_delta {
 	off_t delta_size;
 	off_t delta_compressed_size;
 	off_t delta_offset;
-	off_t delta_out_offset;
-
-	/*
-	 * Delta data has been written at delta_out_offset to the file
-	 * descriptor passed via the GOT_IMSG_RAW_DELTA_OUTFD imsg.
-	 */
 };
 struct got_imsg_reused_deltas {
 	size_t ndeltas;
blob - 890220f4bbc04471e5b17866775b2696968d025b
blob + 68de5bf9f8523cf07c88e05955ce9b4c6abc451a
--- lib/pack.c
+++ lib/pack.c
@@ -1897,7 +1897,8 @@ got_packfile_extract_raw_delta(uint8_t **delta_buf, si
 
 const struct got_error *
 got_packfile_extract_raw_delta(uint8_t **delta_buf, size_t *delta_size,
-    size_t *delta_compressed_size, off_t *delta_offset, off_t *base_offset,
+    size_t *delta_compressed_size, off_t *delta_offset,
+    off_t *delta_data_offset, off_t *base_offset,
     struct got_object_id *base_id, uint64_t *base_size, uint64_t *result_size,
     struct got_pack *pack, struct got_packidx *packidx, int idx)
 {
@@ -1906,12 +1907,12 @@ got_packfile_extract_raw_delta(uint8_t **delta_buf, si
 	uint8_t type;
 	uint64_t size;
 	size_t tslen, delta_hdrlen;
-	off_t delta_data_offset;
 
 	*delta_buf = NULL;
 	*delta_size = 0;
 	*delta_compressed_size = 0;
 	*delta_offset = 0;
+	*delta_data_offset = 0;
 	*base_offset = 0;
 	*base_size = 0;
 	*result_size = 0;
@@ -1955,9 +1956,9 @@ got_packfile_extract_raw_delta(uint8_t **delta_buf, si
 	    offset + delta_hdrlen < delta_hdrlen)
 		return got_error(GOT_ERR_BAD_DELTA);
 
-	delta_data_offset = offset + tslen + delta_hdrlen;
+	*delta_data_offset = offset + tslen + delta_hdrlen;
 	err = read_raw_delta_data(delta_buf, delta_size, delta_compressed_size,
-	    base_size, result_size, delta_data_offset, pack, packidx);
+	    base_size, result_size, *delta_data_offset, pack, packidx);
 	if (err)
 		return err;
 
blob - 44ccdb3791fa3057c6bda8686d44d57d48169e2e
blob + 0dda4f95f2b3f8b16523c768cd55fa04de120f90
--- lib/pack_create.c
+++ lib/pack_create.c
@@ -1597,11 +1597,16 @@ write_packed_object(off_t *packfile_size, int packfd,
 	char buf[32];
 	int nh;
 	struct got_raw_object *raw = NULL;
-	off_t outlen;
+	off_t outlen, delta_offset;
 
 	csum.output_sha1 = ctx;
 	csum.output_crc = NULL;
 
+	if (m->reused_delta_offset)
+		delta_offset = m->reused_delta_offset;
+	else
+		delta_offset = m->delta_offset;
+
 	m->off = *packfile_size;
 	if (m->delta_len == 0) {
 		err = got_object_raw_open(&raw, outfd, repo, &m->id);
@@ -1650,15 +1655,14 @@ write_packed_object(off_t *packfile_size, int packfd,
 		err = deltahdr(packfile_size, ctx, packfd, m);
 		if (err)
 			goto done;
-		err = hcopy_mmap(delta_cache_map, m->delta_offset,
+		err = hcopy_mmap(delta_cache_map, delta_offset,
 		    delta_cache_size, packfd, m->delta_compressed_len,
 		    ctx);
 		if (err)
 			goto done;
 		*packfile_size += m->delta_compressed_len;
 	} else {
-		if (fseeko(delta_cache, m->delta_offset, SEEK_SET)
-		    == -1) {
+		if (fseeko(delta_cache, delta_offset, SEEK_SET) == -1) {
 			err = got_error_from_errno("fseeko");
 			goto done;
 		}
@@ -1678,8 +1682,8 @@ genpack(uint8_t *pack_sha1, int packfd, FILE *delta_ca
 }
 
 static const struct got_error *
-genpack(uint8_t *pack_sha1, int packfd, FILE *delta_cache,
-    struct got_pack_meta **deltify, int ndeltify,
+genpack(uint8_t *pack_sha1, int packfd, struct got_pack *reuse_pack,
+    FILE *delta_cache, struct got_pack_meta **deltify, int ndeltify,
     struct got_pack_meta **reuse, int nreuse,
     int ncolored, int nfound, int ntrees, int nours,
     struct got_repository *repo,
@@ -1697,6 +1701,7 @@ genpack(uint8_t *pack_sha1, int packfd, FILE *delta_ca
 	int delta_cache_fd = -1;
 	uint8_t *delta_cache_map = NULL;
 	size_t delta_cache_size = 0;
+	FILE *packfile = NULL;
 
 	SHA1Init(&ctx);
 
@@ -1752,6 +1757,19 @@ genpack(uint8_t *pack_sha1, int packfd, FILE *delta_ca
 
 	qsort(reuse, nreuse, sizeof(struct got_pack_meta *),
 	    reuse_write_order_cmp);
+	if (nreuse > 0 && reuse_pack->map == NULL) {
+		int fd = dup(reuse_pack->fd);
+		if (fd == -1) {
+			err = got_error_from_errno("dup");
+			goto done;
+		}
+		packfile = fdopen(fd, "r");
+		if (packfile == NULL) {
+			err = got_error_from_errno("fdopen");
+			close(fd);
+			goto done;
+		}
+	}
 	for (i = 0; i < nreuse; i++) {
 		err = got_pack_report_progress(progress_cb, progress_arg, rl,
 		    ncolored, nfound, ntrees, packfile_size, nours,
@@ -1760,7 +1778,7 @@ genpack(uint8_t *pack_sha1, int packfd, FILE *delta_ca
 			goto done;
 		m = reuse[i];
 		err = write_packed_object(&packfile_size, packfd,
-		    delta_cache, delta_cache_map, delta_cache_size,
+		    packfile, reuse_pack->map, reuse_pack->filesize,
 		    m, &outfd, &ctx, repo);
 		if (err)
 			goto done;
@@ -1786,6 +1804,8 @@ done:
 		err = got_error_from_errno("munmap");
 	if (delta_cache_fd != -1 && close(delta_cache_fd) == -1 && err == NULL)
 		err = got_error_from_errno("close");
+	if (packfile && fclose(packfile) == EOF && err == NULL)
+		err = got_error_from_errno("fclose");
 	return err;
 }
 
@@ -1810,8 +1830,9 @@ got_pack_create(uint8_t *packsha1, int packfd, FILE *d
     struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg)
 {
 	const struct got_error *err;
-	int delta_cache_fd = -1;
 	struct got_object_idset *idset;
+	struct got_packidx *reuse_packidx = NULL;
+	struct got_pack *reuse_pack = NULL;
 	struct got_pack_metavec deltify, reuse;
 	int ncolored = 0, nfound = 0, ntrees = 0;
 	size_t ndeltify;
@@ -1844,12 +1865,6 @@ got_pack_create(uint8_t *packsha1, int packfd, FILE *d
 		goto done;
 	}
 
-	delta_cache_fd = dup(fileno(delta_cache));
-	if (delta_cache_fd == -1) {
-		err = got_error_from_errno("dup");
-		goto done;
-	}
-
 	reuse.metasz = 64;
 	reuse.meta = calloc(reuse.metasz,
 	    sizeof(struct got_pack_meta *));
@@ -1858,12 +1873,18 @@ got_pack_create(uint8_t *packsha1, int packfd, FILE *d
 		goto done;
 	}
 
-	err = got_pack_search_deltas(&reuse, idset, delta_cache_fd,
-	    ncolored, nfound, ntrees, nours,
+	err = got_pack_search_deltas(&reuse_packidx, &reuse_pack,
+	    &reuse, idset, ncolored, nfound, ntrees, nours,
 	    repo, progress_cb, progress_arg, rl, cancel_cb, cancel_arg);
 	if (err)
 		goto done;
 
+	if (reuse_packidx && reuse_pack) {
+		err = got_repo_pin_pack(repo, reuse_packidx, reuse_pack);
+		if (err)
+			goto done;
+	}
+
 	if (fseeko(delta_cache, 0L, SEEK_END) == -1) {
 		err = got_error_from_errno("fseeko");
 		goto done;
@@ -1910,7 +1931,7 @@ got_pack_create(uint8_t *packsha1, int packfd, FILE *d
 			goto done;
 	}
 
-	err = genpack(packsha1, packfd, delta_cache, deltify.meta,
+	err = genpack(packsha1, packfd, reuse_pack, delta_cache, deltify.meta,
 	    deltify.nmeta, reuse.meta, reuse.nmeta, ncolored, nfound, ntrees,
 	    nours, repo, progress_cb, progress_arg, rl,
 	    cancel_cb, cancel_arg);
@@ -1920,7 +1941,6 @@ done:
 	free_nmeta(deltify.meta, deltify.nmeta);
 	free_nmeta(reuse.meta, reuse.nmeta);
 	got_object_idset_free(idset);
-	if (delta_cache_fd != -1 && close(delta_cache_fd) == -1 && err == NULL)
-		err = got_error_from_errno("close");
+	got_repo_unpin_pack(repo);
 	return err;
 }
blob - 2adcf9e794f2de58f508160da8633a854ebfdef9
blob + 0b3aca3719b7e157075b51f68f02045737d727e1
--- lib/pack_create_io.c
+++ lib/pack_create_io.c
@@ -120,20 +120,7 @@ search_delta_for_object(struct got_object_id *id, void
 
 	if (got_object_idset_contains(a->idset, &base_id)) {
 		struct got_pack_meta *m, *base;
-		off_t delta_out_offset;
-		ssize_t w;
 
-		delta_out_offset = lseek(a->delta_cache_fd, 0, SEEK_CUR);
-		if (delta_out_offset == -1) {
-			err = got_error_from_errno("lseek");
-			goto done;
-		}
-		w = write(a->delta_cache_fd, delta_buf, delta_compressed_size);
-		if (w != delta_compressed_size) {
-			err = got_error_from_errno("write");
-			goto done;
-		}
-
 		m = got_object_idset_get(a->idset, id);
 		if (m == NULL) {
 			err = got_error_msg(GOT_ERR_NO_OBJ,
@@ -159,7 +146,7 @@ search_delta_for_object(struct got_object_id *id, void
 		m->delta_len = delta_size;
 		m->delta_compressed_len = delta_compressed_size;
 		m->reused_delta_offset = delta_offset;
-		m->delta_offset = delta_out_offset;
+		m->delta_offset = 0;
 
 		err = got_pack_add_meta(m, a->v);
 		if (err)
blob - a08374f7b103b4078c61a1872a293f454f078ec4
blob + 63e729e49404dd6fb94579bd500fa06cee7ac5fe
--- lib/pack_create_privsep.c
+++ lib/pack_create_privsep.c
@@ -114,7 +114,7 @@ recv_reused_delta(struct got_imsg_reused_delta *delta,
 
 	m->delta_len = delta->delta_size;
 	m->delta_compressed_len = delta->delta_compressed_size;
-	m->delta_offset = delta->delta_out_offset;
+	m->delta_offset = 0;
 	m->prev = base;
 	m->size = delta->result_size;
 	m->reused_delta_offset = delta->delta_offset;
@@ -125,69 +125,45 @@ static const struct got_error *
 	return got_pack_add_meta(m, v);
 }
 
-static const struct got_error *
-prepare_delta_reuse(struct got_pack *pack, struct got_packidx *packidx,
-    int delta_outfd, struct got_repository *repo)
-{
-	const struct got_error *err = NULL;
-
-	if (!pack->child_has_delta_outfd) {
-		int outfd_child;
-		outfd_child = dup(delta_outfd);
-		if (outfd_child == -1) {
-			err = got_error_from_errno("dup");
-			goto done;
-		}
-		err = got_privsep_send_raw_delta_outfd(
-		    pack->privsep_child->ibuf, outfd_child);
-		if (err)
-			goto done;
-		pack->child_has_delta_outfd = 1;
-	}
-
-	err = got_privsep_send_delta_reuse_req(pack->privsep_child->ibuf);
-done:
-	return err;
-}
-
 const struct got_error *
-got_pack_search_deltas(struct got_pack_metavec *v,
-    struct got_object_idset *idset, int delta_cache_fd,
+got_pack_search_deltas(struct got_packidx **packidx, struct got_pack **pack,
+    struct got_pack_metavec *v, struct got_object_idset *idset,
     int ncolored, int nfound, int ntrees, int ncommits,
     struct got_repository *repo,
     got_pack_progress_cb progress_cb, void *progress_arg,
     struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg)
 {
 	const struct got_error *err = NULL;
-	struct got_packidx *packidx;
-	struct got_pack *pack;
 	struct got_imsg_reused_delta deltas[GOT_IMSG_REUSED_DELTAS_MAX_NDELTAS];
 	size_t ndeltas, i;
 
-	err = got_pack_find_pack_for_reuse(&packidx, repo);
+	*packidx = NULL;
+	*pack = NULL;
+
+	err = got_pack_find_pack_for_reuse(packidx, repo);
 	if (err)
 		return err;
 
-	if (packidx == NULL)
+	if (*packidx == NULL)
 		return NULL;
 
-	err = got_pack_cache_pack_for_packidx(&pack, packidx, repo);
+	err = got_pack_cache_pack_for_packidx(pack, *packidx, repo);
 	if (err)
-		return err;
+		goto done;
 
-	if (pack->privsep_child == NULL) {
-		err = got_pack_start_privsep_child(pack, packidx);
+	if ((*pack)->privsep_child == NULL) {
+		err = got_pack_start_privsep_child(*pack, *packidx);
 		if (err)
-			return err;
+			goto done;
 	}
 
-	err = prepare_delta_reuse(pack, packidx, delta_cache_fd, repo);
+	err = got_privsep_send_delta_reuse_req((*pack)->privsep_child->ibuf);
 	if (err)
-		return err;
+		goto done;
 
-	err = send_idset(pack->privsep_child->ibuf, idset);
+	err = send_idset((*pack)->privsep_child->ibuf, idset);
 	if (err)
-		return err;
+		goto done;
 
 	for (;;) {
 		int done = 0;
@@ -199,7 +175,7 @@ got_pack_search_deltas(struct got_pack_metavec *v,
 		}
 
 		err = got_privsep_recv_reused_deltas(&done, deltas, &ndeltas,
-		    pack->privsep_child->ibuf);
+		    (*pack)->privsep_child->ibuf);
 		if (err || done)
 			break;
 
blob - e49fd95ba0b7919efe9005b306782db008b22370
blob + a86f3dde84274fdf37db09da580f7bba1e012e46
--- libexec/got-read-pack/got-read-pack.c
+++ libexec/got-read-pack/got-read-pack.c
@@ -869,7 +869,7 @@ raw_delta_request(struct imsg *imsg, struct imsgbuf *i
 	const struct got_error *err = NULL;
 	struct got_imsg_raw_delta_request req;
 	size_t datalen, delta_size, delta_compressed_size;
-	off_t delta_offset;
+	off_t delta_offset, delta_data_offset;
 	uint8_t *delta_buf = NULL;
 	struct got_object_id id, base_id;
 	off_t base_offset, delta_out_offset = 0;
@@ -885,8 +885,9 @@ raw_delta_request(struct imsg *imsg, struct imsgbuf *i
 	imsg->fd = -1;
 
 	err = got_packfile_extract_raw_delta(&delta_buf, &delta_size,
-	    &delta_compressed_size, &delta_offset, &base_offset, &base_id,
-	    &base_size, &result_size, pack, packidx, req.idx);
+	    &delta_compressed_size, &delta_offset, &delta_data_offset,
+	    &base_offset, &base_id, &base_size, &result_size,
+	    pack, packidx, req.idx);
 	if (err)
 		goto done;
 
@@ -924,7 +925,6 @@ struct search_deltas_arg {
 	struct got_packidx *packidx;
 	struct got_pack *pack;
 	struct got_object_idset *idset;
-	FILE *delta_outfile;
 	struct got_imsg_reused_delta deltas[GOT_IMSG_REUSED_DELTAS_MAX_NDELTAS];
 	size_t ndeltas;
 };
@@ -938,7 +938,7 @@ search_delta_for_object(struct got_object_id *id, void
 	uint8_t *delta_buf = NULL;
 	uint64_t base_size, result_size;
 	size_t delta_size, delta_compressed_size;
-	off_t delta_offset, base_offset;
+	off_t delta_offset, delta_data_offset, base_offset;
 	struct got_object_id base_id;
 
 	if (sigint_received)
@@ -949,8 +949,9 @@ search_delta_for_object(struct got_object_id *id, void
 		return NULL; /* object not present in our pack file */
 
 	err = got_packfile_extract_raw_delta(&delta_buf, &delta_size,
-	    &delta_compressed_size, &delta_offset, &base_offset, &base_id,
-	    &base_size, &result_size, a->pack, a->packidx, obj_idx);
+	    &delta_compressed_size, &delta_offset, &delta_data_offset,
+	    &base_offset, &base_id, &base_size, &result_size,
+	    a->pack, a->packidx, obj_idx);
 	if (err) {
 		if (err->code == GOT_ERR_OBJ_TYPE)
 			return NULL; /* object not stored as a delta */
@@ -969,16 +970,7 @@ search_delta_for_object(struct got_object_id *id, void
 
 	if (got_object_idset_contains(a->idset, &base_id)) {
 		struct got_imsg_reused_delta *delta;
-		off_t delta_out_offset = ftello(a->delta_outfile);
-		size_t w;
 
-		w = fwrite(delta_buf, 1, delta_compressed_size,
-		    a->delta_outfile);
-		if (w != delta_compressed_size) {
-			err = got_ferror(a->delta_outfile, GOT_ERR_IO);
-			goto done;
-		}
-
 		delta = &a->deltas[a->ndeltas++];
 		memcpy(&delta->id, id, sizeof(delta->id));
 		memcpy(&delta->base_id, &base_id, sizeof(delta->base_id));
@@ -986,8 +978,7 @@ search_delta_for_object(struct got_object_id *id, void
 		delta->result_size = result_size;
 		delta->delta_size = delta_size;
 		delta->delta_compressed_size = delta_compressed_size;
-		delta->delta_offset = delta_offset;
-		delta->delta_out_offset = delta_out_offset;
+		delta->delta_offset = delta_data_offset;
 
 		if (a->ndeltas >= GOT_IMSG_REUSED_DELTAS_MAX_NDELTAS) {
 			err = got_privsep_send_reused_deltas(a->ibuf,
@@ -1054,7 +1045,7 @@ delta_reuse_request(struct imsg *imsg, struct imsgbuf 
 
 static const struct got_error *
 delta_reuse_request(struct imsg *imsg, struct imsgbuf *ibuf,
-    FILE *delta_outfile, struct got_pack *pack, struct got_packidx *packidx)
+	struct got_pack *pack, struct got_packidx *packidx)
 {
 	const struct got_error *err = NULL;
 	struct got_object_idset *idset;
@@ -1073,7 +1064,6 @@ delta_reuse_request(struct imsg *imsg, struct imsgbuf 
 	sda.idset = idset;
 	sda.pack = pack;
 	sda.packidx = packidx;
-	sda.delta_outfile = delta_outfile;
 	err = got_object_idset_for_each(idset, search_delta_for_object, &sda);
 	if (err)
 		goto done;
@@ -1085,11 +1075,6 @@ delta_reuse_request(struct imsg *imsg, struct imsgbuf 
 			goto done;
 	}
 
-	if (fflush(delta_outfile) == -1) {
-		err = got_error_from_errno("fflush");
-		goto done;
-	}
-
 	err = got_privsep_send_reused_deltas_done(ibuf);
 done:
 	got_object_idset_free(idset);
@@ -2001,12 +1986,7 @@ main(int argc, char *argv[])
 			    pack, packidx);
 			break;
 		case GOT_IMSG_DELTA_REUSE_REQUEST:
-			if (delta_outfile == NULL) {
-				err = got_error(GOT_ERR_PRIVSEP_NO_FD);
-				break;
-			}
-			err = delta_reuse_request(&imsg, &ibuf,
-			    delta_outfile, pack, packidx);
+			err = delta_reuse_request(&imsg, &ibuf, pack, packidx);
 			break;
 		case GOT_IMSG_COMMIT_REQUEST:
 			err = commit_request(&imsg, &ibuf, pack, packidx,