From: Stefan Sperling Subject: Re: faster reuse of deltas To: gameoftrees@openbsd.org Date: Sat, 3 Dec 2022 14:51:37 +0100 On Sat, Dec 03, 2022 at 12:29:54PM +0100, Stefan Sperling wrote: > At present, 'gotadmin pack' copies reused deltas to the delta cache file, > and then copies deltas from this cache into the generated pack file. > The extra copy operation is redundant. It was implemented because, at > some earlier point in development history, the code which copied data > to the pack file was hard-wired to compress data while writing it out. > So we needed to store reused deltas in decompressed form somewhere. > > Nowadays, the cache is storing deltas in compressed form, and the extra > copy operation just slows us down for no good reason. Deltas are being > parsed and provided by got-read-pack either way, so this helper remains > in full control over the data which the main process ends up copying (but > not parsing). > > With the patch below, got-read-pack tells the main process only about the > locations of reused deltas in the original pack file, and the main process > reads deltas directly from that file. This speeds things up and will make > it easier to implement reuse of non-deltified packed objects later. Updated diff. The previous version of this diff was missing necessary changes for gotd(8). diff 270c41a2b8c0d37d0ea9710a656369efa551dfcd 35bbeac873a7d5494c91b3bf2cc0b6c7f89ab569 commit - 270c41a2b8c0d37d0ea9710a656369efa551dfcd commit + 35bbeac873a7d5494c91b3bf2cc0b6c7f89ab569 blob - 00518379da8d1ea10fa247bbd516547bf00019e1 blob + 188c28e9e67bdc37146176e911d6226b223c40d2 --- lib/got_lib_pack.h +++ lib/got_lib_pack.h @@ -222,5 +222,5 @@ const struct got_error *got_packfile_extract_raw_delta const struct got_error *got_packfile_extract_object_to_mem(uint8_t **, size_t *, struct got_object *, struct got_pack *); const struct got_error *got_packfile_extract_raw_delta(uint8_t **, size_t *, - size_t *, off_t *, off_t *, struct got_object_id *, uint64_t *, uint64_t *, - struct got_pack *, struct got_packidx *, int); + size_t *, off_t *, off_t *, off_t *, struct got_object_id *, uint64_t *, + uint64_t *, struct got_pack *, struct got_packidx *, int); blob - 7ad544d7095d79e959a67e5c9d4d84fd6da2c0f4 blob + 7a3c6579c0c3c5ab813f5c5cdb1093a791ac08b4 --- lib/got_lib_pack_create.h +++ lib/got_lib_pack_create.h @@ -95,8 +95,8 @@ got_pack_search_deltas(struct got_pack_metavec *v, struct got_pack_metavec *v); const struct got_error * -got_pack_search_deltas(struct got_pack_metavec *v, - struct got_object_idset *idset, int delta_cache_fd, +got_pack_search_deltas(struct got_packidx **packidx, struct got_pack **pack, + struct got_pack_metavec *v, struct got_object_idset *idset, int ncolored, int nfound, int ntrees, int ncommits, struct got_repository *repo, got_pack_progress_cb progress_cb, void *progress_arg, blob - 37d537ab48623fa50dd909cf2725e58151958e35 blob + 5010cb878bd556b6b5255748ec43c5bda5a060da --- lib/got_lib_privsep.h +++ lib/got_lib_privsep.h @@ -327,12 +327,6 @@ struct got_imsg_reused_delta { off_t delta_size; off_t delta_compressed_size; off_t delta_offset; - off_t delta_out_offset; - - /* - * Delta data has been written at delta_out_offset to the file - * descriptor passed via the GOT_IMSG_RAW_DELTA_OUTFD imsg. - */ }; struct got_imsg_reused_deltas { size_t ndeltas; blob - 890220f4bbc04471e5b17866775b2696968d025b blob + 68de5bf9f8523cf07c88e05955ce9b4c6abc451a --- lib/pack.c +++ lib/pack.c @@ -1897,7 +1897,8 @@ got_packfile_extract_raw_delta(uint8_t **delta_buf, si const struct got_error * got_packfile_extract_raw_delta(uint8_t **delta_buf, size_t *delta_size, - size_t *delta_compressed_size, off_t *delta_offset, off_t *base_offset, + size_t *delta_compressed_size, off_t *delta_offset, + off_t *delta_data_offset, off_t *base_offset, struct got_object_id *base_id, uint64_t *base_size, uint64_t *result_size, struct got_pack *pack, struct got_packidx *packidx, int idx) { @@ -1906,12 +1907,12 @@ got_packfile_extract_raw_delta(uint8_t **delta_buf, si uint8_t type; uint64_t size; size_t tslen, delta_hdrlen; - off_t delta_data_offset; *delta_buf = NULL; *delta_size = 0; *delta_compressed_size = 0; *delta_offset = 0; + *delta_data_offset = 0; *base_offset = 0; *base_size = 0; *result_size = 0; @@ -1955,9 +1956,9 @@ got_packfile_extract_raw_delta(uint8_t **delta_buf, si offset + delta_hdrlen < delta_hdrlen) return got_error(GOT_ERR_BAD_DELTA); - delta_data_offset = offset + tslen + delta_hdrlen; + *delta_data_offset = offset + tslen + delta_hdrlen; err = read_raw_delta_data(delta_buf, delta_size, delta_compressed_size, - base_size, result_size, delta_data_offset, pack, packidx); + base_size, result_size, *delta_data_offset, pack, packidx); if (err) return err; blob - 44ccdb3791fa3057c6bda8686d44d57d48169e2e blob + 0dda4f95f2b3f8b16523c768cd55fa04de120f90 --- lib/pack_create.c +++ lib/pack_create.c @@ -1597,11 +1597,16 @@ write_packed_object(off_t *packfile_size, int packfd, char buf[32]; int nh; struct got_raw_object *raw = NULL; - off_t outlen; + off_t outlen, delta_offset; csum.output_sha1 = ctx; csum.output_crc = NULL; + if (m->reused_delta_offset) + delta_offset = m->reused_delta_offset; + else + delta_offset = m->delta_offset; + m->off = *packfile_size; if (m->delta_len == 0) { err = got_object_raw_open(&raw, outfd, repo, &m->id); @@ -1650,15 +1655,14 @@ write_packed_object(off_t *packfile_size, int packfd, err = deltahdr(packfile_size, ctx, packfd, m); if (err) goto done; - err = hcopy_mmap(delta_cache_map, m->delta_offset, + err = hcopy_mmap(delta_cache_map, delta_offset, delta_cache_size, packfd, m->delta_compressed_len, ctx); if (err) goto done; *packfile_size += m->delta_compressed_len; } else { - if (fseeko(delta_cache, m->delta_offset, SEEK_SET) - == -1) { + if (fseeko(delta_cache, delta_offset, SEEK_SET) == -1) { err = got_error_from_errno("fseeko"); goto done; } @@ -1678,8 +1682,8 @@ genpack(uint8_t *pack_sha1, int packfd, FILE *delta_ca } static const struct got_error * -genpack(uint8_t *pack_sha1, int packfd, FILE *delta_cache, - struct got_pack_meta **deltify, int ndeltify, +genpack(uint8_t *pack_sha1, int packfd, struct got_pack *reuse_pack, + FILE *delta_cache, struct got_pack_meta **deltify, int ndeltify, struct got_pack_meta **reuse, int nreuse, int ncolored, int nfound, int ntrees, int nours, struct got_repository *repo, @@ -1697,6 +1701,7 @@ genpack(uint8_t *pack_sha1, int packfd, FILE *delta_ca int delta_cache_fd = -1; uint8_t *delta_cache_map = NULL; size_t delta_cache_size = 0; + FILE *packfile = NULL; SHA1Init(&ctx); @@ -1752,6 +1757,19 @@ genpack(uint8_t *pack_sha1, int packfd, FILE *delta_ca qsort(reuse, nreuse, sizeof(struct got_pack_meta *), reuse_write_order_cmp); + if (nreuse > 0 && reuse_pack->map == NULL) { + int fd = dup(reuse_pack->fd); + if (fd == -1) { + err = got_error_from_errno("dup"); + goto done; + } + packfile = fdopen(fd, "r"); + if (packfile == NULL) { + err = got_error_from_errno("fdopen"); + close(fd); + goto done; + } + } for (i = 0; i < nreuse; i++) { err = got_pack_report_progress(progress_cb, progress_arg, rl, ncolored, nfound, ntrees, packfile_size, nours, @@ -1760,7 +1778,7 @@ genpack(uint8_t *pack_sha1, int packfd, FILE *delta_ca goto done; m = reuse[i]; err = write_packed_object(&packfile_size, packfd, - delta_cache, delta_cache_map, delta_cache_size, + packfile, reuse_pack->map, reuse_pack->filesize, m, &outfd, &ctx, repo); if (err) goto done; @@ -1786,6 +1804,8 @@ done: err = got_error_from_errno("munmap"); if (delta_cache_fd != -1 && close(delta_cache_fd) == -1 && err == NULL) err = got_error_from_errno("close"); + if (packfile && fclose(packfile) == EOF && err == NULL) + err = got_error_from_errno("fclose"); return err; } @@ -1810,8 +1830,9 @@ got_pack_create(uint8_t *packsha1, int packfd, FILE *d struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg) { const struct got_error *err; - int delta_cache_fd = -1; struct got_object_idset *idset; + struct got_packidx *reuse_packidx = NULL; + struct got_pack *reuse_pack = NULL; struct got_pack_metavec deltify, reuse; int ncolored = 0, nfound = 0, ntrees = 0; size_t ndeltify; @@ -1844,12 +1865,6 @@ got_pack_create(uint8_t *packsha1, int packfd, FILE *d goto done; } - delta_cache_fd = dup(fileno(delta_cache)); - if (delta_cache_fd == -1) { - err = got_error_from_errno("dup"); - goto done; - } - reuse.metasz = 64; reuse.meta = calloc(reuse.metasz, sizeof(struct got_pack_meta *)); @@ -1858,12 +1873,18 @@ got_pack_create(uint8_t *packsha1, int packfd, FILE *d goto done; } - err = got_pack_search_deltas(&reuse, idset, delta_cache_fd, - ncolored, nfound, ntrees, nours, + err = got_pack_search_deltas(&reuse_packidx, &reuse_pack, + &reuse, idset, ncolored, nfound, ntrees, nours, repo, progress_cb, progress_arg, rl, cancel_cb, cancel_arg); if (err) goto done; + if (reuse_packidx && reuse_pack) { + err = got_repo_pin_pack(repo, reuse_packidx, reuse_pack); + if (err) + goto done; + } + if (fseeko(delta_cache, 0L, SEEK_END) == -1) { err = got_error_from_errno("fseeko"); goto done; @@ -1910,7 +1931,7 @@ got_pack_create(uint8_t *packsha1, int packfd, FILE *d goto done; } - err = genpack(packsha1, packfd, delta_cache, deltify.meta, + err = genpack(packsha1, packfd, reuse_pack, delta_cache, deltify.meta, deltify.nmeta, reuse.meta, reuse.nmeta, ncolored, nfound, ntrees, nours, repo, progress_cb, progress_arg, rl, cancel_cb, cancel_arg); @@ -1920,7 +1941,6 @@ done: free_nmeta(deltify.meta, deltify.nmeta); free_nmeta(reuse.meta, reuse.nmeta); got_object_idset_free(idset); - if (delta_cache_fd != -1 && close(delta_cache_fd) == -1 && err == NULL) - err = got_error_from_errno("close"); + got_repo_unpin_pack(repo); return err; } blob - 2adcf9e794f2de58f508160da8633a854ebfdef9 blob + f9b0db5b972aba181cf96805d21968bce7c814c6 --- lib/pack_create_io.c +++ lib/pack_create_io.c @@ -68,7 +68,6 @@ struct search_deltas_arg { struct got_packidx *packidx; struct got_pack *pack; struct got_object_idset *idset; - int delta_cache_fd; int ncolored, nfound, ntrees, ncommits; got_pack_progress_cb progress_cb; void *progress_arg; @@ -86,7 +85,7 @@ search_delta_for_object(struct got_object_id *id, void uint8_t *delta_buf = NULL; uint64_t base_size, result_size; size_t delta_size, delta_compressed_size; - off_t delta_offset, base_offset; + off_t delta_offset, delta_data_offset, base_offset; struct got_object_id base_id; if (a->cancel_cb) { @@ -100,8 +99,9 @@ search_delta_for_object(struct got_object_id *id, void return NULL; /* object not present in our pack file */ err = got_packfile_extract_raw_delta(&delta_buf, &delta_size, - &delta_compressed_size, &delta_offset, &base_offset, &base_id, - &base_size, &result_size, a->pack, a->packidx, obj_idx); + &delta_compressed_size, &delta_offset, &delta_data_offset, + &base_offset, &base_id, &base_size, &result_size, + a->pack, a->packidx, obj_idx); if (err) { if (err->code == GOT_ERR_OBJ_TYPE) return NULL; /* object not stored as a delta */ @@ -120,20 +120,7 @@ search_delta_for_object(struct got_object_id *id, void if (got_object_idset_contains(a->idset, &base_id)) { struct got_pack_meta *m, *base; - off_t delta_out_offset; - ssize_t w; - delta_out_offset = lseek(a->delta_cache_fd, 0, SEEK_CUR); - if (delta_out_offset == -1) { - err = got_error_from_errno("lseek"); - goto done; - } - w = write(a->delta_cache_fd, delta_buf, delta_compressed_size); - if (w != delta_compressed_size) { - err = got_error_from_errno("write"); - goto done; - } - m = got_object_idset_get(a->idset, id); if (m == NULL) { err = got_error_msg(GOT_ERR_NO_OBJ, @@ -158,8 +145,8 @@ search_delta_for_object(struct got_object_id *id, void m->size = result_size; m->delta_len = delta_size; m->delta_compressed_len = delta_compressed_size; - m->reused_delta_offset = delta_offset; - m->delta_offset = delta_out_offset; + m->reused_delta_offset = delta_data_offset; + m->delta_offset = 0; err = got_pack_add_meta(m, a->v); if (err) @@ -177,35 +164,35 @@ got_pack_search_deltas(struct got_pack_metavec *v, } const struct got_error * -got_pack_search_deltas(struct got_pack_metavec *v, - struct got_object_idset *idset, int delta_cache_fd, +got_pack_search_deltas(struct got_packidx **packidx, struct got_pack **pack, + struct got_pack_metavec *v, struct got_object_idset *idset, int ncolored, int nfound, int ntrees, int ncommits, struct got_repository *repo, got_pack_progress_cb progress_cb, void *progress_arg, struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg) { const struct got_error *err = NULL; - struct got_packidx *packidx; - struct got_pack *pack; struct search_deltas_arg sda; - err = got_pack_find_pack_for_reuse(&packidx, repo); + *packidx = NULL; + *pack = NULL; + + err = got_pack_find_pack_for_reuse(packidx, repo); if (err) return err; - if (packidx == NULL) + if (*packidx == NULL) return NULL; - err = got_pack_cache_pack_for_packidx(&pack, packidx, repo); + err = got_pack_cache_pack_for_packidx(pack, *packidx, repo); if (err) return err; memset(&sda, 0, sizeof(sda)); sda.v = v; sda.idset = idset; - sda.pack = pack; - sda.packidx = packidx; - sda.delta_cache_fd = delta_cache_fd; + sda.pack = *pack; + sda.packidx = *packidx; sda.ncolored = ncolored; sda.nfound = nfound; sda.ntrees = ntrees; blob - a08374f7b103b4078c61a1872a293f454f078ec4 blob + 63e729e49404dd6fb94579bd500fa06cee7ac5fe --- lib/pack_create_privsep.c +++ lib/pack_create_privsep.c @@ -114,7 +114,7 @@ recv_reused_delta(struct got_imsg_reused_delta *delta, m->delta_len = delta->delta_size; m->delta_compressed_len = delta->delta_compressed_size; - m->delta_offset = delta->delta_out_offset; + m->delta_offset = 0; m->prev = base; m->size = delta->result_size; m->reused_delta_offset = delta->delta_offset; @@ -125,69 +125,45 @@ static const struct got_error * return got_pack_add_meta(m, v); } -static const struct got_error * -prepare_delta_reuse(struct got_pack *pack, struct got_packidx *packidx, - int delta_outfd, struct got_repository *repo) -{ - const struct got_error *err = NULL; - - if (!pack->child_has_delta_outfd) { - int outfd_child; - outfd_child = dup(delta_outfd); - if (outfd_child == -1) { - err = got_error_from_errno("dup"); - goto done; - } - err = got_privsep_send_raw_delta_outfd( - pack->privsep_child->ibuf, outfd_child); - if (err) - goto done; - pack->child_has_delta_outfd = 1; - } - - err = got_privsep_send_delta_reuse_req(pack->privsep_child->ibuf); -done: - return err; -} - const struct got_error * -got_pack_search_deltas(struct got_pack_metavec *v, - struct got_object_idset *idset, int delta_cache_fd, +got_pack_search_deltas(struct got_packidx **packidx, struct got_pack **pack, + struct got_pack_metavec *v, struct got_object_idset *idset, int ncolored, int nfound, int ntrees, int ncommits, struct got_repository *repo, got_pack_progress_cb progress_cb, void *progress_arg, struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg) { const struct got_error *err = NULL; - struct got_packidx *packidx; - struct got_pack *pack; struct got_imsg_reused_delta deltas[GOT_IMSG_REUSED_DELTAS_MAX_NDELTAS]; size_t ndeltas, i; - err = got_pack_find_pack_for_reuse(&packidx, repo); + *packidx = NULL; + *pack = NULL; + + err = got_pack_find_pack_for_reuse(packidx, repo); if (err) return err; - if (packidx == NULL) + if (*packidx == NULL) return NULL; - err = got_pack_cache_pack_for_packidx(&pack, packidx, repo); + err = got_pack_cache_pack_for_packidx(pack, *packidx, repo); if (err) - return err; + goto done; - if (pack->privsep_child == NULL) { - err = got_pack_start_privsep_child(pack, packidx); + if ((*pack)->privsep_child == NULL) { + err = got_pack_start_privsep_child(*pack, *packidx); if (err) - return err; + goto done; } - err = prepare_delta_reuse(pack, packidx, delta_cache_fd, repo); + err = got_privsep_send_delta_reuse_req((*pack)->privsep_child->ibuf); if (err) - return err; + goto done; - err = send_idset(pack->privsep_child->ibuf, idset); + err = send_idset((*pack)->privsep_child->ibuf, idset); if (err) - return err; + goto done; for (;;) { int done = 0; @@ -199,7 +175,7 @@ got_pack_search_deltas(struct got_pack_metavec *v, } err = got_privsep_recv_reused_deltas(&done, deltas, &ndeltas, - pack->privsep_child->ibuf); + (*pack)->privsep_child->ibuf); if (err || done) break; blob - 5ede5726ac196d5b3ba820a830a44852e453b174 blob + 7500a424c3c703e986507e47bad61b771851f850 --- lib/repository.c +++ lib/repository.c @@ -1519,7 +1519,8 @@ got_repo_pin_pack(struct got_repository *repo, struct repo->pinned_pack = pinned_pack; repo->pinned_packidx = pinned_packidx; - repo->pinned_pid = repo->packs[pinned_pack].privsep_child->pid; + if (repo->packs[pinned_pack].privsep_child) + repo->pinned_pid = repo->packs[pinned_pack].privsep_child->pid; return NULL; } blob - e49fd95ba0b7919efe9005b306782db008b22370 blob + a86f3dde84274fdf37db09da580f7bba1e012e46 --- libexec/got-read-pack/got-read-pack.c +++ libexec/got-read-pack/got-read-pack.c @@ -869,7 +869,7 @@ raw_delta_request(struct imsg *imsg, struct imsgbuf *i const struct got_error *err = NULL; struct got_imsg_raw_delta_request req; size_t datalen, delta_size, delta_compressed_size; - off_t delta_offset; + off_t delta_offset, delta_data_offset; uint8_t *delta_buf = NULL; struct got_object_id id, base_id; off_t base_offset, delta_out_offset = 0; @@ -885,8 +885,9 @@ raw_delta_request(struct imsg *imsg, struct imsgbuf *i imsg->fd = -1; err = got_packfile_extract_raw_delta(&delta_buf, &delta_size, - &delta_compressed_size, &delta_offset, &base_offset, &base_id, - &base_size, &result_size, pack, packidx, req.idx); + &delta_compressed_size, &delta_offset, &delta_data_offset, + &base_offset, &base_id, &base_size, &result_size, + pack, packidx, req.idx); if (err) goto done; @@ -924,7 +925,6 @@ struct search_deltas_arg { struct got_packidx *packidx; struct got_pack *pack; struct got_object_idset *idset; - FILE *delta_outfile; struct got_imsg_reused_delta deltas[GOT_IMSG_REUSED_DELTAS_MAX_NDELTAS]; size_t ndeltas; }; @@ -938,7 +938,7 @@ search_delta_for_object(struct got_object_id *id, void uint8_t *delta_buf = NULL; uint64_t base_size, result_size; size_t delta_size, delta_compressed_size; - off_t delta_offset, base_offset; + off_t delta_offset, delta_data_offset, base_offset; struct got_object_id base_id; if (sigint_received) @@ -949,8 +949,9 @@ search_delta_for_object(struct got_object_id *id, void return NULL; /* object not present in our pack file */ err = got_packfile_extract_raw_delta(&delta_buf, &delta_size, - &delta_compressed_size, &delta_offset, &base_offset, &base_id, - &base_size, &result_size, a->pack, a->packidx, obj_idx); + &delta_compressed_size, &delta_offset, &delta_data_offset, + &base_offset, &base_id, &base_size, &result_size, + a->pack, a->packidx, obj_idx); if (err) { if (err->code == GOT_ERR_OBJ_TYPE) return NULL; /* object not stored as a delta */ @@ -969,16 +970,7 @@ search_delta_for_object(struct got_object_id *id, void if (got_object_idset_contains(a->idset, &base_id)) { struct got_imsg_reused_delta *delta; - off_t delta_out_offset = ftello(a->delta_outfile); - size_t w; - w = fwrite(delta_buf, 1, delta_compressed_size, - a->delta_outfile); - if (w != delta_compressed_size) { - err = got_ferror(a->delta_outfile, GOT_ERR_IO); - goto done; - } - delta = &a->deltas[a->ndeltas++]; memcpy(&delta->id, id, sizeof(delta->id)); memcpy(&delta->base_id, &base_id, sizeof(delta->base_id)); @@ -986,8 +978,7 @@ search_delta_for_object(struct got_object_id *id, void delta->result_size = result_size; delta->delta_size = delta_size; delta->delta_compressed_size = delta_compressed_size; - delta->delta_offset = delta_offset; - delta->delta_out_offset = delta_out_offset; + delta->delta_offset = delta_data_offset; if (a->ndeltas >= GOT_IMSG_REUSED_DELTAS_MAX_NDELTAS) { err = got_privsep_send_reused_deltas(a->ibuf, @@ -1054,7 +1045,7 @@ delta_reuse_request(struct imsg *imsg, struct imsgbuf static const struct got_error * delta_reuse_request(struct imsg *imsg, struct imsgbuf *ibuf, - FILE *delta_outfile, struct got_pack *pack, struct got_packidx *packidx) + struct got_pack *pack, struct got_packidx *packidx) { const struct got_error *err = NULL; struct got_object_idset *idset; @@ -1073,7 +1064,6 @@ delta_reuse_request(struct imsg *imsg, struct imsgbuf sda.idset = idset; sda.pack = pack; sda.packidx = packidx; - sda.delta_outfile = delta_outfile; err = got_object_idset_for_each(idset, search_delta_for_object, &sda); if (err) goto done; @@ -1085,11 +1075,6 @@ delta_reuse_request(struct imsg *imsg, struct imsgbuf goto done; } - if (fflush(delta_outfile) == -1) { - err = got_error_from_errno("fflush"); - goto done; - } - err = got_privsep_send_reused_deltas_done(ibuf); done: got_object_idset_free(idset); @@ -2001,12 +1986,7 @@ main(int argc, char *argv[]) pack, packidx); break; case GOT_IMSG_DELTA_REUSE_REQUEST: - if (delta_outfile == NULL) { - err = got_error(GOT_ERR_PRIVSEP_NO_FD); - break; - } - err = delta_reuse_request(&imsg, &ibuf, - delta_outfile, pack, packidx); + err = delta_reuse_request(&imsg, &ibuf, pack, packidx); break; case GOT_IMSG_COMMIT_REQUEST: err = commit_request(&imsg, &ibuf, pack, packidx,