From: Stefan Sperling Subject: Re: reuse deltas while packing To: gameoftrees@openbsd.org Date: Sat, 5 Feb 2022 15:39:00 +0100 Hi, Is anyone planning to test or review this? On Thu, Jan 27, 2022 at 03:01:04PM +0100, Stefan Sperling wrote: > This patch adds initial support for reusing existing deltas while > creating new pack files. This is important for achieving useful > performance, especially on a server which is serving 'got clone'. > Clients which want to upload large pack files, e.g. to populate > an empty repository on a server, will also benefit from this. > > This patch by no means makes things as fast as they need to be. > But there is a noticeable performance improvement. On my system the > time spent on packing got.git goes from about 30 seconds to about > 10 seconds, with the "deltify:" step running significantly faster. > Packing all of OpenBSD src.git now takes about half an hour on > my laptop. > > The way delta-reuse works is that we first identify exactly one pack > file in the repository as a source of reused deltas. The order in > which deltas get written to a pack file matters for indexing the > pack file, since any dependencies of a delta should appear before > the delta itself. Otherwise Got or even Git may fail to index the pack. > Using just one pack file as a source means we can simply write reused > deltas in the same order they had in the original pack. If we were reusing > deltas from multiple pack files, avoiding such issues would be non-trivial. > > The pack we will reuse is the one that contains the largest number of > objects, which should be fine for well-kept repositories where most > objects will be packed as a result of regular maintenance. > (This feature is best tested after running 'git repack -a -d'; we do > not yet have equivalent functionality in Got's own tooling as there > is no way to delete redundant pack files with gotadmin yet.) 
> > got-read-pack supports a new imsg request which asks for plain delta > data to be written into a file. This file is the delta cache used by > the gotadmin process, meaning we will initially fill this cache with > any deltas we can reuse. > > A delta can be reused if we want to add both the corresponding object > itself as well as its delta's base to the pack file. Finding the base > object ID of a delta can be expensive without a reverse-mapping from > a given pack file offset to an index into the pack index table which > can be used to look up an object ID. > This reverse-mapping is computed on the fly by got-read-pack when > the first request for a raw delta arrives. > > Once the delta cache file has been filled with reused deltas, we > deltify any remaining objects as usual and append them to the cache. > > When writing the pack file we first write any objects we deltified > ourselves since they have no dependencies among the reused deltas, > and then write any reused deltas out in the order they appeared in > their original pack file. (We could probably make our own deltified > objects depend on reused deltas, but for now this is not worth the > extra complexity.) > > There was some refactoring necessary to make this work. In particular, > the got_pack_metavec array is no longer populated during the initial > object discovery step. > > Packing still requires a large amount of disk space in /tmp. > One reason is that deltas, whether reused or not, are stored in the > delta cache file in uncompressed form. This can also be fixed later. > > Since this patch is large I will try to split it into a series of > smaller commits before sending it to our repo. > > ok? 
> > diff 8669fd5c8f1a790cfe005834fd9fbd34ceef71f6 620a4020edb35fa9c0505d8fdfd619d864d1e2fb > blob - 60920514cdf4045702691ca0ad19a5d080a8893e > blob + 7a4aaaaec7e976d89c7989136f80818dd4091850 > --- lib/got_lib_object.h > +++ lib/got_lib_object.h > @@ -102,6 +102,14 @@ const struct got_error *got_object_open_loose_fd(int * > struct got_repository *); > const struct got_error *got_object_open_packed(struct got_object **, > struct got_object_id *, struct got_repository *); > +struct got_pack; > +struct got_packidx; > +const struct got_error *got_object_open_from_packfile(struct got_object **, > + struct got_object_id *, struct got_pack *, struct got_packidx *, int, > + struct got_repository *); > +const struct got_error *got_object_read_raw_delta(uint64_t *, uint64_t *, > + off_t *, off_t *, off_t *, struct got_object_id **, int, > + struct got_packidx *, int, struct got_object_id *, struct got_repository *); > const struct got_error *got_object_read_header_privsep(struct got_object **, > struct got_object_id *, struct got_repository *, int); > const struct got_error *got_object_open(struct got_object **, > blob - 750be4962021434fb27d2dcc95867ff2e333a776 > blob + 5346f7f2ce1559b966b7c6e2f1bde6da40d729a4 > --- lib/got_lib_object_idset.h > +++ lib/got_lib_object_idset.h > @@ -31,3 +31,13 @@ const struct got_error *got_object_idset_for_each(stru > const struct got_error *(*cb)(struct got_object_id *, void *, void *), > void *); > int got_object_idset_num_elements(struct got_object_idset *); > + > +struct got_object_idset_element; > +struct got_object_idset_element *got_object_idset_get_element( > + struct got_object_idset *, struct got_object_id *); > +void *got_object_idset_get_element_data(struct got_object_idset_element *); > +const struct got_error *got_object_idset_for_each_element(struct got_object_idset *, > + const struct got_error *(*cb)(struct got_object_idset_element *, void *), void *); > +void got_object_idset_remove_element(struct got_object_idset *, > + 
struct got_object_idset_element *); > + > blob - 2a9c135628c6fed4aeef4fb17093f23e0b5df9ae > blob + e8fb373e287ee80486d50ed07964d9d39924308d > --- lib/got_lib_pack.h > +++ lib/got_lib_pack.h > @@ -22,6 +22,7 @@ struct got_pack { > size_t filesize; > struct got_privsep_child *privsep_child; > int child_has_tempfiles; > + int child_has_delta_outfd; > struct got_delta_cache *delta_cache; > }; > > @@ -96,6 +97,16 @@ struct got_packidx_v2_hdr { > struct got_packidx_trailer *trailer; > }; > > +struct got_pack_offset_index { > + uint32_t offset; > + uint32_t idx; > +}; > + > +struct got_pack_large_offset_index { > + uint64_t offset; > + uint32_t idx; > +}; > + > /* An open pack index file. */ > struct got_packidx { > char *path_packidx; /* actual on-disk path */ > @@ -104,6 +115,8 @@ struct got_packidx { > size_t len; > size_t nlargeobj; > struct got_packidx_v2_hdr hdr; /* convenient pointers into map */ > + struct got_pack_offset_index *sorted_offsets; > + struct got_pack_large_offset_index *sorted_large_offsets; > }; > > struct got_packfile_hdr { > @@ -177,6 +190,10 @@ const struct got_error *got_packidx_close(struct got_p > const struct got_error *got_packidx_get_packfile_path(char **, const char *); > off_t got_packidx_get_object_offset(struct got_packidx *, int idx); > int got_packidx_get_object_idx(struct got_packidx *, struct got_object_id *); > +const struct got_error *got_packidx_get_offset_idx(int *, struct got_packidx *, > + off_t); > +const struct got_error *got_packidx_get_object_id(struct got_object_id *, > + struct got_packidx *, int); > const struct got_error *got_packidx_match_id_str_prefix( > struct got_object_id_queue *, struct got_packidx *, const char *); > > @@ -194,5 +211,8 @@ const struct got_error *got_packfile_extract_object(st > struct got_object *, FILE *, FILE *, FILE *); > const struct got_error *got_packfile_extract_object_to_mem(uint8_t **, size_t *, > struct got_object *, struct got_pack *); > +const struct got_error 
*got_packfile_extract_raw_delta(uint8_t **, size_t *, > + off_t *, off_t *, struct got_object_id *, uint64_t *, uint64_t *, > + struct got_pack *, struct got_packidx *, int); > struct got_pack *got_repo_get_cached_pack(struct got_repository *, > const char *); > blob - ce62cbf5e948bfee2b29b0f0524208b1a15a93e9 > blob + 274e89878290befef48084afc0ae191cd5c36b16 > --- lib/got_lib_privsep.h > +++ lib/got_lib_privsep.h > @@ -174,6 +174,11 @@ enum got_imsg_type { > GOT_IMSG_RAW_OBJECT_OUTFD, > GOT_IMSG_PACKED_RAW_OBJECT_REQUEST, > GOT_IMSG_RAW_OBJECT, > + > + /* Read raw delta data from pack files. */ > + GOT_IMSG_RAW_DELTA_OUTFD, > + GOT_IMSG_RAW_DELTA_REQUEST, > + GOT_IMSG_RAW_DELTA, > }; > > /* Structure for GOT_IMSG_ERROR. */ > @@ -262,6 +267,21 @@ struct got_imsg_raw_obj { > (MAX_IMSGSIZE - IMSG_HEADER_SIZE - sizeof(struct got_imsg_raw_obj)) > }; > > +/* Structure for GOT_IMSG_RAW_DELTA. */ > +struct got_imsg_raw_delta { > + uint8_t base_id[SHA1_DIGEST_LENGTH]; > + uint64_t base_size; > + uint64_t result_size; > + off_t delta_size; > + off_t delta_offset; > + off_t delta_out_offset; > + > + /* > + * Delta data has been written at delta_out_offset to the file > + * descriptor passed via the GOT_IMSG_RAW_DELTA_OUTFD imsg. > + */ > +}; > + > /* Structure for GOT_IMSG_TAG data. */ > struct got_imsg_tag_object { > uint8_t id[SHA1_DIGEST_LENGTH]; > @@ -428,6 +448,26 @@ struct got_imsg_packed_object { > int idx; > } __attribute__((__packed__)); > > +/* > + * Structure for GOT_IMSG_DELTA data. > + */ > +struct got_imsg_delta { > + /* These fields are the same as in struct got_delta. */ > + off_t offset; > + size_t tslen; > + int type; > + size_t size; > + off_t data_offset; > +}; > + > +/* > + * Structure for GOT_IMSG_RAW_DELTA_REQUEST data. 
> + */ > +struct got_imsg_raw_delta_request { > + uint8_t id[SHA1_DIGEST_LENGTH]; > + int idx; > +}; > + > /* Structure for GOT_IMSG_COMMIT_TRAVERSAL_REQUEST */ > struct got_imsg_commit_traversal_request { > uint8_t id[SHA1_DIGEST_LENGTH]; > @@ -588,4 +628,12 @@ const struct got_error *got_privsep_recv_traversed_com > struct got_commit_object **, struct got_object_id **, > struct got_object_id_queue *, struct imsgbuf *); > > +const struct got_error *got_privsep_send_raw_delta_req(struct imsgbuf *, int, > + struct got_object_id *); > +const struct got_error *got_privsep_send_raw_delta_outfd(struct imsgbuf *, int); > +const struct got_error *got_privsep_send_raw_delta(struct imsgbuf *, uint64_t, > + uint64_t, off_t, off_t, off_t, struct got_object_id *); > +const struct got_error *got_privsep_recv_raw_delta(uint64_t *, uint64_t *, > + off_t *, off_t *, off_t *, struct got_object_id **, struct imsgbuf *); > + > void got_privsep_exec_child(int[2], const char *, const char *); > blob - 831cb967da0e76447d97b70ff058416fbf492de4 > blob + 798762e89557feb475f650e62e549abe41eb43c4 > --- lib/got_lib_repository.h > +++ lib/got_lib_repository.h > @@ -122,7 +122,13 @@ const struct got_error*got_repo_cache_raw_object(struc > struct got_raw_object *got_repo_get_cached_raw_object(struct got_repository *, > struct got_object_id *); > int got_repo_is_packidx_filename(const char *, size_t); > +int got_repo_check_packidx_bloom_filter(struct got_repository *, > + const char *, struct got_object_id *); > const struct got_error *got_repo_search_packidx(struct got_packidx **, int *, > struct got_repository *, struct got_object_id *); > +const struct got_error *got_repo_list_packidx(struct got_pathlist_head *, > + struct got_repository *); > +const struct got_error *got_repo_get_packidx(struct got_packidx **, const char *, > + struct got_repository *); > const struct got_error *got_repo_cache_pack(struct got_pack **, > struct got_repository *, const char *, struct got_packidx *); > blob - 
ce95f8795c1e303f174dcdf4d6eeb41563569e6f > blob + 8218b2c4c146e832c63f84d4a9f6a43f8b3283f8 > --- lib/object.c > +++ lib/object.c > @@ -267,6 +267,7 @@ start_pack_privsep_child(struct got_pack *pack, struct > return err; > } > pack->child_has_tempfiles = 0; > + pack->child_has_delta_outfd = 0; > > if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, imsg_fds) == -1) { > err = got_error_from_errno("socketpair"); > @@ -376,6 +377,71 @@ done: > return err; > } > > +const struct got_error * > +got_object_open_from_packfile(struct got_object **obj, struct got_object_id *id, > + struct got_pack *pack, struct got_packidx *packidx, int obj_idx, > + struct got_repository *repo) > +{ > + return read_packed_object_privsep(obj, repo, pack, packidx, > + obj_idx, id); > +} > + > +const struct got_error * > +got_object_read_raw_delta(uint64_t *base_size, uint64_t *result_size, > + off_t *delta_size, off_t *delta_offset, off_t *delta_out_offset, > + struct got_object_id **base_id, int delta_cache_fd, > + struct got_packidx *packidx, int obj_idx, struct got_object_id *id, > + struct got_repository *repo) > +{ > + const struct got_error *err = NULL; > + struct got_pack *pack = NULL; > + char *path_packfile; > + > + *base_size = 0; > + *result_size = 0; > + *delta_size = 0; > + *delta_offset = 0; > + *delta_out_offset = 0; > + > + err = got_packidx_get_packfile_path(&path_packfile, > + packidx->path_packidx); > + if (err) > + return err; > + > + pack = got_repo_get_cached_pack(repo, path_packfile); > + if (pack == NULL) { > + err = got_repo_cache_pack(&pack, repo, path_packfile, packidx); > + if (err) > + return err; > + } > + > + if (pack->privsep_child == NULL) { > + err = start_pack_privsep_child(pack, packidx); > + if (err) > + return err; > + } > + > + if (!pack->child_has_delta_outfd) { > + int outfd_child; > + outfd_child = dup(delta_cache_fd); > + if (outfd_child == -1) > + return got_error_from_errno("dup"); > + err = got_privsep_send_raw_delta_outfd( > + 
pack->privsep_child->ibuf, outfd_child); > + if (err) > + return err; > + pack->child_has_delta_outfd = 1; > + } > + > + err = got_privsep_send_raw_delta_req(pack->privsep_child->ibuf, > + obj_idx, id); > + if (err) > + return err; > + > + return got_privsep_recv_raw_delta(base_size, result_size, delta_size, > + delta_offset, delta_out_offset, base_id, pack->privsep_child->ibuf); > +} > + > static const struct got_error * > request_object(struct got_object **obj, struct got_object_id *id, > struct got_repository *repo, int fd) > blob - 152534fe9ad8d3e39e0d2945b284bbae1f218a2f > blob + bec28e2592f77e2e43e9df547016ff61d8886b42 > --- lib/object_idset.c > +++ lib/object_idset.c > @@ -190,5 +190,42 @@ got_object_idset_num_elements(struct got_object_idset > return set->totelem; > } > > +struct got_object_idset_element * > +got_object_idset_get_element(struct got_object_idset *set, struct got_object_id *id) > +{ > + return find_element(set, id); > +} > + > +void * > +got_object_idset_get_element_data(struct got_object_idset_element *entry) > +{ > + return entry->data; > +} > + > +const struct got_error * > +got_object_idset_for_each_element(struct got_object_idset *set, > + const struct got_error *(*cb)(struct got_object_idset_element *, void *), > + void *arg) > +{ > + const struct got_error *err; > + struct got_object_idset_element *entry, *tmp; > + > + RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) { > + err = (*cb)(entry, arg); > + if (err) > + return err; > + } > + return NULL; > +} > + > +void > +got_object_idset_remove_element(struct got_object_idset *set, > + struct got_object_idset_element *entry) > +{ > + RB_REMOVE(got_object_idset_tree, &set->entries, entry); > + free(entry); > + set->totelem--; > +} > + > RB_GENERATE(got_object_idset_tree, got_object_idset_element, entry, > cmp_elements); > blob - 8a01d22a8f4fe6686821dab1e44a139ea351a8a1 > blob + c981da3572edb6919e64e8fa50227b8ab99e0702 > --- lib/pack.c > +++ lib/pack.c > @@ -433,6 +433,8 @@ 
got_packidx_close(struct got_packidx *packidx) > } > if (close(packidx->fd) == -1 && err == NULL) > err = got_error_from_errno("close"); > + free(packidx->sorted_offsets); > + free(packidx->sorted_large_offsets); > free(packidx); > > return err; > @@ -508,7 +510,155 @@ got_packidx_get_object_idx(struct got_packidx *packidx > return -1; > } > > +static int > +offset_cmp(const void *pa, const void *pb) > +{ > + const struct got_pack_offset_index *a, *b; > + > + a = (const struct got_pack_offset_index *)pa; > + b = (const struct got_pack_offset_index *)pb; > + > + if (a->offset < b->offset) > + return -1; > + else if (a->offset > b->offset) > + return 1; > + > + return 0; > +} > + > +static int > +large_offset_cmp(const void *pa, const void *pb) > +{ > + const struct got_pack_large_offset_index *a, *b; > + > + a = (const struct got_pack_large_offset_index *)pa; > + b = (const struct got_pack_large_offset_index *)pb; > + > + if (a->offset < b->offset) > + return -1; > + else if (a->offset > b->offset) > + return 1; > + > + return 0; > +} > + > +static const struct got_error * > +build_offset_index(struct got_packidx *p) > +{ > + uint32_t nobj = be32toh(p->hdr.fanout_table[0xff]); > + unsigned int i, j, k; > + > + p->sorted_offsets = calloc(nobj - p->nlargeobj, > + sizeof(p->sorted_offsets[0])); > + if (p->sorted_offsets == NULL) > + return got_error_from_errno("calloc"); > + > + if (p->nlargeobj > 0) { > + p->sorted_large_offsets = calloc(p->nlargeobj, > + sizeof(p->sorted_large_offsets[0])); > + if (p->sorted_large_offsets == NULL) > + return got_error_from_errno("calloc"); > + } > + > + j = 0; > + k = 0; > + for (i = 0; i < nobj; i++) { > + uint32_t offset = be32toh(p->hdr.offsets[i]); > + if (offset & GOT_PACKIDX_OFFSET_VAL_IS_LARGE_IDX) { > + uint64_t loffset; > + uint32_t idx; > + idx = offset & GOT_PACKIDX_OFFSET_VAL_MASK; > + if (idx >= p->nlargeobj || > + p->nlargeobj == 0 || > + p->hdr.large_offsets == NULL) > + return got_error(GOT_ERR_BAD_PACKIDX); > + 
loffset = be64toh(p->hdr.large_offsets[idx]); > + p->sorted_large_offsets[j].offset = loffset; > + p->sorted_large_offsets[j].idx = i; > + j++; > + } else { > + p->sorted_offsets[k].offset = offset; > + p->sorted_offsets[k].idx = i; > + k++; > + } > + } > + if (j != p->nlargeobj || k != nobj - p->nlargeobj) > + return got_error(GOT_ERR_BAD_PACKIDX); > + > + qsort(p->sorted_offsets, nobj - p->nlargeobj, > + sizeof(p->sorted_offsets[0]), offset_cmp); > + > + if (p->sorted_large_offsets) > + qsort(p->sorted_large_offsets, p->nlargeobj, > + sizeof(p->sorted_large_offsets[0]), large_offset_cmp); > + > + return NULL; > +} > + > const struct got_error * > +got_packidx_get_offset_idx(int *idx, struct got_packidx *packidx, off_t offset) > +{ > + const struct got_error *err; > + uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]); > + int i, left, right; > + > + *idx = -1; > + > + if (packidx->sorted_offsets == NULL) { > + err = build_offset_index(packidx); > + if (err) > + return err; > + } > + > + if (offset >= 0x7fffffff) { > + uint64_t lo; > + left = 0, right = packidx->nlargeobj - 1; > + while (left <= right) { > + i = ((left + right) / 2); > + lo = packidx->sorted_large_offsets[i].offset; > + if (lo == offset) { > + *idx = packidx->sorted_large_offsets[i].idx; > + break; > + } else if (offset > lo) > + left = i + 1; > + else if (offset < lo) > + right = i - 1; > + } > + } else { > + uint32_t o; > + left = 0, right = totobj - packidx->nlargeobj - 1; > + while (left <= right) { > + i = ((left + right) / 2); > + o = packidx->sorted_offsets[i].offset; > + if (o == offset) { > + *idx = packidx->sorted_offsets[i].idx; > + break; > + } else if (offset > o) > + left = i + 1; > + else if (offset < o) > + right = i - 1; > + } > + } > + > + return NULL; > +} > + > +const struct got_error * > +got_packidx_get_object_id(struct got_object_id *id, > + struct got_packidx *packidx, int idx) > +{ > + uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]); > + struct 
got_packidx_object_id *oid; > + > + if (idx < 0 || idx >= totobj) > + return got_error(GOT_ERR_NO_OBJ); > + > + oid = &packidx->hdr.sorted_ids[idx]; > + memcpy(id->sha1, oid->sha1, SHA1_DIGEST_LENGTH); > + return NULL; > +} > + > +const struct got_error * > got_packidx_match_id_str_prefix(struct got_object_id_queue *matched_ids, > struct got_packidx *packidx, const char *id_str_prefix) > { > @@ -1452,3 +1602,84 @@ got_packfile_extract_object_to_mem(uint8_t **buf, size > > return err; > } > + > +const struct got_error * > +got_packfile_extract_raw_delta(uint8_t **delta_buf, size_t *delta_size, > + off_t *delta_offset, off_t *base_offset, struct got_object_id *base_id, > + uint64_t *base_size, uint64_t *result_size, struct got_pack *pack, > + struct got_packidx *packidx, int idx) > +{ > + const struct got_error *err = NULL; > + off_t offset; > + uint8_t type; > + uint64_t size; > + size_t tslen, delta_hdrlen; > + > + *delta_buf = NULL; > + *delta_size = 0; > + *delta_offset = 0; > + *base_offset = 0; > + *base_size = 0; > + *result_size = 0; > + > + offset = got_packidx_get_object_offset(packidx, idx); > + if (offset == (uint64_t)-1) > + return got_error(GOT_ERR_BAD_PACKIDX); > + > + if (offset >= pack->filesize) > + return got_error(GOT_ERR_PACK_OFFSET); > + > + err = got_pack_parse_object_type_and_size(&type, &size, &tslen, > + pack, offset); > + if (err) > + return err; > + > + if (tslen + size < tslen || offset + size < size || > + tslen + offset < tslen) > + return got_error(GOT_ERR_PACK_OFFSET); > + > + switch (type) { > + case GOT_OBJ_TYPE_OFFSET_DELTA: > + err = got_pack_parse_offset_delta(base_offset, &delta_hdrlen, > + pack, offset, tslen); > + if (err) > + return err; > + break; > + case GOT_OBJ_TYPE_REF_DELTA: > + err = got_pack_parse_ref_delta(base_id, pack, offset, tslen); > + if (err) > + return err; > + delta_hdrlen = SHA1_DIGEST_LENGTH; > + break; > + default: > + return got_error_fmt(GOT_ERR_OBJ_TYPE, > + "non-delta object type %d found at offset 
%llu", > + type, offset); > + } > + > + if (tslen + delta_hdrlen < delta_hdrlen || > + offset + delta_hdrlen < delta_hdrlen) > + return got_error(GOT_ERR_BAD_DELTA); > + > + err = read_delta_data(delta_buf, delta_size, > + offset + tslen + delta_hdrlen, pack); > + if (err) > + return err; > + > + if (*delta_size != size) { > + err = got_error(GOT_ERR_BAD_DELTA); > + goto done; > + } > + > + err = got_delta_get_sizes(base_size, result_size, *delta_buf, size); > + if (err) > + goto done; > + > + *delta_offset = offset; > +done: > + if (err) { > + free(*delta_buf); > + *delta_buf = NULL; > + } > + return err; > +} > blob - f4c9f4cd1e5f2d7ec2bb640a2745e9ef11b8f29a > blob + 126be35e5fb8688252b462d73222675dfe1e830d > --- lib/pack_create.c > +++ lib/pack_create.c > @@ -74,6 +74,10 @@ struct got_pack_meta { > off_t delta_len; /* encoded delta length */ > int nchain; > > + int have_reused_delta; > + off_t reused_delta_offset; /* offset of delta in reused pack file */ > + struct got_object_id *base_obj_id; > + > /* Only used for delta window */ > struct got_delta_table *dtab; > > @@ -124,6 +128,8 @@ clear_meta(struct got_pack_meta *meta) > meta->path = NULL; > free(meta->delta_buf); > meta->delta_buf = NULL; > + free(meta->base_obj_id); > + meta->base_obj_id = NULL; > } > > static void > @@ -419,12 +425,229 @@ report_progress(got_pack_progress_cb progress_cb, void > } > > static const struct got_error * > -pick_deltas(struct got_pack_meta **meta, int nmeta, int nours, > - FILE *delta_cache, struct got_repository *repo, > +add_meta(struct got_pack_meta *m, struct got_pack_metavec *v) > +{ > + if (v->nmeta == v->metasz){ > + size_t newsize = 2 * v->metasz; > + struct got_pack_meta **new; > + new = reallocarray(v->meta, newsize, sizeof(*new)); > + if (new == NULL) > + return got_error_from_errno("reallocarray"); > + v->meta = new; > + v->metasz = newsize; > + } > + > + v->meta[v->nmeta++] = m; > + return NULL; > +} > + > +static const struct got_error * > +reuse_delta(int idx, 
struct got_pack_meta *m, struct got_pack_metavec *v, > + struct got_object_idset *idset, struct got_pack *pack, > + struct got_packidx *packidx, int delta_cache_fd, > + struct got_repository *repo) > +{ > + const struct got_error *err = NULL; > + struct got_pack_meta *base = NULL; > + struct got_object_id *base_obj_id = NULL; > + off_t delta_len = 0, delta_offset = 0, delta_cache_offset = 0; > + uint64_t base_size, result_size; > + > + if (m->have_reused_delta) > + return NULL; > + > + err = got_object_read_raw_delta(&base_size, &result_size, &delta_len, > + &delta_offset, &delta_cache_offset, &base_obj_id, delta_cache_fd, > + packidx, idx, &m->id, repo); > + if (err) > + return err; > + > + if (delta_offset + delta_len < delta_offset) > + return got_error(GOT_ERR_BAD_PACKFILE); > + > + base = got_object_idset_get(idset, base_obj_id); > + if (base == NULL) > + goto done; > + > + m->delta_len = delta_len; > + m->delta_offset = delta_cache_offset; > + m->prev = base; > + m->size = result_size; > + m->have_reused_delta = 1; > + m->reused_delta_offset = delta_offset; > + m->base_obj_id = base_obj_id; > + base_obj_id = NULL; > + err = add_meta(m, v); > +done: > + free(base_obj_id); > + return err; > +} > + > +static const struct got_error * > +find_pack_for_reuse(struct got_packidx **best_packidx, > + struct got_repository *repo) > +{ > + const struct got_error *err; > + struct got_pathlist_head packidx_paths; > + struct got_pathlist_entry *pe; > + const char *best_packidx_path = NULL; > + int nobj_max = 0; > + > + TAILQ_INIT(&packidx_paths); > + *best_packidx = NULL; > + > + err = got_repo_list_packidx(&packidx_paths, repo); > + if (err) > + return err; > + > + TAILQ_FOREACH(pe, &packidx_paths, entry) { > + const char *path_packidx = pe->path; > + struct got_packidx *packidx; > + int nobj; > + > + err = got_repo_get_packidx(&packidx, path_packidx, repo); > + if (err) > + break; > + > + nobj = be32toh(packidx->hdr.fanout_table[0xff]); > + if (nobj > nobj_max) { > + 
best_packidx_path = path_packidx; > + nobj_max = nobj; > + } > + } > + > + if (best_packidx_path) { > + err = got_repo_get_packidx(best_packidx, best_packidx_path, > + repo); > + } > + > + TAILQ_FOREACH(pe, &packidx_paths, entry) > + free((void *)pe->path); > + got_pathlist_free(&packidx_paths); > + return err; > +} > + > +struct search_deltas_arg { > + struct got_packidx *packidx; > + struct got_pack *pack; > + struct got_object_idset *idset; > + struct got_pack_metavec *v; > + int delta_cache_fd; > + struct got_repository *repo; > + got_pack_progress_cb progress_cb; > + void *progress_arg; > + struct got_ratelimit *rl; > + got_cancel_cb cancel_cb; > + void *cancel_arg; > + int ncommits; > +}; > + > +static const struct got_error * > +search_delta_for_object(struct got_object_id *id, void *data, void *arg) > +{ > + const struct got_error *err; > + struct got_pack_meta *m = data; > + struct search_deltas_arg *a = arg; > + int obj_idx; > + struct got_object *obj = NULL; > + > + if (a->cancel_cb) { > + err = (*a->cancel_cb)(a->cancel_arg); > + if (err) > + return err; > + } > + > + if (!got_repo_check_packidx_bloom_filter(a->repo, > + a->packidx->path_packidx, id)) > + return NULL; > + > + obj_idx = got_packidx_get_object_idx(a->packidx, id); > + if (obj_idx == -1) > + return NULL; > + > + /* TODO: > + * Opening and closing an object just to check its flags > + * is a bit expensive. We could have an imsg which requests > + * plain type/size information for an object without doing > + * work such as traversing the object's entire delta chain > + * to find the base object type, and other such info which > + * we don't really need here. 
> + */ > + err = got_object_open_from_packfile(&obj, &m->id, a->pack, > + a->packidx, obj_idx, a->repo); > + if (err) > + return err; > + > + if (obj->flags & GOT_OBJ_FLAG_DELTIFIED) { > + reuse_delta(obj_idx, m, a->v, a->idset, a->pack, a->packidx, > + a->delta_cache_fd, a->repo); > + if (err) > + goto done; > + err = report_progress(a->progress_cb, a->progress_arg, a->rl, > + 0L, a->ncommits, got_object_idset_num_elements(a->idset), > + a->v->nmeta, 0); > + } > +done: > + got_object_close(obj); > + return err; > +} > + > +static const struct got_error * > +search_deltas(struct got_pack_metavec *v, struct got_object_idset *idset, > + int delta_cache_fd, int ncommits, struct got_repository *repo, > got_pack_progress_cb progress_cb, void *progress_arg, > struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg) > { > const struct got_error *err = NULL; > + char *path_packfile = NULL; > + struct got_packidx *packidx; > + struct got_pack *pack; > + struct search_deltas_arg sda; > + > + err = find_pack_for_reuse(&packidx, repo); > + if (err) > + return err; > + > + if (packidx == NULL) > + return NULL; > + > + err = got_packidx_get_packfile_path(&path_packfile, > + packidx->path_packidx); > + if (err) > + return err; > + > + pack = got_repo_get_cached_pack(repo, path_packfile); > + if (pack == NULL) { > + err = got_repo_cache_pack(&pack, repo, path_packfile, packidx); > + if (err) > + goto done; > + } > + > + sda.packidx = packidx; > + sda.pack = pack; > + sda.idset = idset; > + sda.v = v; > + sda.delta_cache_fd = delta_cache_fd; > + sda.repo = repo; > + sda.progress_cb = progress_cb; > + sda.progress_arg = progress_arg; > + sda.rl = rl; > + sda.cancel_cb = cancel_cb; > + sda.cancel_arg = cancel_arg; > + sda.ncommits = ncommits; > + err = got_object_idset_for_each(idset, search_delta_for_object, &sda); > +done: > + free(path_packfile); > + return err; > +} > + > +static const struct got_error * > +pick_deltas(struct got_pack_meta **meta, int nmeta, int 
ncommits, > + int nreused, FILE *delta_cache, struct got_repository *repo, > + got_pack_progress_cb progress_cb, void *progress_arg, > + struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg) > +{ > + const struct got_error *err = NULL; > struct got_pack_meta *m = NULL, *base = NULL; > struct got_raw_object *raw = NULL, *base_raw = NULL; > struct got_delta_instruction *deltas = NULL, *best_deltas = NULL; > @@ -443,7 +666,7 @@ pick_deltas(struct got_pack_meta **meta, int nmeta, in > break; > } > err = report_progress(progress_cb, progress_arg, rl, > - 0L, nours, nmeta, i, 0); > + 0L, ncommits, nreused + nmeta, nreused + i, 0); > if (err) > goto done; > m = meta[i]; > @@ -492,6 +715,7 @@ pick_deltas(struct got_pack_meta **meta, int nmeta, in > &base->id); > if (err) > goto done; > + > if (raw->f == NULL && base_raw->f == NULL) { > err = got_deltify_mem_mem(&deltas, &ndeltas, > raw->data, raw->hdrlen, > @@ -556,6 +780,15 @@ pick_deltas(struct got_pack_meta **meta, int nmeta, in > best_ndeltas, best_size, m->prev->size); > } else { > m->delta_offset = ftello(delta_cache); > + /* > + * TODO: > + * Storing compressed delta data in the delta > + * cache file would probably be more efficient > + * than writing uncompressed delta data here > + * and compressing it while writing the pack > + * file. This would also allow for reusing > + * deltas in their compressed form. 
> + */ > err = encode_delta(m, raw, best_deltas, > best_ndeltas, m->prev->size, delta_cache); > } > @@ -614,12 +847,12 @@ static const int obj_types[] = { > }; > > static const struct got_error * > -add_meta(struct got_pack_metavec *v, struct got_object_idset *idset, > +add_object(int want_meta, struct got_object_idset *idset, > struct got_object_id *id, const char *path, int obj_type, > time_t mtime, int loose_obj_only, struct got_repository *repo) > { > const struct got_error *err; > - struct got_pack_meta *m; > + struct got_pack_meta *m = NULL; > > if (loose_obj_only) { > int is_packed; > @@ -630,40 +863,17 @@ add_meta(struct got_pack_metavec *v, struct got_object > return NULL; > } > > - err = got_object_idset_add(idset, id, (void *)&obj_types[obj_type]); > - if (err) > - return err; > - > - if (v == NULL) > - return NULL; > - > - err = alloc_meta(&m, id, path, obj_type, mtime); > - if (err) > - goto done; > - > - if (v->nmeta == v->metasz){ > - size_t newsize = 2 * v->metasz; > - struct got_pack_meta **new; > - new = reallocarray(v->meta, newsize, sizeof(*new)); > - if (new == NULL) { > - err = got_error_from_errno("reallocarray"); > - goto done; > - } > - v->meta = new; > - v->metasz = newsize; > + if (want_meta) { > + err = alloc_meta(&m, id, path, obj_type, mtime); > + if (err) > + return err; > } > -done: > - if (err) { > - clear_meta(m); > - free(m); > - } else > - v->meta[v->nmeta++] = m; > > - return err; > + return got_object_idset_add(idset, id, m); > } > > static const struct got_error * > -load_tree_entries(struct got_object_id_queue *ids, struct got_pack_metavec *v, > +load_tree_entries(struct got_object_id_queue *ids, int want_meta, > struct got_object_idset *idset, struct got_object_id *tree_id, > const char *dpath, time_t mtime, struct got_repository *repo, > int loose_obj_only, got_cancel_cb cancel_cb, void *cancel_arg) > @@ -705,8 +915,8 @@ load_tree_entries(struct got_object_id_queue *ids, str > break; > STAILQ_INSERT_TAIL(ids, qid, entry); > 
} else if (S_ISREG(mode) || S_ISLNK(mode)) { > - err = add_meta(v, idset, id, p, GOT_OBJ_TYPE_BLOB, > - mtime, loose_obj_only, repo); > + err = add_object(want_meta, idset, id, p, > + GOT_OBJ_TYPE_BLOB, mtime, loose_obj_only, repo); > if (err) > break; > } > @@ -720,7 +930,7 @@ load_tree_entries(struct got_object_id_queue *ids, str > } > > static const struct got_error * > -load_tree(struct got_pack_metavec *v, struct got_object_idset *idset, > +load_tree(int want_meta, struct got_object_idset *idset, > struct got_object_id *tree_id, const char *dpath, time_t mtime, > int loose_obj_only, struct got_repository *repo, > got_cancel_cb cancel_cb, void *cancel_arg) > @@ -754,15 +964,15 @@ load_tree(struct got_pack_metavec *v, struct got_objec > continue; > } > > - err = add_meta(v, idset, qid->id, dpath, GOT_OBJ_TYPE_TREE, > - mtime, loose_obj_only, repo); > + err = add_object(want_meta, idset, qid->id, dpath, > + GOT_OBJ_TYPE_TREE, mtime, loose_obj_only, repo); > if (err) { > got_object_qid_free(qid); > break; > } > > - err = load_tree_entries(&tree_ids, v, idset, qid->id, dpath, > - mtime, repo, loose_obj_only, cancel_cb, cancel_arg); > + err = load_tree_entries(&tree_ids, want_meta, idset, qid->id, > + dpath, mtime, repo, loose_obj_only, cancel_cb, cancel_arg); > got_object_qid_free(qid); > if (err) > break; > @@ -773,7 +983,7 @@ load_tree(struct got_pack_metavec *v, struct got_objec > } > > static const struct got_error * > -load_commit(struct got_pack_metavec *v, struct got_object_idset *idset, > +load_commit(int want_meta, struct got_object_idset *idset, > struct got_object_id *id, struct got_repository *repo, int loose_obj_only, > got_cancel_cb cancel_cb, void *cancel_arg) > { > @@ -796,13 +1006,13 @@ load_commit(struct got_pack_metavec *v, struct got_obj > if (err) > return err; > > - err = add_meta(v, idset, id, "", GOT_OBJ_TYPE_COMMIT, > + err = add_object(want_meta, idset, id, "", GOT_OBJ_TYPE_COMMIT, > got_object_commit_get_committer_time(commit), > 
loose_obj_only, repo); > if (err) > goto done; > > - err = load_tree(v, idset, got_object_commit_get_tree_id(commit), > + err = load_tree(want_meta, idset, got_object_commit_get_tree_id(commit), > "", got_object_commit_get_committer_time(commit), > loose_obj_only, repo, cancel_cb, cancel_arg); > done: > @@ -811,7 +1021,7 @@ done: > } > > static const struct got_error * > -load_tag(struct got_pack_metavec *v, struct got_object_idset *idset, > +load_tag(int want_meta, struct got_object_idset *idset, > struct got_object_id *id, struct got_repository *repo, int loose_obj_only, > got_cancel_cb cancel_cb, void *cancel_arg) > { > @@ -834,7 +1044,7 @@ load_tag(struct got_pack_metavec *v, struct got_object > if (err) > return err; > > - err = add_meta(v, idset, id, "", GOT_OBJ_TYPE_TAG, > + err = add_object(want_meta, idset, id, "", GOT_OBJ_TYPE_TAG, > got_object_tag_get_tagger_time(tag), > loose_obj_only, repo); > if (err) > @@ -842,13 +1052,14 @@ load_tag(struct got_pack_metavec *v, struct got_object > > switch (got_object_tag_get_object_type(tag)) { > case GOT_OBJ_TYPE_COMMIT: > - err = load_commit(v, idset, > + err = load_commit(want_meta, idset, > got_object_tag_get_object_id(tag), repo, > loose_obj_only, cancel_cb, cancel_arg); > break; > case GOT_OBJ_TYPE_TREE: > - err = load_tree(v, idset, got_object_tag_get_object_id(tag), > - "", got_object_tag_get_tagger_time(tag), > + err = load_tree(want_meta, idset, > + got_object_tag_get_object_id(tag), "", > + got_object_tag_get_tagger_time(tag), > loose_obj_only, repo, cancel_cb, cancel_arg); > break; > default: > @@ -1124,7 +1335,7 @@ done: > } > > static const struct got_error * > -read_meta(struct got_pack_meta ***meta, int *nmeta, > +load_object_ids(struct got_object_idset *idset, > struct got_object_id **theirs, int ntheirs, > struct got_object_id **ours, int nours, struct got_repository *repo, > int loose_obj_only, got_pack_progress_cb progress_cb, void *progress_arg, > @@ -1132,25 +1343,8 @@ read_meta(struct 
got_pack_meta ***meta, int *nmeta, > { > const struct got_error *err = NULL; > struct got_object_id **ids = NULL; > - struct got_object_idset *idset; > int i, nobj = 0, obj_type; > - struct got_pack_metavec v; > > - *meta = NULL; > - *nmeta = 0; > - > - idset = got_object_idset_alloc(); > - if (idset == NULL) > - return got_error_from_errno("got_object_idset_alloc"); > - > - v.nmeta = 0; > - v.metasz = 64; > - v.meta = calloc(v.metasz, sizeof(struct got_pack_meta *)); > - if (v.meta == NULL) { > - err = got_error_from_errno("calloc"); > - goto done; > - } > - > err = findtwixt(&ids, &nobj, ours, nours, theirs, ntheirs, repo, > cancel_cb, cancel_arg); > if (err || nobj == 0) > @@ -1165,79 +1359,81 @@ read_meta(struct got_pack_meta ***meta, int *nmeta, > return err; > if (obj_type != GOT_OBJ_TYPE_COMMIT) > continue; > - err = load_commit(NULL, idset, id, repo, > + err = load_commit(0, idset, id, repo, > loose_obj_only, cancel_cb, cancel_arg); > if (err) > goto done; > err = report_progress(progress_cb, progress_arg, rl, > - 0L, nours, v.nmeta, 0, 0); > + 0L, nours, got_object_idset_num_elements(idset), > + 0, 0); > if (err) > goto done; > } > > for (i = 0; i < ntheirs; i++) { > struct got_object_id *id = theirs[i]; > - int *cached_type; > + struct got_pack_meta *m; > if (id == NULL) > continue; > - cached_type = got_object_idset_get(idset, id); > - if (cached_type == NULL) { > + m = got_object_idset_get(idset, id); > + if (m == NULL) { > err = got_object_get_type(&obj_type, repo, id); > if (err) > goto done; > } else > - obj_type = *cached_type; > + obj_type = m->obj_type; > if (obj_type != GOT_OBJ_TYPE_TAG) > continue; > - err = load_tag(NULL, idset, id, repo, > + err = load_tag(0, idset, id, repo, > loose_obj_only, cancel_cb, cancel_arg); > if (err) > goto done; > err = report_progress(progress_cb, progress_arg, rl, > - 0L, nours, v.nmeta, 0, 0); > + 0L, nours, got_object_idset_num_elements(idset), 0, 0); > if (err) > goto done; > } > > for (i = 0; i < nobj; i++) { 
> - err = load_commit(&v, idset, ids[i], repo, > + err = load_commit(1, idset, ids[i], repo, > loose_obj_only, cancel_cb, cancel_arg); > if (err) > goto done; > if (err) > goto done; > err = report_progress(progress_cb, progress_arg, rl, > - 0L, nours, v.nmeta, 0, 0); > + 0L, nours, got_object_idset_num_elements(idset), 0, 0); > if (err) > goto done; > } > > for (i = 0; i < nours; i++) { > struct got_object_id *id = ours[i]; > - int *cached_type; > + struct got_pack_meta *m; > if (id == NULL) > continue; > - cached_type = got_object_idset_get(idset, id); > - if (cached_type == NULL) { > + m = got_object_idset_get(idset, id); > + if (m == NULL) { > err = got_object_get_type(&obj_type, repo, id); > if (err) > goto done; > } else > - obj_type = *cached_type; > + obj_type = m->obj_type; > if (obj_type != GOT_OBJ_TYPE_TAG) > continue; > - err = load_tag(&v, idset, id, repo, > + err = load_tag(1, idset, id, repo, > loose_obj_only, cancel_cb, cancel_arg); > if (err) > goto done; > err = report_progress(progress_cb, progress_arg, rl, > - 0L, nours, v.nmeta, 0, 0); > + 0L, nours, got_object_idset_num_elements(idset), 0, 0); > if (err) > goto done; > } > > if (progress_cb) { > - err = progress_cb(progress_arg, 0L, nours, v.nmeta, 0, 0); > + err = progress_cb(progress_arg, 0L, nours, > + got_object_idset_num_elements(idset), 0, 0); > if (err) > goto done; > } > @@ -1246,13 +1442,6 @@ done: > free(ids[i]); > } > free(ids); > - got_object_idset_free(idset); > - if (err == NULL) { > - *meta = v.meta; > - *nmeta = v.nmeta; > - } else > - free(v.meta); > - > return err; > } > > @@ -1295,6 +1484,21 @@ write_order_cmp(const void *pa, const void *pb) > return a->mtime - b->mtime; > } > > +static int > +reuse_write_order_cmp(const void *pa, const void *pb) > +{ > + struct got_pack_meta *a, *b; > + > + a = *(struct got_pack_meta **)pa; > + b = *(struct got_pack_meta **)pb; > + > + if (a->reused_delta_offset < b->reused_delta_offset) > + return -1; > + if (a->reused_delta_offset > 
b->reused_delta_offset) > + return 1; > + return 0; > +} > + > static const struct got_error * > packhdr(int *hdrlen, char *hdr, size_t bufsize, int obj_type, size_t len) > { > @@ -1337,13 +1541,13 @@ packoff(char *hdr, off_t off) > > static const struct got_error * > deltahdr(off_t *packfile_size, SHA1_CTX *ctx, FILE *packfile, > - struct got_pack_meta *m, int use_offset_deltas) > + struct got_pack_meta *m) > { > const struct got_error *err; > char buf[32]; > int nh; > > - if (use_offset_deltas && m->prev->off != 0) { > + if (m->prev->off != 0) { > err = packhdr(&nh, buf, sizeof(buf), > GOT_OBJ_TYPE_OFFSET_DELTA, m->delta_len); > if (err) > @@ -1373,27 +1577,104 @@ deltahdr(off_t *packfile_size, SHA1_CTX *ctx, FILE *pa > } > > static const struct got_error * > +write_packed_object(off_t *packfile_size, FILE *packfile, > + FILE *delta_cache, struct got_pack_meta *m, int *outfd, > + SHA1_CTX *ctx, struct got_repository *repo) > +{ > + const struct got_error *err = NULL; > + struct got_deflate_checksum csum; > + char buf[32]; > + int nh; > + struct got_raw_object *raw = NULL; > + off_t outlen; > + > + csum.output_sha1 = ctx; > + csum.output_crc = NULL; > + > + m->off = ftello(packfile); > + if (m->delta_len == 0) { > + err = got_object_raw_open(&raw, outfd, repo, &m->id); > + if (err) > + goto done; > + err = packhdr(&nh, buf, sizeof(buf), > + m->obj_type, raw->size); > + if (err) > + goto done; > + err = hwrite(packfile, buf, nh, ctx); > + if (err) > + goto done; > + *packfile_size += nh; > + if (raw->f == NULL) { > + err = got_deflate_to_file_mmap(&outlen, > + raw->data + raw->hdrlen, 0, raw->size, > + packfile, &csum); > + if (err) > + goto done; > + } else { > + if (fseeko(raw->f, raw->hdrlen, SEEK_SET) > + == -1) { > + err = got_error_from_errno("fseeko"); > + goto done; > + } > + err = got_deflate_to_file(&outlen, raw->f, > + raw->size, packfile, &csum); > + if (err) > + goto done; > + } > + *packfile_size += outlen; > + got_object_raw_close(raw); > + raw = 
NULL; > + } else if (m->delta_buf) { > + err = deltahdr(packfile_size, ctx, packfile, m); > + if (err) > + goto done; > + err = got_deflate_to_file_mmap(&outlen, > + m->delta_buf, 0, m->delta_len, packfile, &csum); > + if (err) > + goto done; > + *packfile_size += outlen; > + free(m->delta_buf); > + m->delta_buf = NULL; > + } else { > + if (fseeko(delta_cache, m->delta_offset, SEEK_SET) > + == -1) { > + err = got_error_from_errno("fseeko"); > + goto done; > + } > + err = deltahdr(packfile_size, ctx, packfile, m); > + if (err) > + goto done; > + err = got_deflate_to_file(&outlen, delta_cache, > + m->delta_len, packfile, &csum); > + if (err) > + goto done; > + *packfile_size += outlen; > + } > +done: > + if (raw) > + got_object_raw_close(raw); > + return err; > +} > + > +static const struct got_error * > genpack(uint8_t *pack_sha1, FILE *packfile, FILE *delta_cache, > - struct got_pack_meta **meta, int nmeta, int nours, > - int use_offset_deltas, struct got_repository *repo, > + struct got_pack_meta **deltify, int ndeltify, > + struct got_pack_meta **reuse, int nreuse, > + int nours, struct got_repository *repo, > got_pack_progress_cb progress_cb, void *progress_arg, > struct got_ratelimit *rl, > got_cancel_cb cancel_cb, void *cancel_arg) > { > const struct got_error *err = NULL; > - int i, nh; > + int i; > SHA1_CTX ctx; > struct got_pack_meta *m; > - struct got_raw_object *raw = NULL; > char buf[32]; > size_t n; > - struct got_deflate_checksum csum; > - off_t outlen, packfile_size = 0; > + off_t packfile_size = 0; > int outfd = -1; > > SHA1Init(&ctx); > - csum.output_sha1 = &ctx; > - csum.output_crc = NULL; > > err = hwrite(packfile, "PACK", 4, &ctx); > if (err) > @@ -1402,79 +1683,41 @@ genpack(uint8_t *pack_sha1, FILE *packfile, FILE *delt > err = hwrite(packfile, buf, 4, &ctx); > if (err) > goto done; > - putbe32(buf, nmeta); > + putbe32(buf, ndeltify + nreuse); > err = hwrite(packfile, buf, 4, &ctx); > if (err) > goto done; > - qsort(meta, nmeta, sizeof(struct 
got_pack_meta *), write_order_cmp); > - for (i = 0; i < nmeta; i++) { > + > + qsort(deltify, ndeltify, sizeof(struct got_pack_meta *), > + write_order_cmp); > + for (i = 0; i < ndeltify; i++) { > err = report_progress(progress_cb, progress_arg, rl, > - packfile_size, nours, nmeta, nmeta, i); > + packfile_size, nours, ndeltify + nreuse, > + ndeltify + nreuse, i); > if (err) > goto done; > - m = meta[i]; > - m->off = ftello(packfile); > - if (m->delta_len == 0) { > - err = got_object_raw_open(&raw, &outfd, repo, &m->id); > - if (err) > - goto done; > - err = packhdr(&nh, buf, sizeof(buf), > - m->obj_type, raw->size); > - if (err) > - goto done; > - err = hwrite(packfile, buf, nh, &ctx); > - if (err) > - goto done; > - packfile_size += nh; > - if (raw->f == NULL) { > - err = got_deflate_to_file_mmap(&outlen, > - raw->data + raw->hdrlen, 0, raw->size, > - packfile, &csum); > - if (err) > - goto done; > - } else { > - if (fseeko(raw->f, raw->hdrlen, SEEK_SET) > - == -1) { > - err = got_error_from_errno("fseeko"); > - goto done; > - } > - err = got_deflate_to_file(&outlen, raw->f, > - raw->size, packfile, &csum); > - if (err) > - goto done; > - } > - packfile_size += outlen; > - got_object_raw_close(raw); > - raw = NULL; > - } else if (m->delta_buf) { > - err = deltahdr(&packfile_size, &ctx, packfile, > - m, use_offset_deltas); > - if (err) > - goto done; > - err = got_deflate_to_file_mmap(&outlen, > - m->delta_buf, 0, m->delta_len, packfile, &csum); > - if (err) > - goto done; > - packfile_size += outlen; > - free(m->delta_buf); > - m->delta_buf = NULL; > - } else { > - if (fseeko(delta_cache, m->delta_offset, SEEK_SET) > - == -1) { > - err = got_error_from_errno("fseeko"); > - goto done; > - } > - err = deltahdr(&packfile_size, &ctx, packfile, > - m, use_offset_deltas); > - if (err) > - goto done; > - err = got_deflate_to_file(&outlen, delta_cache, > - m->delta_len, packfile, &csum); > - if (err) > - goto done; > - packfile_size += outlen; > - } > + m = deltify[i]; > + 
err = write_packed_object(&packfile_size, packfile, > + delta_cache, m, &outfd, &ctx, repo); > + if (err) > + goto done; > } > + > + qsort(reuse, nreuse, sizeof(struct got_pack_meta *), > + reuse_write_order_cmp); > + for (i = 0; i < nreuse; i++) { > + err = report_progress(progress_cb, progress_arg, rl, > + packfile_size, nours, ndeltify + nreuse, > + ndeltify + nreuse, ndeltify + i); > + if (err) > + goto done; > + m = reuse[i]; > + err = write_packed_object(&packfile_size, packfile, > + delta_cache, m, &outfd, &ctx, repo); > + if (err) > + goto done; > + } > + > SHA1Final(pack_sha1, &ctx); > n = fwrite(pack_sha1, 1, SHA1_DIGEST_LENGTH, packfile); > if (n != SHA1_DIGEST_LENGTH) > @@ -1483,18 +1726,50 @@ genpack(uint8_t *pack_sha1, FILE *packfile, FILE *delt > packfile_size += sizeof(struct got_packfile_hdr); > if (progress_cb) { > err = progress_cb(progress_arg, packfile_size, nours, > - nmeta, nmeta, nmeta); > + ndeltify + nreuse, ndeltify + nreuse, > + ndeltify + nreuse); > if (err) > goto done; > } > done: > - if (raw) > - got_object_raw_close(raw); > if (outfd != -1 && close(outfd) == -1 && err == NULL) > err = got_error_from_errno("close"); > return err; > } > > +static const struct got_error * > +remove_unused_object(struct got_object_idset_element *entry, void *arg) > +{ > + struct got_object_idset *idset = arg; > + > + if (got_object_idset_get_element_data(entry) == NULL) > + got_object_idset_remove_element(idset, entry); > + > + return NULL; > +} > + > +static const struct got_error * > +remove_reused_object(struct got_object_idset_element *entry, void *arg) > +{ > + struct got_object_idset *idset = arg; > + struct got_pack_meta *m; > + > + m = got_object_idset_get_element_data(entry); > + if (m->have_reused_delta) > + got_object_idset_remove_element(idset, entry); > + > + return NULL; > +} > + > +static const struct got_error * > +add_meta_idset_cb(struct got_object_id *id, void *data, void *arg) > +{ > + struct got_pack_meta *m = data; > + struct 
got_pack_metavec *v = arg; > + > + return add_meta(m, v); > +} > + > const struct got_error * > got_pack_create(uint8_t *packsha1, FILE *packfile, > struct got_object_id **theirs, int ntheirs, > @@ -1504,32 +1779,88 @@ got_pack_create(uint8_t *packsha1, FILE *packfile, > got_cancel_cb cancel_cb, void *cancel_arg) > { > const struct got_error *err; > - struct got_pack_meta **meta; > - int nmeta; > + int delta_cache_fd = -1; > FILE *delta_cache = NULL; > + struct got_object_idset *idset; > struct got_ratelimit rl; > + struct got_pack_metavec deltify, reuse; > > + memset(&deltify, 0, sizeof(deltify)); > + memset(&reuse, 0, sizeof(reuse)); > + > got_ratelimit_init(&rl, 0, 500); > > - err = read_meta(&meta, &nmeta, theirs, ntheirs, ours, nours, repo, > - loose_obj_only, progress_cb, progress_arg, &rl, > + idset = got_object_idset_alloc(); > + if (idset == NULL) > + return got_error_from_errno("got_object_idset_alloc"); > + > + err = load_object_ids(idset, theirs, ntheirs, ours, nours, > + repo, loose_obj_only, progress_cb, progress_arg, &rl, > cancel_cb, cancel_arg); > if (err) > return err; > > - if (nmeta == 0 && !allow_empty) { > + err = got_object_idset_for_each_element(idset, > + remove_unused_object, idset); > + if (err) > + goto done; > + > + if (got_object_idset_num_elements(idset) == 0 && !allow_empty) { > err = got_error(GOT_ERR_CANNOT_PACK); > goto done; > } > > - delta_cache = got_opentemp(); > - if (delta_cache == NULL) { > + delta_cache_fd = got_opentempfd(); > + if (delta_cache_fd == -1) { > err = got_error_from_errno("got_opentemp"); > goto done; > } > > - if (nmeta > 0) { > - err = pick_deltas(meta, nmeta, nours, delta_cache, repo, > + reuse.metasz = 64; > + reuse.meta = calloc(reuse.metasz, > + sizeof(struct got_pack_meta *)); > + if (reuse.meta == NULL) { > + err = got_error_from_errno("calloc"); > + goto done; > + } > + > + err = search_deltas(&reuse, idset, delta_cache_fd, nours, repo, > + progress_cb, progress_arg, &rl, cancel_cb, cancel_arg); > + 
if (err) > + goto done; > + if (reuse.nmeta > 0) { > + err = got_object_idset_for_each_element(idset, > + remove_reused_object, idset); > + if (err) > + goto done; > + } > + > + delta_cache = fdopen(delta_cache_fd, "a+"); > + if (delta_cache == NULL) { > + err = got_error_from_errno("fdopen"); > + goto done; > + } > + delta_cache_fd = -1; > + > + if (fseeko(delta_cache, 0L, SEEK_END) == -1) { > + err = got_error_from_errno("fseeko"); > + goto done; > + } > + > + deltify.meta = calloc(got_object_idset_num_elements(idset), > + sizeof(struct got_pack_meta *)); > + if (deltify.meta == NULL) { > + err = got_error_from_errno("calloc"); > + goto done; > + } > + deltify.metasz = got_object_idset_num_elements(idset); > + > + err = got_object_idset_for_each(idset, add_meta_idset_cb, &deltify); > + if (err) > + goto done; > + if (deltify.nmeta > 0) { > + err = pick_deltas(deltify.meta, deltify.nmeta, nours, > + reuse.nmeta, delta_cache, repo, > progress_cb, progress_arg, &rl, cancel_cb, cancel_arg); > if (err) > goto done; > @@ -1539,12 +1870,17 @@ got_pack_create(uint8_t *packsha1, FILE *packfile, > } > } > > - err = genpack(packsha1, packfile, delta_cache, meta, nmeta, nours, 1, > - repo, progress_cb, progress_arg, &rl, cancel_cb, cancel_arg); > + err = genpack(packsha1, packfile, delta_cache, deltify.meta, > + deltify.nmeta, reuse.meta, reuse.nmeta, nours, repo, > + progress_cb, progress_arg, &rl, cancel_cb, cancel_arg); > if (err) > goto done; > done: > - free_nmeta(meta, nmeta); > + free_nmeta(deltify.meta, deltify.nmeta); > + free_nmeta(reuse.meta, reuse.nmeta); > + got_object_idset_free(idset); > + if (delta_cache_fd != -1 && close(delta_cache_fd) == -1 && err == NULL) > + err = got_error_from_errno("close"); > if (delta_cache && fclose(delta_cache) == EOF && err == NULL) > err = got_error_from_errno("fclose"); > return err; > blob - 9efd7ae9b91bb9c96b3d3a23e15d29ff1e58f89f > blob + 48e50b65411ba0b6ffccd2853fe7f30044f9f398 > --- lib/privsep.c > +++ lib/privsep.c > @@ 
-2728,6 +2728,107 @@ got_privsep_recv_traversed_commits(struct got_commit_o > } > > const struct got_error * > +got_privsep_send_raw_delta_req(struct imsgbuf *ibuf, int idx, > + struct got_object_id *id) > +{ > + struct got_imsg_raw_delta_request dreq; > + > + dreq.idx = idx; > + memcpy(dreq.id, id->sha1, SHA1_DIGEST_LENGTH); > + > + if (imsg_compose(ibuf, GOT_IMSG_RAW_DELTA_REQUEST, 0, 0, -1, > + &dreq, sizeof(dreq)) == -1) > + return got_error_from_errno("imsg_compose RAW_DELTA_REQUEST"); > + > + return flush_imsg(ibuf); > +} > + > +const struct got_error * > +got_privsep_send_raw_delta_outfd(struct imsgbuf *ibuf, int fd) > +{ > + return send_fd(ibuf, GOT_IMSG_RAW_DELTA_OUTFD, fd); > +} > + > +const struct got_error * > +got_privsep_send_raw_delta(struct imsgbuf *ibuf, uint64_t base_size, > + uint64_t result_size, off_t delta_size, off_t delta_offset, > + off_t delta_out_offset, struct got_object_id *base_id) > +{ > + struct got_imsg_raw_delta idelta; > + int ret; > + > + idelta.base_size = base_size; > + idelta.result_size = result_size; > + idelta.delta_size = delta_size; > + idelta.delta_offset = delta_offset; > + idelta.delta_out_offset = delta_out_offset; > + memcpy(idelta.base_id, base_id->sha1, SHA1_DIGEST_LENGTH); > + > + ret = imsg_compose(ibuf, GOT_IMSG_RAW_DELTA, 0, 0, -1, > + &idelta, sizeof(idelta)); > + if (ret == -1) > + return got_error_from_errno("imsg_compose RAW_DELTA"); > + > + return flush_imsg(ibuf); > +} > + > +const struct got_error * > +got_privsep_recv_raw_delta(uint64_t *base_size, uint64_t *result_size, > + off_t *delta_size, off_t *delta_offset, off_t *delta_out_offset, > + struct got_object_id **base_id, struct imsgbuf *ibuf) > +{ > + const struct got_error *err = NULL; > + struct imsg imsg; > + struct got_imsg_raw_delta *delta; > + size_t datalen; > + > + *base_size = 0; > + *result_size = 0; > + *delta_size = 0; > + *delta_offset = 0; > + *delta_out_offset = 0; > + *base_id = NULL; > + > + err = got_privsep_recv_imsg(&imsg, ibuf, 
0); > + if (err) > + return err; > + > + datalen = imsg.hdr.len - IMSG_HEADER_SIZE; > + > + switch (imsg.hdr.type) { > + case GOT_IMSG_RAW_DELTA: > + if (datalen != sizeof(*delta)) { > + err = got_error(GOT_ERR_PRIVSEP_LEN); > + break; > + } > + delta = imsg.data; > + *base_size = delta->base_size; > + *result_size = delta->result_size; > + *delta_size = delta->delta_size; > + *delta_offset = delta->delta_offset; > + *delta_out_offset = delta->delta_out_offset; > + *base_id = calloc(1, sizeof(**base_id)); > + if (*base_id == NULL) { > + err = got_error_from_errno("malloc"); > + break; > + } > + memcpy((*base_id)->sha1, delta->base_id, SHA1_DIGEST_LENGTH); > + break; > + default: > + err = got_error(GOT_ERR_PRIVSEP_MSG); > + break; > + } > + > + imsg_free(&imsg); > + > + if (err) { > + free(*base_id); > + *base_id = NULL; > + } > + return err; > +} > + > +const struct got_error * > got_privsep_unveil_exec_helpers(void) > { > const char *helpers[] = { > blob - 40f5562db7ad3596203bf083a5084f34f1eb1b05 > blob + 255d035fd6d1b3bd7f0720a14a8463799f261c61 > --- lib/repository.c > +++ lib/repository.c > @@ -1020,8 +1020,8 @@ get_packidx_bloom_filter(struct got_repository *repo, > &repo->packidx_bloom_filters, &key); > } > > -static int > -check_packidx_bloom_filter(struct got_repository *repo, > +int > +got_repo_check_packidx_bloom_filter(struct got_repository *repo, > const char *path_packidx, struct got_object_id *id) > { > struct got_packidx_bloom_filter *bf; > @@ -1103,7 +1103,7 @@ got_repo_search_packidx(struct got_packidx **packidx, > for (i = 0; i < repo->pack_cache_size; i++) { > if (repo->packidx_cache[i] == NULL) > break; > - if (!check_packidx_bloom_filter(repo, > + if (!got_repo_check_packidx_bloom_filter(repo, > repo->packidx_cache[i]->path_packidx, id)) > continue; /* object will not be found in this index */ > *idx = got_packidx_get_object_idx(repo->packidx_cache[i], id); > @@ -1154,7 +1154,8 @@ got_repo_search_packidx(struct got_packidx **packidx, > goto 
done; > } > > - if (!check_packidx_bloom_filter(repo, path_packidx, id)) { > + if (!got_repo_check_packidx_bloom_filter(repo, > + path_packidx, id)) { > free(path_packidx); > continue; /* object will not be found in this index */ > } > @@ -1205,6 +1206,92 @@ done: > return err; > } > > +const struct got_error * > +got_repo_list_packidx(struct got_pathlist_head *packidx_paths, > + struct got_repository *repo) > +{ > + const struct got_error *err = NULL; > + DIR *packdir = NULL; > + struct dirent *dent; > + char *path_packidx = NULL; > + int packdir_fd; > + > + packdir_fd = openat(got_repo_get_fd(repo), > + GOT_OBJECTS_PACK_DIR, O_DIRECTORY | O_CLOEXEC); > + if (packdir_fd == -1) { > + return got_error_from_errno_fmt("openat: %s/%s", > + got_repo_get_path_git_dir(repo), > + GOT_OBJECTS_PACK_DIR); > + } > + > + packdir = fdopendir(packdir_fd); > + if (packdir == NULL) { > + err = got_error_from_errno("fdopendir"); > + goto done; > + } > + > + while ((dent = readdir(packdir)) != NULL) { > + if (!got_repo_is_packidx_filename(dent->d_name, dent->d_namlen)) > + continue; > + > + if (asprintf(&path_packidx, "%s/%s", GOT_OBJECTS_PACK_DIR, > + dent->d_name) == -1) { > + err = got_error_from_errno("asprintf"); > + path_packidx = NULL; > + break; > + } > + > + err = got_pathlist_append(packidx_paths, path_packidx, NULL); > + if (err) > + break; > + } > +done: > + if (err) > + free(path_packidx); > + if (packdir && closedir(packdir) != 0 && err == NULL) > + err = got_error_from_errno("closedir"); > + return err; > +} > + > +const struct got_error * > +got_repo_get_packidx(struct got_packidx **packidx, const char *path_packidx, > + struct got_repository *repo) > +{ > + const struct got_error *err; > + size_t i; > + > + *packidx = NULL; > + > + /* Search pack index cache. 
*/ > + for (i = 0; i < repo->pack_cache_size; i++) { > + if (repo->packidx_cache[i] == NULL) > + break; > + if (strcmp(repo->packidx_cache[i]->path_packidx, > + path_packidx) == 0) { > + *packidx = repo->packidx_cache[i]; > + return NULL; > + } > + } > + /* No luck. Search the filesystem. */ > + > + err = got_packidx_open(packidx, got_repo_get_fd(repo), > + path_packidx, 0); > + if (err) > + return err; > + > + err = add_packidx_bloom_filter(repo, *packidx, path_packidx); > + if (err) > + goto done; > + > + err = cache_packidx(repo, *packidx, path_packidx); > +done: > + if (err) { > + got_packidx_close(*packidx); > + *packidx = NULL; > + } > + return err; > +} > + > static const struct got_error * > read_packfile_hdr(int fd, struct got_packidx *packidx) > { > blob - f9be16f797d60019bfb883aac0e9b41c7beb846a > blob + 422da924a77fc3864bcdfbc5e266f568fab8cef3 > --- lib/repository_admin.c > +++ lib/repository_admin.c > @@ -140,7 +140,8 @@ const struct got_error * > got_repo_pack_objects(FILE **packfile, struct got_object_id **pack_hash, > struct got_reflist_head *include_refs, > struct got_reflist_head *exclude_refs, struct got_repository *repo, > - int loose_obj_only, got_pack_progress_cb progress_cb, void *progress_arg, > + int loose_obj_only, > + got_pack_progress_cb progress_cb, void *progress_arg, > got_cancel_cb cancel_cb, void *cancel_arg) > { > const struct got_error *err = NULL; > blob - 1ea0d617c52732faf35724fe601e28a24b03992f > blob + a14d051c87c9e8f5b75dd5f4bcf67486dfde5f4c > --- libexec/got-read-pack/got-read-pack.c > +++ libexec/got-read-pack/got-read-pack.c > @@ -289,11 +289,10 @@ done: > } > > static const struct got_error * > -receive_tempfile(FILE **basefile, FILE **accumfile, struct imsg *imsg, > +receive_tempfile(FILE **f, const char *mode, struct imsg *imsg, > struct imsgbuf *ibuf) > { > size_t datalen; > - FILE **f; > > datalen = imsg->hdr.len - IMSG_HEADER_SIZE; > if (datalen != 0) > @@ -302,14 +301,7 @@ receive_tempfile(FILE **basefile, FILE 
**accumfile, st > if (imsg->fd == -1) > return got_error(GOT_ERR_PRIVSEP_NO_FD); > > - if (*basefile == NULL) > - f = basefile; > - else if (*accumfile == NULL) > - f = accumfile; > - else > - return got_error(GOT_ERR_PRIVSEP_MSG); > - > - *f = fdopen(imsg->fd, "w+"); > + *f = fdopen(imsg->fd, mode); > if (*f == NULL) > return got_error_from_errno("fdopen"); > imsg->fd = -1; > @@ -854,9 +846,80 @@ done: > return err; > } > > +static const struct got_error * > +get_base_object_id(struct got_object_id *base_id, struct got_packidx *packidx, > + off_t base_offset) > +{ > + const struct got_error *err; > + int idx; > > + err = got_packidx_get_offset_idx(&idx, packidx, base_offset); > + if (err) > + return err; > + if (idx == -1) > + return got_error(GOT_ERR_BAD_PACKIDX); > > + return got_packidx_get_object_id(base_id, packidx, idx); > +} > + > static const struct got_error * > +raw_delta_request(struct imsg *imsg, struct imsgbuf *ibuf, > + FILE *delta_outfile, struct got_pack *pack, > + struct got_packidx *packidx) > +{ > + const struct got_error *err = NULL; > + struct got_imsg_raw_delta_request req; > + size_t datalen, delta_size; > + off_t delta_offset; > + uint8_t *delta_buf = NULL; > + struct got_object_id id, base_id; > + off_t base_offset, delta_out_offset = 0; > + uint64_t base_size = 0, result_size = 0; > + size_t w; > + > + datalen = imsg->hdr.len - IMSG_HEADER_SIZE; > + if (datalen != sizeof(req)) > + return got_error(GOT_ERR_PRIVSEP_LEN); > + memcpy(&req, imsg->data, sizeof(req)); > + memcpy(id.sha1, req.id, SHA1_DIGEST_LENGTH); > + > + imsg->fd = -1; > + > + err = got_packfile_extract_raw_delta(&delta_buf, &delta_size, > + &delta_offset, &base_offset, &base_id, &base_size, &result_size, > + pack, packidx, req.idx); > + if (err) > + goto done; > + > + /* > + * If this is an offset delta we must determine the base > + * object ID ourselves. 
> + */ > + if (base_offset != 0) { > + err = get_base_object_id(&base_id, packidx, base_offset); > + if (err) > + goto done; > + } > + > + delta_out_offset = ftello(delta_outfile); > + w = fwrite(delta_buf, 1, delta_size, delta_outfile); > + if (w != delta_size) { > + err = got_ferror(delta_outfile, GOT_ERR_IO); > + goto done; > + } > + if (fflush(delta_outfile) == -1) { > + err = got_error_from_errno("fflush"); > + goto done; > + } > + > + err = got_privsep_send_raw_delta(ibuf, base_size, result_size, > + delta_size, delta_offset, delta_out_offset, &base_id); > +done: > + free(delta_buf); > + return err; > +} > + > +static const struct got_error * > receive_packidx(struct got_packidx **packidx, struct imsgbuf *ibuf) > { > const struct got_error *err = NULL; > @@ -1009,7 +1072,7 @@ main(int argc, char *argv[]) > struct got_packidx *packidx = NULL; > struct got_pack *pack = NULL; > struct got_object_cache objcache; > - FILE *basefile = NULL, *accumfile = NULL; > + FILE *basefile = NULL, *accumfile = NULL, *delta_outfile = NULL; > > //static int attached; > //while (!attached) sleep(1); > @@ -1066,8 +1129,14 @@ main(int argc, char *argv[]) > > switch (imsg.hdr.type) { > case GOT_IMSG_TMPFD: > - err = receive_tempfile(&basefile, &accumfile, > - &imsg, &ibuf); > + if (basefile == NULL) { > + err = receive_tempfile(&basefile, "w+", > + &imsg, &ibuf); > + } else if (accumfile == NULL) { > + err = receive_tempfile(&accumfile, "w+", > + &imsg, &ibuf); > + } else > + err = got_error(GOT_ERR_PRIVSEP_MSG); > break; > case GOT_IMSG_PACKED_OBJECT_REQUEST: > err = object_request(&imsg, &ibuf, pack, packidx, > @@ -1081,6 +1150,22 @@ main(int argc, char *argv[]) > err = raw_object_request(&imsg, &ibuf, pack, packidx, > &objcache, basefile, accumfile); > break; > + case GOT_IMSG_RAW_DELTA_OUTFD: > + if (delta_outfile != NULL) { > + err = got_error(GOT_ERR_PRIVSEP_MSG); > + break; > + } > + err = receive_tempfile(&delta_outfile, "w", > + &imsg, &ibuf); > + break; > + case 
GOT_IMSG_RAW_DELTA_REQUEST: > + if (delta_outfile == NULL) { > + err = got_error(GOT_ERR_PRIVSEP_NO_FD); > + break; > + } > + err = raw_delta_request(&imsg, &ibuf, delta_outfile, > + pack, packidx); > + break; > case GOT_IMSG_COMMIT_REQUEST: > err = commit_request(&imsg, &ibuf, pack, packidx, > &objcache); > @@ -1127,6 +1212,8 @@ main(int argc, char *argv[]) > err = got_error_from_errno("fclose"); > if (accumfile && fclose(accumfile) == EOF && err == NULL) > err = got_error_from_errno("fclose"); > + if (delta_outfile && fclose(delta_outfile) == EOF && err == NULL) > + err = got_error_from_errno("fclose"); > if (err) { > if (!sigint_received && err->code != GOT_ERR_PRIVSEP_PIPE) { > fprintf(stderr, "%s: %s\n", getprogname(), err->msg); > >