Download raw body.
fix path meta-data used for packing
This patch isolates and refines a fix also contained in my got-read-pack object enumeration patch. When packing large repositories I have seen gotadmin pack -a run out of memory during the deltification stage. This only happened with the got-read-pack object enumeration patch applied, which didn't make much sense at first. There are 3 problems involved, and the patch below contains fixes for all of them. The first problem fixed by the patch below is that the paths we store in struct got_pack_meta are incorrect (to see why, just print out paths we pass to alloc_meta(), they are obviously wrong). This problem is fixed by making load_tree_entries() store the correct path in qid->data for each subtree we are going to traverse in load_tree(). This leads to a second problem: Storing copies of full paths on meta data entries consumes quite a bit of memory when packing a large repo. To work around this, we can store a hash of the path in got_pack_meta, rather than a full path (Git does something similar here). We want to sort files which share the same path next to each other during deltification, and a hash of the path is good enough for this purpose. I chose murmurhash2() for now because it only needs a uint32_t. The hash function be changed later if needed. With the order of files fixed during deltification, we suddenly produce much better deltas. A full pack file containing all objects of my test repo shrank from 2GB to 1.5GB because of this fix. However, it is a bit slower and drives up memory use during deltification, which is problem number 3: The same out-of-memory problem I saw while testing my patch for got-read-pack. To avoid running out of memory we can reduce max_delta_memsize in pick_deltas(); the current setting allows up to 200MB of deltas to be stored in RAM, and I am reducing this to 32MB. With this, gotadmin pack -a remains under the default data size limit on amd64 (1.5GB) on my test repo (OpenBSD src.git), and can successfully produce a pack file without bumping ulimit -d. ok? diff refs/heads/main refs/heads/meta-path-fix blob - 5415448aa86fb16df21cd1e028be51fecc89f699 blob + 1811adf838ff6ee401fc281c3a5fc22c16e23b12 --- lib/pack_create.c +++ lib/pack_create.c @@ -56,6 +56,8 @@ #include "got_lib_ratelimit.h" #include "got_lib_inflate.h" +#include "murmurhash2.h" + #ifndef MIN #define MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) #endif @@ -70,7 +72,7 @@ struct got_pack_meta { struct got_object_id id; - char *path; + uint32_t path_hash; int obj_type; off_t size; time_t mtime; @@ -105,7 +107,6 @@ static const struct got_error * alloc_meta(struct got_pack_meta **new, struct got_object_id *id, const char *path, int obj_type, time_t mtime) { - const struct got_error *err = NULL; struct got_pack_meta *m; *new = NULL; @@ -116,13 +117,7 @@ alloc_meta(struct got_pack_meta **new, struct got_obje memcpy(&m->id, id, sizeof(m->id)); - m->path = strdup(path); - if (m->path == NULL) { - err = got_error_from_errno("strdup"); - free(m); - return err; - } - + m->path_hash = murmurhash2(path, strlen(path), 0xd70af26a); m->obj_type = obj_type; m->mtime = mtime; *new = m; @@ -134,8 +129,7 @@ clear_meta(struct got_pack_meta *meta) { if (meta == NULL) return; - free(meta->path); - meta->path = NULL; + meta->path_hash = 0; free(meta->delta_buf); meta->delta_buf = NULL; free(meta->base_obj_id); @@ -156,16 +150,16 @@ static int delta_order_cmp(const void *pa, const void *pb) { struct got_pack_meta *a, *b; - int cmp; a = *(struct got_pack_meta **)pa; b = *(struct got_pack_meta **)pb; if (a->obj_type != b->obj_type) return a->obj_type - b->obj_type; - cmp = strcmp(a->path, b->path); - if (cmp != 0) - return cmp; + if (a->path_hash < b->path_hash) + return -1; + if (a->path_hash > b->path_hash) + return 1; if (a->mtime < b->mtime) return -1; if (a->mtime > b->mtime) @@ -687,7 +681,7 @@ pick_deltas(struct got_pack_meta **meta, int nmeta, in off_t size, best_size; const int max_base_candidates = 3; size_t delta_memsize = 0; - const size_t max_delta_memsize = 25 * GOT_DELTA_RESULT_SIZE_CACHED_MAX; + const size_t max_delta_memsize = 4 * GOT_DELTA_RESULT_SIZE_CACHED_MAX; int outfd = -1; qsort(meta, nmeta, sizeof(struct got_pack_meta *), delta_order_cmp); @@ -954,6 +948,8 @@ load_tree_entries(struct got_object_id_queue *ids, int err = got_object_qid_alloc(&qid, id); if (err) break; + qid->data = p; + p = NULL; STAILQ_INSERT_TAIL(ids, qid, entry); } else if (S_ISREG(mode) || S_ISLNK(mode)) { err = add_object(want_meta, @@ -963,9 +959,12 @@ load_tree_entries(struct got_object_id_queue *ids, int progress_cb, progress_arg, rl); if (err) break; + free(p); + p = NULL; + } else { + free(p); + p = NULL; } - free(p); - p = NULL; } got_object_tree_close(tree); @@ -993,11 +992,18 @@ load_tree(int want_meta, struct got_object_idset *idse err = got_object_qid_alloc(&qid, tree_id); if (err) return err; + qid->data = strdup(dpath); + if (qid->data == NULL) { + err = got_error_from_errno("strdup"); + free(qid); + return err; + } STAILQ_INIT(&tree_ids); STAILQ_INSERT_TAIL(&tree_ids, qid, entry); while (!STAILQ_EMPTY(&tree_ids)) { + const char *path; if (cancel_cb) { err = (*cancel_cb)(cancel_arg); if (err) @@ -1006,32 +1012,38 @@ load_tree(int want_meta, struct got_object_idset *idse qid = STAILQ_FIRST(&tree_ids); STAILQ_REMOVE_HEAD(&tree_ids, entry); + path = qid->data; if (got_object_idset_contains(idset, &qid->id) || got_object_idset_contains(idset_exclude, &qid->id)) { + free(qid->data); got_object_qid_free(qid); continue; } err = add_object(want_meta, want_meta ? idset : idset_exclude, - &qid->id, dpath, GOT_OBJ_TYPE_TREE, + &qid->id, path, GOT_OBJ_TYPE_TREE, mtime, loose_obj_only, repo, ncolored, nfound, ntrees, progress_cb, progress_arg, rl); if (err) { + free(qid->data); got_object_qid_free(qid); break; } err = load_tree_entries(&tree_ids, want_meta, idset, idset_exclude, &qid->id, - dpath, mtime, repo, loose_obj_only, ncolored, nfound, + path, mtime, repo, loose_obj_only, ncolored, nfound, ntrees, progress_cb, progress_arg, rl, cancel_cb, cancel_arg); + free(qid->data); got_object_qid_free(qid); if (err) break; } + STAILQ_FOREACH(qid, &tree_ids, entry) + free(qid->data); got_object_id_queue_free(&tree_ids); return err; }
fix path meta-data used for packing