Download raw body.
fix path meta-data used for packing
This patch isolates and refines a fix also contained in my
got-read-pack object enumeration patch.
When packing large repositories I have seen gotadmin pack -a run
out of memory during the deltification stage. This only happened
with the got-read-pack object enumeration patch applied, which
didn't make much sense at first. There are 3 problems involved,
and the patch below contains fixes for all of them.
The first problem fixed by the patch below is that the paths we store
in struct got_pack_meta are incorrect (to see why, just print out paths
we pass to alloc_meta(), they are obviously wrong). This problem is
fixed by making load_tree_entries() store the correct path in qid->data
for each subtree we are going to traverse in load_tree().
This leads to a second problem: Storing copies of full paths on meta
data entries consumes quite a bit of memory when packing a large repo.
To work around this, we can store a hash of the path in got_pack_meta,
rather than a full path (Git does something similar here). We want to sort
files which share the same path next to each other during deltification,
and a hash of the path is good enough for this purpose. I chose murmurhash2()
for now because it only needs a uint32_t. The hash function be changed
later if needed.
With the order of files fixed during deltification, we suddenly produce
much better deltas. A full pack file containing all objects of my test
repo shrank from 2GB to 1.5GB because of this fix. However, it is a bit
slower and drives up memory use during deltification, which is problem
number 3: The same out-of-memory problem I saw while testing my patch
for got-read-pack.
To avoid running out of memory we can reduce max_delta_memsize in
pick_deltas(); the current setting allows up to 200MB of deltas to be
stored in RAM, and I am reducing this to 32MB.
With this, gotadmin pack -a remains under the default data size limit
on amd64 (1.5GB) on my test repo (OpenBSD src.git), and can successfully
produce a pack file without bumping ulimit -d.
ok?
diff refs/heads/main refs/heads/meta-path-fix
blob - 5415448aa86fb16df21cd1e028be51fecc89f699
blob + 1811adf838ff6ee401fc281c3a5fc22c16e23b12
--- lib/pack_create.c
+++ lib/pack_create.c
@@ -56,6 +56,8 @@
#include "got_lib_ratelimit.h"
#include "got_lib_inflate.h"
+#include "murmurhash2.h"
+
#ifndef MIN
#define MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
#endif
@@ -70,7 +72,7 @@
struct got_pack_meta {
struct got_object_id id;
- char *path;
+ uint32_t path_hash;
int obj_type;
off_t size;
time_t mtime;
@@ -105,7 +107,6 @@ static const struct got_error *
alloc_meta(struct got_pack_meta **new, struct got_object_id *id,
const char *path, int obj_type, time_t mtime)
{
- const struct got_error *err = NULL;
struct got_pack_meta *m;
*new = NULL;
@@ -116,13 +117,7 @@ alloc_meta(struct got_pack_meta **new, struct got_obje
memcpy(&m->id, id, sizeof(m->id));
- m->path = strdup(path);
- if (m->path == NULL) {
- err = got_error_from_errno("strdup");
- free(m);
- return err;
- }
-
+ m->path_hash = murmurhash2(path, strlen(path), 0xd70af26a);
m->obj_type = obj_type;
m->mtime = mtime;
*new = m;
@@ -134,8 +129,7 @@ clear_meta(struct got_pack_meta *meta)
{
if (meta == NULL)
return;
- free(meta->path);
- meta->path = NULL;
+ meta->path_hash = 0;
free(meta->delta_buf);
meta->delta_buf = NULL;
free(meta->base_obj_id);
@@ -156,16 +150,16 @@ static int
delta_order_cmp(const void *pa, const void *pb)
{
struct got_pack_meta *a, *b;
- int cmp;
a = *(struct got_pack_meta **)pa;
b = *(struct got_pack_meta **)pb;
if (a->obj_type != b->obj_type)
return a->obj_type - b->obj_type;
- cmp = strcmp(a->path, b->path);
- if (cmp != 0)
- return cmp;
+ if (a->path_hash < b->path_hash)
+ return -1;
+ if (a->path_hash > b->path_hash)
+ return 1;
if (a->mtime < b->mtime)
return -1;
if (a->mtime > b->mtime)
@@ -687,7 +681,7 @@ pick_deltas(struct got_pack_meta **meta, int nmeta, in
off_t size, best_size;
const int max_base_candidates = 3;
size_t delta_memsize = 0;
- const size_t max_delta_memsize = 25 * GOT_DELTA_RESULT_SIZE_CACHED_MAX;
+ const size_t max_delta_memsize = 4 * GOT_DELTA_RESULT_SIZE_CACHED_MAX;
int outfd = -1;
qsort(meta, nmeta, sizeof(struct got_pack_meta *), delta_order_cmp);
@@ -954,6 +948,8 @@ load_tree_entries(struct got_object_id_queue *ids, int
err = got_object_qid_alloc(&qid, id);
if (err)
break;
+ qid->data = p;
+ p = NULL;
STAILQ_INSERT_TAIL(ids, qid, entry);
} else if (S_ISREG(mode) || S_ISLNK(mode)) {
err = add_object(want_meta,
@@ -963,9 +959,12 @@ load_tree_entries(struct got_object_id_queue *ids, int
progress_cb, progress_arg, rl);
if (err)
break;
+ free(p);
+ p = NULL;
+ } else {
+ free(p);
+ p = NULL;
}
- free(p);
- p = NULL;
}
got_object_tree_close(tree);
@@ -993,11 +992,18 @@ load_tree(int want_meta, struct got_object_idset *idse
err = got_object_qid_alloc(&qid, tree_id);
if (err)
return err;
+ qid->data = strdup(dpath);
+ if (qid->data == NULL) {
+ err = got_error_from_errno("strdup");
+ free(qid);
+ return err;
+ }
STAILQ_INIT(&tree_ids);
STAILQ_INSERT_TAIL(&tree_ids, qid, entry);
while (!STAILQ_EMPTY(&tree_ids)) {
+ const char *path;
if (cancel_cb) {
err = (*cancel_cb)(cancel_arg);
if (err)
@@ -1006,32 +1012,38 @@ load_tree(int want_meta, struct got_object_idset *idse
qid = STAILQ_FIRST(&tree_ids);
STAILQ_REMOVE_HEAD(&tree_ids, entry);
+ path = qid->data;
if (got_object_idset_contains(idset, &qid->id) ||
got_object_idset_contains(idset_exclude, &qid->id)) {
+ free(qid->data);
got_object_qid_free(qid);
continue;
}
err = add_object(want_meta, want_meta ? idset : idset_exclude,
- &qid->id, dpath, GOT_OBJ_TYPE_TREE,
+ &qid->id, path, GOT_OBJ_TYPE_TREE,
mtime, loose_obj_only, repo,
ncolored, nfound, ntrees, progress_cb, progress_arg, rl);
if (err) {
+ free(qid->data);
got_object_qid_free(qid);
break;
}
err = load_tree_entries(&tree_ids, want_meta, idset,
idset_exclude, &qid->id,
- dpath, mtime, repo, loose_obj_only, ncolored, nfound,
+ path, mtime, repo, loose_obj_only, ncolored, nfound,
ntrees, progress_cb, progress_arg, rl,
cancel_cb, cancel_arg);
+ free(qid->data);
got_object_qid_free(qid);
if (err)
break;
}
+ STAILQ_FOREACH(qid, &tree_ids, entry)
+ free(qid->data);
got_object_id_queue_free(&tree_ids);
return err;
}
fix path meta-data used for packing