"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From: Stefan Sperling <stsp@stsp.name>
Subject: fix path meta-data used for packing
To: gameoftrees@openbsd.org
Date: Fri, 20 May 2022 00:27:29 +0200

This patch isolates and refines a fix also contained in my
got-read-pack object enumeration patch.

When packing large repositories I have seen gotadmin pack -a run
out of memory during the deltification stage. This only happened
with the got-read-pack object enumeration patch applied, which
didn't make much sense at first. There are 3 problems involved,
and the patch below contains fixes for all of them.

The first problem fixed by the patch below is that the paths we store
in struct got_pack_meta are incorrect (to see why, just print out the
paths we pass to alloc_meta(); they are obviously wrong). This problem
is fixed by making load_tree_entries() store the correct path in
qid->data for each subtree we are going to traverse in load_tree().
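To illustrate the idea, here is a standalone sketch (made-up names, not
got code) of the pattern the qid->data change implements: every queued
subtree owns a copy of its full path, built from the parent path plus
the tree entry name, and whoever dequeues the entry frees that copy.

#include <sys/queue.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

struct qent {
	char *path;			/* owned by this queue entry */
	STAILQ_ENTRY(qent) entry;
};
STAILQ_HEAD(qhead, qent);

static char *
join_path(const char *parent, const char *name)
{
	char *p;

	if (asprintf(&p, "%s%s%s", parent, *parent ? "/" : "", name) == -1)
		err(1, "asprintf");
	return p;
}

static void
enqueue_subtree(struct qhead *q, const char *parent, const char *name)
{
	struct qent *e;

	if ((e = calloc(1, sizeof(*e))) == NULL)
		err(1, "calloc");
	e->path = join_path(parent, name); /* path travels with the entry */
	STAILQ_INSERT_TAIL(q, e, entry);
}

int
main(void)
{
	struct qhead q = STAILQ_HEAD_INITIALIZER(q);
	struct qent *e;

	enqueue_subtree(&q, "", "lib");
	enqueue_subtree(&q, "lib", "got");

	while (!STAILQ_EMPTY(&q)) {
		e = STAILQ_FIRST(&q);
		STAILQ_REMOVE_HEAD(&q, entry);
		printf("tree at path: %s\n", e->path);
		free(e->path);		/* the consumer frees the path */
		free(e);
	}
	return 0;
}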

This leads to a second problem: storing copies of full paths on meta
data entries consumes quite a bit of memory when packing a large repo.
To work around this, we can store a hash of the path in got_pack_meta,
rather than a full path (Git does something similar here). We want to
sort files which share the same path next to each other during
deltification, and a hash of the path is good enough for this purpose.
I chose murmurhash2() for now because its result fits in a uint32_t.
The hash function can be changed later if needed.
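As a rough standalone illustration of why a 32-bit key suffices (this
is not got code; the hash below is a trivial stand-in for
murmurhash2()): the comparator only needs objects that share a path to
end up adjacent after sorting, and any order between different paths
is fine for delta selection.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t
path_hash(const char *path)
{
	/* FNV-1a, just for illustration; stands in for murmurhash2() */
	uint32_t h = 2166136261u;

	while (*path)
		h = (h ^ (unsigned char)*path++) * 16777619u;
	return h;
}

struct obj {
	const char *path;
	uint32_t path_hash;
};

static int
by_path_hash(const void *pa, const void *pb)
{
	const struct obj *a = pa, *b = pb;

	if (a->path_hash < b->path_hash)
		return -1;
	if (a->path_hash > b->path_hash)
		return 1;
	return 0;
}

int
main(void)
{
	struct obj objs[] = {
		{ "lib/pack_create.c", 0 },
		{ "usr.bin/got/got.c", 0 },
		{ "lib/pack_create.c", 0 },	/* older version of same file */
	};
	size_t i, n = sizeof(objs) / sizeof(objs[0]);

	for (i = 0; i < n; i++)
		objs[i].path_hash = path_hash(objs[i].path);

	qsort(objs, n, sizeof(objs[0]), by_path_hash);

	/* both lib/pack_create.c entries are now adjacent */
	for (i = 0; i < n; i++)
		printf("%08x %s\n", objs[i].path_hash, objs[i].path);
	return 0;
}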

With the order of files fixed during deltification, we suddenly produce
much better deltas. A full pack file containing all objects of my test
repo shrank from 2GB to 1.5GB because of this fix. However, it is a bit
slower and drives up memory use during deltification, which brings us
to problem number 3: the same out-of-memory problem I saw while testing
my patch for got-read-pack.
To avoid running out of memory, we can reduce max_delta_memsize in
pick_deltas(); the current setting allows up to 200MB of deltas to be
stored in RAM, and I am reducing this to 32MB.
With this, gotadmin pack -a remains under the default data size limit
on amd64 (1.5GB) on my test repo (OpenBSD src.git) and can successfully
produce a pack file without bumping ulimit -d.
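For reference, the figures above imply GOT_DELTA_RESULT_SIZE_CACHED_MAX
is 8MB, so the change in pick_deltas() works out to:

	old limit: 25 * 8MB = 200MB of cached deltas
	new limit:  4 * 8MB =  32MB of cached deltas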

ok?

diff refs/heads/main refs/heads/meta-path-fix
blob - 5415448aa86fb16df21cd1e028be51fecc89f699
blob + 1811adf838ff6ee401fc281c3a5fc22c16e23b12
--- lib/pack_create.c
+++ lib/pack_create.c
@@ -56,6 +56,8 @@
 #include "got_lib_ratelimit.h"
 #include "got_lib_inflate.h"
 
+#include "murmurhash2.h"
+
 #ifndef MIN
 #define	MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
 #endif
@@ -70,7 +72,7 @@
 
 struct got_pack_meta {
 	struct got_object_id id;
-	char	*path;
+	uint32_t path_hash;
 	int	obj_type;
 	off_t	size;
 	time_t	mtime;
@@ -105,7 +107,6 @@ static const struct got_error *
 alloc_meta(struct got_pack_meta **new, struct got_object_id *id,
     const char *path, int obj_type, time_t mtime)
 {
-	const struct got_error *err = NULL;
 	struct got_pack_meta *m;
 
 	*new = NULL;
@@ -116,13 +117,7 @@ alloc_meta(struct got_pack_meta **new, struct got_obje
 
 	memcpy(&m->id, id, sizeof(m->id));
 
-	m->path = strdup(path);
-	if (m->path == NULL) {
-		err = got_error_from_errno("strdup");
-		free(m);
-		return err;
-	}
-
+	m->path_hash = murmurhash2(path, strlen(path), 0xd70af26a);
 	m->obj_type = obj_type;
 	m->mtime = mtime;
 	*new = m;
@@ -134,8 +129,7 @@ clear_meta(struct got_pack_meta *meta)
 {
 	if (meta == NULL)
 		return;
-	free(meta->path);
-	meta->path = NULL;
+	meta->path_hash = 0;
 	free(meta->delta_buf);
 	meta->delta_buf = NULL;
 	free(meta->base_obj_id);
@@ -156,16 +150,16 @@ static int
 delta_order_cmp(const void *pa, const void *pb)
 {
 	struct got_pack_meta *a, *b;
-	int cmp;
 
 	a = *(struct got_pack_meta **)pa;
 	b = *(struct got_pack_meta **)pb;
 
 	if (a->obj_type != b->obj_type)
 		return a->obj_type - b->obj_type;
-	cmp = strcmp(a->path, b->path);
-	if (cmp != 0)
-		return cmp;
+	if (a->path_hash < b->path_hash)
+		return -1;
+	if (a->path_hash > b->path_hash)
+		return 1;
 	if (a->mtime < b->mtime)
 		return -1;
 	if (a->mtime > b->mtime)
@@ -687,7 +681,7 @@ pick_deltas(struct got_pack_meta **meta, int nmeta, in
 	off_t size, best_size;
 	const int max_base_candidates = 3;
 	size_t delta_memsize = 0;
-	const size_t max_delta_memsize = 25 * GOT_DELTA_RESULT_SIZE_CACHED_MAX;
+	const size_t max_delta_memsize = 4 * GOT_DELTA_RESULT_SIZE_CACHED_MAX;
 	int outfd = -1;
 
 	qsort(meta, nmeta, sizeof(struct got_pack_meta *), delta_order_cmp);
@@ -954,6 +948,8 @@ load_tree_entries(struct got_object_id_queue *ids, int
 			err = got_object_qid_alloc(&qid, id);
 			if (err)
 				break;
+			qid->data = p;
+			p = NULL;
 			STAILQ_INSERT_TAIL(ids, qid, entry);
 		} else if (S_ISREG(mode) || S_ISLNK(mode)) {
 			err = add_object(want_meta,
@@ -963,9 +959,12 @@ load_tree_entries(struct got_object_id_queue *ids, int
 			    progress_cb, progress_arg, rl);
 			if (err)
 				break;
+			free(p);
+			p = NULL;
+		} else {
+			free(p);
+			p = NULL;
 		}
-		free(p);
-		p = NULL;
 	}
 
 	got_object_tree_close(tree);
@@ -993,11 +992,18 @@ load_tree(int want_meta, struct got_object_idset *idse
 	err = got_object_qid_alloc(&qid, tree_id);
 	if (err)
 		return err;
+	qid->data = strdup(dpath);
+	if (qid->data == NULL) {
+		err = got_error_from_errno("strdup");
+		free(qid);
+		return err;
+	}
 
 	STAILQ_INIT(&tree_ids);
 	STAILQ_INSERT_TAIL(&tree_ids, qid, entry);
 
 	while (!STAILQ_EMPTY(&tree_ids)) {
+		const char *path;
 		if (cancel_cb) {
 			err = (*cancel_cb)(cancel_arg);
 			if (err)
@@ -1006,32 +1012,38 @@ load_tree(int want_meta, struct got_object_idset *idse
 
 		qid = STAILQ_FIRST(&tree_ids);
 		STAILQ_REMOVE_HEAD(&tree_ids, entry);
+		path = qid->data;
 
 		if (got_object_idset_contains(idset, &qid->id) ||
 		    got_object_idset_contains(idset_exclude, &qid->id)) {
+			free(qid->data);
 			got_object_qid_free(qid);
 			continue;
 		}
 
 		err = add_object(want_meta, want_meta ? idset : idset_exclude,
-		    &qid->id, dpath, GOT_OBJ_TYPE_TREE,
+		    &qid->id, path, GOT_OBJ_TYPE_TREE,
 		    mtime, loose_obj_only, repo,
 		    ncolored, nfound, ntrees, progress_cb, progress_arg, rl);
 		if (err) {
+			free(qid->data);
 			got_object_qid_free(qid);
 			break;
 		}
 
 		err = load_tree_entries(&tree_ids, want_meta, idset,
 		    idset_exclude, &qid->id,
-		    dpath, mtime, repo, loose_obj_only, ncolored, nfound,
+		    path, mtime, repo, loose_obj_only, ncolored, nfound,
 		    ntrees, progress_cb, progress_arg, rl,
 		    cancel_cb, cancel_arg);
+		free(qid->data);
 		got_object_qid_free(qid);
 		if (err)
 			break;
 	}
 
+	STAILQ_FOREACH(qid, &tree_ids, entry)
+		free(qid->data);
 	got_object_id_queue_free(&tree_ids);
 	return err;
 }