"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From: Omar Polo <op@omarpolo.com>
Subject: Re: fix path meta-data used for packing
To: Stefan Sperling <stsp@stsp.name>
Cc: gameoftrees@openbsd.org
Date: Fri, 20 May 2022 10:17:49 +0200

Stefan Sperling <stsp@stsp.name> wrote:
> This patch isolates and refines a fix also contained in my
> got-read-pack object enumeration patch.
> 
> When packing large repositories I have seen gotadmin pack -a run
> out of memory during the deltification stage. This only happened
> with the got-read-pack object enumeration patch applied, which
> didn't make much sense at first. There are 3 problems involved,
> and the patch below contains fixes for all of them.
> 
> The first problem fixed by the patch below is that the paths we store
> in struct got_pack_meta are incorrect (to see why, just print out paths
> we pass to alloc_meta(); they are obviously wrong). This problem is

are they all just relative paths?  I've seen a bunch of got.c, tog.c,
worktree.c, ...

> fixed by making load_tree_entries() store the correct path in qid->data
> for each subtree we are going to traverse in load_tree().
> 
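iiuc the mechanism is simply to carry each subtree's own path along
with its id on the queue via qid->data, roughly (condensing the
relevant hunks of the diff below; p is the entry path built earlier
in the loop):

	/* load_tree_entries(): hand the entry path over to the queue */
	qid->data = p;
	p = NULL;
	STAILQ_INSERT_TAIL(ids, qid, entry);

	/* load_tree(): use that path instead of the top-level dpath,
	 * then free it together with the qid */
	path = qid->data;
	...
	free(qid->data);
	got_object_qid_free(qid);
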
> This leads to a second problem: Storing copies of full paths on meta
> data entries consumes quite a bit of memory when packing a large repo.

sharing this just because I've logged the paths and did some quick
math (assuming emacs didn't betray me): ~35% of the calls to
alloc_meta are with path "", followed by "lib" at ~8.5% and "got"
at ~4%.

> To work around this, we can store a hash of the path in got_pack_meta,
> rather than a full path (Git does something similar here). We want to sort
> files which share the same path next to each other during deltification,
> and a hash of the path is good enough for this purpose. I chose murmurhash2()
> for now because it only needs a uint32_t. The hash function can be
> changed later if needed.
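
the property we rely on here is just that equal paths always hash to
the same 32-bit value, so sorting on the hash still puts objects from
the same path next to each other; a collision between two different
paths presumably only costs a slightly less ideal delta ordering.  a
toy illustration (assuming the murmurhash2.h from our tree is on the
include path):

	#include <inttypes.h>
	#include <stdio.h>
	#include <string.h>

	#include "murmurhash2.h"

	int
	main(void)
	{
		const char *p = "lib/pack_create.c";
		uint32_t a = murmurhash2(p, strlen(p), 0xd70af26a);
		uint32_t b = murmurhash2(p, strlen(p), 0xd70af26a);

		/* same path and seed -> same sort key, so two objects
		 * at this path would land next to each other */
		printf("%" PRIu32 " %" PRIu32 " %s\n", a, b,
		    a == b ? "grouped" : "bug");
		return 0;
	}
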
> 
> With the order of files fixed during deltification, we suddenly produce
> much better deltas. A full pack file containing all objects of my test
> repo shrank from 2GB to 1.5GB because of this fix. However, it is a bit
> slower and drives up memory use during deltification, which is problem
> number 3: The same out-of-memory problem I saw while testing my patch
> for got-read-pack.
> To avoid running out of memory we can reduce max_delta_memsize in
> pick_deltas(); the current setting allows up to 200MB of deltas to be
> stored in RAM, and I am reducing this to 32MB.
> With this, gotadmin pack -a remains under the default data size limit
> on amd64 (1.5GB) on my test repo (OpenBSD src.git), and can successfully
> produce a pack file without bumping ulimit -d.
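
if I'm reading the constants right, GOT_DELTA_RESULT_SIZE_CACHED_MAX
works out to 8MB here: 25 * 8MB = 200MB for the old limit and
4 * 8MB = 32MB for the new one.
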
> 
> ok?

ok op

one question below

> diff refs/heads/main refs/heads/meta-path-fix
> blob - 5415448aa86fb16df21cd1e028be51fecc89f699
> blob + 1811adf838ff6ee401fc281c3a5fc22c16e23b12
> --- lib/pack_create.c
> +++ lib/pack_create.c
> @@ -56,6 +56,8 @@
>  #include "got_lib_ratelimit.h"
>  #include "got_lib_inflate.h"
>  
> +#include "murmurhash2.h"
> +
>  #ifndef MIN
>  #define	MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
>  #endif
> @@ -70,7 +72,7 @@
>  
>  struct got_pack_meta {
>  	struct got_object_id id;
> -	char	*path;
> +	uint32_t path_hash;
>  	int	obj_type;
>  	off_t	size;
>  	time_t	mtime;
> @@ -105,7 +107,6 @@ static const struct got_error *
>  alloc_meta(struct got_pack_meta **new, struct got_object_id *id,
>      const char *path, int obj_type, time_t mtime)
>  {
> -	const struct got_error *err = NULL;
>  	struct got_pack_meta *m;
>  
>  	*new = NULL;
> @@ -116,13 +117,7 @@ alloc_meta(struct got_pack_meta **new, struct got_obje
>  
>  	memcpy(&m->id, id, sizeof(m->id));
>  
> -	m->path = strdup(path);
> -	if (m->path == NULL) {
> -		err = got_error_from_errno("strdup");
> -		free(m);
> -		return err;
> -	}
> -
> +	m->path_hash = murmurhash2(path, strlen(path), 0xd70af26a);

is reusing the same seed on every run meant to keep things
deterministic and produce the same pack on every run?  What are the
disadvantages of, say, using a random seed freshly generated on every
run?

(a quick grep shows that we're always using hardcoded seeds for
murmurhash2 in other cases too, so this is not an issue, just a
question)
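
to make the question concrete, the alternative I have in mind would be
something along these lines (untested sketch; pack_path_seed and
seeded are names I just made up):

	/* hypothetical per-run seed instead of the hardcoded constant;
	 * the trade-off would be that packs are no longer
	 * byte-for-byte reproducible across runs, since the delta
	 * ordering can change */
	static uint32_t pack_path_seed;
	static int seeded;

	if (!seeded) {
		pack_path_seed = arc4random();
		seeded = 1;
	}
	m->path_hash = murmurhash2(path, strlen(path), pack_path_seed);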

>  	m->obj_type = obj_type;
>  	m->mtime = mtime;
>  	*new = m;
> @@ -134,8 +129,7 @@ clear_meta(struct got_pack_meta *meta)
>  {
>  	if (meta == NULL)
>  		return;
> -	free(meta->path);
> -	meta->path = NULL;
> +	meta->path_hash = 0;
>  	free(meta->delta_buf);
>  	meta->delta_buf = NULL;
>  	free(meta->base_obj_id);
> @@ -156,16 +150,16 @@ static int
>  delta_order_cmp(const void *pa, const void *pb)
>  {
>  	struct got_pack_meta *a, *b;
> -	int cmp;
>  
>  	a = *(struct got_pack_meta **)pa;
>  	b = *(struct got_pack_meta **)pb;
>  
>  	if (a->obj_type != b->obj_type)
>  		return a->obj_type - b->obj_type;
> -	cmp = strcmp(a->path, b->path);
> -	if (cmp != 0)
> -		return cmp;
> +	if (a->path_hash < b->path_hash)
> +		return -1;
> +	if (a->path_hash > b->path_hash)
> +		return 1;
>  	if (a->mtime < b->mtime)
>  		return -1;
>  	if (a->mtime > b->mtime)
> @@ -687,7 +681,7 @@ pick_deltas(struct got_pack_meta **meta, int nmeta, in
>  	off_t size, best_size;
>  	const int max_base_candidates = 3;
>  	size_t delta_memsize = 0;
> -	const size_t max_delta_memsize = 25 * GOT_DELTA_RESULT_SIZE_CACHED_MAX;
> +	const size_t max_delta_memsize = 4 * GOT_DELTA_RESULT_SIZE_CACHED_MAX;
>  	int outfd = -1;
>  
>  	qsort(meta, nmeta, sizeof(struct got_pack_meta *), delta_order_cmp);
> @@ -954,6 +948,8 @@ load_tree_entries(struct got_object_id_queue *ids, int
>  			err = got_object_qid_alloc(&qid, id);
>  			if (err)
>  				break;
> +			qid->data = p;
> +			p = NULL;
>  			STAILQ_INSERT_TAIL(ids, qid, entry);
>  		} else if (S_ISREG(mode) || S_ISLNK(mode)) {
>  			err = add_object(want_meta,
> @@ -963,9 +959,12 @@ load_tree_entries(struct got_object_id_queue *ids, int
>  			    progress_cb, progress_arg, rl);
>  			if (err)
>  				break;
> +			free(p);
> +			p = NULL;
> +		} else {
> +			free(p);
> +			p = NULL;
>  		}
> -		free(p);
> -		p = NULL;
>  	}
>  
>  	got_object_tree_close(tree);
> @@ -993,11 +992,18 @@ load_tree(int want_meta, struct got_object_idset *idse
>  	err = got_object_qid_alloc(&qid, tree_id);
>  	if (err)
>  		return err;
> +	qid->data = strdup(dpath);
> +	if (qid->data == NULL) {
> +		err = got_error_from_errno("strdup");
> +		free(qid);
> +		return err;
> +	}
>  
>  	STAILQ_INIT(&tree_ids);
>  	STAILQ_INSERT_TAIL(&tree_ids, qid, entry);
>  
>  	while (!STAILQ_EMPTY(&tree_ids)) {
> +		const char *path;
>  		if (cancel_cb) {
>  			err = (*cancel_cb)(cancel_arg);
>  			if (err)
> @@ -1006,32 +1012,38 @@ load_tree(int want_meta, struct got_object_idset *idse
>  
>  		qid = STAILQ_FIRST(&tree_ids);
>  		STAILQ_REMOVE_HEAD(&tree_ids, entry);
> +		path = qid->data;
>  
>  		if (got_object_idset_contains(idset, &qid->id) ||
>  		    got_object_idset_contains(idset_exclude, &qid->id)) {
> +			free(qid->data);
>  			got_object_qid_free(qid);
>  			continue;
>  		}
>  
>  		err = add_object(want_meta, want_meta ? idset : idset_exclude,
> -		    &qid->id, dpath, GOT_OBJ_TYPE_TREE,
> +		    &qid->id, path, GOT_OBJ_TYPE_TREE,
>  		    mtime, loose_obj_only, repo,
>  		    ncolored, nfound, ntrees, progress_cb, progress_arg, rl);
>  		if (err) {
> +			free(qid->data);
>  			got_object_qid_free(qid);
>  			break;
>  		}
>  
>  		err = load_tree_entries(&tree_ids, want_meta, idset,
>  		    idset_exclude, &qid->id,
> -		    dpath, mtime, repo, loose_obj_only, ncolored, nfound,
> +		    path, mtime, repo, loose_obj_only, ncolored, nfound,
>  		    ntrees, progress_cb, progress_arg, rl,
>  		    cancel_cb, cancel_arg);
> +		free(qid->data);
>  		got_object_qid_free(qid);
>  		if (err)
>  			break;
>  	}
>  
> +	STAILQ_FOREACH(qid, &tree_ids, entry)
> +		free(qid->data);
>  	got_object_id_queue_free(&tree_ids);
>  	return err;
>  }