"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Omar Polo <op@omarpolo.com>
Subject:
Re: fix path meta-data used for packing
To:
Stefan Sperling <stsp@stsp.name>
Cc:
gameoftrees@openbsd.org
Date:
Fri, 20 May 2022 14:30:32 +0200

Download raw body.

Thread
Stefan Sperling <stsp@stsp.name> wrote:
> On Fri, May 20, 2022 at 10:17:49AM +0200, Omar Polo wrote:
> > > +	m->path_hash = murmurhash2(path, strlen(path), 0xd70af26a);
> > 
> > reusing the same seed on every run is to be deterministic and produce the
> > same pack at every run?  What are the disadvantages with, say, using a
> > random seed freshly generated at every run?
> > 
> > (a quick grep shows that we're always using hardcoded seeds for
> > murmurhash2 also in other cases too so this is not an issue, just a
> > question)
> 
> Yes, we should probably be using a unique arc4random() seed per run.
> The SipHash24 API encourages this as well.
> 
> We can fix this later.

with something like this? :)

written on the go and only lightly tested, but regress passes.  It
creates a fresh seed in got_pack_create and in got_deltify.  I can move
the arc4random call one level higher probably, but these functions seems
to be called only once.

I've also changed bloom.c to create a fresh seed upon bloom_init: the
change is very small but this "vendored" library so I'm not sure to
change it.

I don't see further calls to murmurhash2 with an hardcoded hash.

This will require some work in -portable thought.  I've seen what
openssh (IIRC) does to have a bundled arc4random and it's not
straightforward (even if we may be able to just steal their arc4random
compat.)  I'm happy to help with that however!

diff refs/heads/main refs/heads/randseeds
blob - 05e57b1f7053a73112d4713be722a197307866e2
blob + a66c49edad2dad5800dbe4eb0deb37ed923834ac
--- lib/bloom.c
+++ lib/bloom.c
@@ -77,7 +77,7 @@ static int bloom_check_add(struct bloom * bloom,
   }
 
   int hits = 0;
-  register unsigned int a = murmurhash2(buffer, len, 0x9747b28c);
+  register unsigned int a = murmurhash2(buffer, len, bloom->seed);
   register unsigned int b = murmurhash2(buffer, len, a);
   register unsigned int x;
   register unsigned int i;
@@ -111,6 +111,8 @@ int bloom_init(struct bloom * bloom, int entries, doub
 {
   bloom->ready = 0;
 
+  bloom->seed = arc4random();
+
   if (entries < 1000 || error == 0) {
     return 1;
   }
blob - 28356e3721b9f7902763322750d5a1b87640cd4d
blob + 51c848e8470bd02ed14a2bc377a173e39836239c
--- lib/bloom.h
+++ lib/bloom.h
@@ -52,6 +52,7 @@ struct bloom
   int bits;
   int bytes;
   int hashes;
+  uint32_t seed;
 
   // Fields below are private to the implementation. These may go away or
   // change incompatibly at any moment. Client code MUST NOT access or rely
blob - b3bbf0b189575c109c8b935c3519ade298fed3aa
blob + 526099720694041b29211dc3b8f4852c0e20d398
--- lib/deltify.c
+++ lib/deltify.c
@@ -88,9 +88,9 @@ static const uint32_t geartab[256] = {
 };
 
 static uint32_t
-hashblk(const unsigned char *p, off_t n)
+hashblk(const unsigned char *p, off_t n, uint32_t seed)
 {
-	return murmurhash2(p, n, 0x1d7c5ac3);
+	return murmurhash2(p, n, seed);
 }
 
 static const struct got_error *
@@ -237,7 +237,8 @@ addblk_mem(struct got_delta_table *dt, uint8_t *data, 
 
 static const struct got_error *
 lookupblk(struct got_delta_block **block, struct got_delta_table *dt,
-    unsigned char *p, off_t len, FILE *basefile, off_t basefile_offset0)
+    unsigned char *p, off_t len, uint32_t seed, FILE *basefile,
+    off_t basefile_offset0)
 {
 	int i;
 	uint32_t h;
@@ -246,7 +247,7 @@ lookupblk(struct got_delta_block **block, struct got_d
 
 	*block = NULL;
 
-	h = hashblk(p, len);
+	h = hashblk(p, len, seed);
 	for (i = h % dt->nalloc; dt->blocks[i].len != 0;
 	     i = (i + 1) % dt->nalloc) {
 		if (dt->blocks[i].hash != h ||
@@ -268,7 +269,8 @@ lookupblk(struct got_delta_block **block, struct got_d
 
 static const struct got_error *
 lookupblk_mem(struct got_delta_block **block, struct got_delta_table *dt,
-    unsigned char *p, off_t len, uint8_t *basedata, off_t basefile_offset0)
+    unsigned char *p, off_t len, uint32_t seed, uint8_t *basedata,
+    off_t basefile_offset0)
 {
 	int i;
 	uint32_t h;
@@ -276,7 +278,7 @@ lookupblk_mem(struct got_delta_block **block, struct g
 
 	*block = NULL;
 
-	h = hashblk(p, len);
+	h = hashblk(p, len, seed);
 	for (i = h % dt->nalloc; dt->blocks[i].len != 0;
 	     i = (i + 1) % dt->nalloc) {
 		if (dt->blocks[i].hash != h ||
@@ -355,7 +357,10 @@ got_deltify_init(struct got_delta_table **dt, FILE *f,
 	const struct got_error *err = NULL;
 	uint32_t h;
 	const off_t offset0 = fileoffset;
+	uint32_t seed;
 
+	seed = arc4random();
+
 	*dt = calloc(1, sizeof(**dt));
 	if (*dt == NULL)
 		return got_error_from_errno("calloc");
@@ -379,7 +384,7 @@ got_deltify_init(struct got_delta_table **dt, FILE *f,
 			goto done;
 		if (blocklen == 0)
 			break;
-		h = hashblk(buf, blocklen);
+		h = hashblk(buf, blocklen, seed);
 		err = addblk(*dt, f, offset0, blocklen,
 		    fileoffset - offset0, h);
 		if (err)
@@ -405,7 +410,10 @@ got_deltify_init_mem(struct got_delta_table **dt, uint
 	const struct got_error *err = NULL;
 	uint32_t h;
 	const off_t offset0 = fileoffset;
+	uint32_t seed;
 
+	seed = arc4random();
+
 	*dt = calloc(1, sizeof(**dt));
 	if (*dt == NULL)
 		return got_error_from_errno("calloc");
@@ -425,7 +433,7 @@ got_deltify_init_mem(struct got_delta_table **dt, uint
 			goto done;
 		if (blocklen == 0)
 			break;
-		h = hashblk(data + fileoffset, blocklen);
+		h = hashblk(data + fileoffset, blocklen, seed);
 		err = addblk_mem(*dt, data, offset0, blocklen,
 		    fileoffset - offset0, h);
 		if (err)
@@ -629,10 +637,13 @@ got_deltify(struct got_delta_instruction **deltas, int
 	const off_t offset0 = fileoffset;
 	size_t nalloc = 0;
 	const size_t alloc_chunk_size = 64;
+	uint32_t seed;
 
 	*deltas = NULL;
 	*ndeltas = 0;
 
+	seed = arc4random();
+
 	/*
 	 * offset0 indicates where data to be deltified begins.
 	 * For example, we want to avoid deltifying a Git object header at
@@ -663,7 +674,7 @@ got_deltify(struct got_delta_instruction **deltas, int
 			}
 			break;
 		}
-		err = lookupblk(&block, dt, buf, blocklen, basefile,
+		err = lookupblk(&block, dt, buf, blocklen, seed, basefile,
 		    basefile_offset0);
 		if (err)
 			break;
@@ -716,10 +727,13 @@ got_deltify_file_mem(struct got_delta_instruction **de
 	const off_t offset0 = fileoffset;
 	size_t nalloc = 0;
 	const size_t alloc_chunk_size = 64;
+	uint32_t seed;
 
 	*deltas = NULL;
 	*ndeltas = 0;
 
+	seed = arc4random();
+
 	/*
 	 * offset0 indicates where data to be deltified begins.
 	 * For example, we want to avoid deltifying a Git object header at
@@ -750,7 +764,7 @@ got_deltify_file_mem(struct got_delta_instruction **de
 			}
 			break;
 		}
-		err = lookupblk_mem(&block, dt, buf, blocklen, basedata,
+		err = lookupblk_mem(&block, dt, buf, blocklen, seed, basedata,
 		    basefile_offset0);
 		if (err)
 			break;
@@ -803,10 +817,13 @@ got_deltify_mem_file(struct got_delta_instruction **de
 	const off_t offset0 = fileoffset;
 	size_t nalloc = 0;
 	const size_t alloc_chunk_size = 64;
+	uint32_t seed;
 
 	*deltas = NULL;
 	*ndeltas = 0;
 
+	seed = arc4random();
+
 	*deltas = reallocarray(NULL, alloc_chunk_size,
 	    sizeof(struct got_delta_instruction));
 	if (*deltas == NULL)
@@ -828,7 +845,7 @@ got_deltify_mem_file(struct got_delta_instruction **de
 			}
 			break;
 		}
-		err = lookupblk(&block, dt, data + fileoffset, blocklen,
+		err = lookupblk(&block, dt, data + fileoffset, blocklen, seed,
 		    basefile, basefile_offset0);
 		if (err)
 			break;
@@ -878,10 +895,13 @@ got_deltify_mem_mem(struct got_delta_instruction **del
 	const off_t offset0 = fileoffset;
 	size_t nalloc = 0;
 	const size_t alloc_chunk_size = 64;
+	uint32_t seed;
 
 	*deltas = NULL;
 	*ndeltas = 0;
 
+	seed = arc4random();
+
 	*deltas = reallocarray(NULL, alloc_chunk_size,
 	    sizeof(struct got_delta_instruction));
 	if (*deltas == NULL)
@@ -904,7 +924,7 @@ got_deltify_mem_mem(struct got_delta_instruction **del
 			break;
 		}
 		err = lookupblk_mem(&block, dt, data + fileoffset, blocklen,
-		    basedata, basefile_offset0);
+		    seed, basedata, basefile_offset0);
 		if (err)
 			break;
 		if (block != NULL) {
blob - d748288cff18ac9179407bbf2ecb771ad11452c2
blob + f92a4d04144e6c8cbea41506b7f00f552a1942ad
--- lib/pack_create.c
+++ lib/pack_create.c
@@ -51,6 +51,7 @@
 #include "got_lib_object_cache.h"
 #include "got_lib_deflate.h"
 #include "got_lib_pack.h"
+#include "got_lib_pack_create.h"
 #include "got_lib_privsep.h"
 #include "got_lib_repository.h"
 #include "got_lib_ratelimit.h"
@@ -104,7 +105,7 @@ struct got_pack_metavec {
 
 static const struct got_error *
 alloc_meta(struct got_pack_meta **new, struct got_object_id *id,
-    const char *path, int obj_type, time_t mtime)
+    const char *path, int obj_type, time_t mtime, uint32_t seed)
 {
 	struct got_pack_meta *m;
 
@@ -116,7 +117,7 @@ alloc_meta(struct got_pack_meta **new, struct got_obje
 
 	memcpy(&m->id, id, sizeof(m->id));
 
-	m->path_hash = murmurhash2(path, strlen(path), 0xd70af26a);
+	m->path_hash = murmurhash2(path, strlen(path), seed);
 	m->obj_type = obj_type;
 	m->mtime = mtime;
 	*new = m;
@@ -856,8 +857,8 @@ search_packidx(int *found, struct got_object_id *id,
 static const struct got_error *
 add_object(int want_meta, struct got_object_idset *idset,
     struct got_object_id *id, const char *path, int obj_type,
-    time_t mtime, int loose_obj_only, struct got_repository *repo,
-    int *ncolored, int *nfound, int *ntrees,
+    time_t mtime, uint32_t seed, int loose_obj_only,
+    struct got_repository *repo, int *ncolored, int *nfound, int *ntrees,
     got_pack_progress_cb progress_cb, void *progress_arg,
     struct got_ratelimit *rl)
 {
@@ -874,7 +875,7 @@ add_object(int want_meta, struct got_object_idset *ids
 	}
 
 	if (want_meta) {
-		err = alloc_meta(&m, id, path, obj_type, mtime);
+		err = alloc_meta(&m, id, path, obj_type, mtime, seed);
 		if (err)
 			return err;
 
@@ -900,7 +901,7 @@ static const struct got_error *
 load_tree_entries(struct got_object_id_queue *ids, int want_meta,
     struct got_object_idset *idset, struct got_object_idset *idset_exclude,
     struct got_object_id *tree_id,
-    const char *dpath, time_t mtime, struct got_repository *repo,
+    const char *dpath, time_t mtime, uint32_t seed, struct got_repository *repo,
     int loose_obj_only, int *ncolored, int *nfound, int *ntrees,
     got_pack_progress_cb progress_cb, void *progress_arg,
     struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg)
@@ -953,8 +954,8 @@ load_tree_entries(struct got_object_id_queue *ids, int
 		} else if (S_ISREG(mode) || S_ISLNK(mode)) {
 			err = add_object(want_meta,
 			    want_meta ? idset : idset_exclude, id, p,
-			    GOT_OBJ_TYPE_BLOB, mtime, loose_obj_only, repo,
-			    ncolored, nfound, ntrees,
+			    GOT_OBJ_TYPE_BLOB, mtime, seed, loose_obj_only,
+			    repo, ncolored, nfound, ntrees,
 			    progress_cb, progress_arg, rl);
 			if (err)
 				break;
@@ -975,7 +976,7 @@ static const struct got_error *
 load_tree(int want_meta, struct got_object_idset *idset,
     struct got_object_idset *idset_exclude,
     struct got_object_id *tree_id, const char *dpath, time_t mtime,
-    struct got_repository *repo, int loose_obj_only,
+    uint32_t seed, struct got_repository *repo, int loose_obj_only,
     int *ncolored, int *nfound, int *ntrees,
     got_pack_progress_cb progress_cb, void *progress_arg,
     struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg)
@@ -1022,7 +1023,7 @@ load_tree(int want_meta, struct got_object_idset *idse
 
 		err = add_object(want_meta, want_meta ? idset : idset_exclude,
 		    &qid->id, path, GOT_OBJ_TYPE_TREE,
-		    mtime, loose_obj_only, repo,
+		    mtime, seed, loose_obj_only, repo,
 		    ncolored, nfound, ntrees, progress_cb, progress_arg, rl);
 		if (err) {
 			free(qid->data);
@@ -1032,7 +1033,7 @@ load_tree(int want_meta, struct got_object_idset *idse
 
 		err = load_tree_entries(&tree_ids, want_meta, idset,
 		    idset_exclude, &qid->id,
-		    path, mtime, repo, loose_obj_only, ncolored, nfound,
+		    path, mtime, seed, repo, loose_obj_only, ncolored, nfound,
 		    ntrees, progress_cb, progress_arg, rl,
 		    cancel_cb, cancel_arg);
 		free(qid->data);
@@ -1050,8 +1051,8 @@ load_tree(int want_meta, struct got_object_idset *idse
 static const struct got_error *
 load_commit(int want_meta, struct got_object_idset *idset,
     struct got_object_idset *idset_exclude,
-    struct got_object_id *id, struct got_repository *repo, int loose_obj_only,
-    int *ncolored, int *nfound, int *ntrees,
+    struct got_object_id *id, struct got_repository *repo, uint32_t seed,
+    int loose_obj_only, int *ncolored, int *nfound, int *ntrees,
     got_pack_progress_cb progress_cb, void *progress_arg,
     struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg)
 {
@@ -1077,7 +1078,7 @@ load_commit(int want_meta, struct got_object_idset *id
 
 	err = add_object(want_meta, want_meta ? idset : idset_exclude,
 	    id, "", GOT_OBJ_TYPE_COMMIT,
-	    got_object_commit_get_committer_time(commit),
+	    got_object_commit_get_committer_time(commit), seed,
 	    loose_obj_only, repo,
 	    ncolored, nfound, ntrees, progress_cb, progress_arg, rl);
 	if (err)
@@ -1085,7 +1086,7 @@ load_commit(int want_meta, struct got_object_idset *id
 
 	err = load_tree(want_meta, idset, idset_exclude,
 	    got_object_commit_get_tree_id(commit),
-	    "", got_object_commit_get_committer_time(commit),
+	    "", got_object_commit_get_committer_time(commit), seed,
 	    repo, loose_obj_only, ncolored, nfound, ntrees,
 	    progress_cb, progress_arg, rl, cancel_cb, cancel_arg);
 done:
@@ -1096,8 +1097,8 @@ done:
 static const struct got_error *
 load_tag(int want_meta, struct got_object_idset *idset,
     struct got_object_idset *idset_exclude,
-    struct got_object_id *id, struct got_repository *repo, int loose_obj_only,
-    int *ncolored, int *nfound, int *ntrees,
+    struct got_object_id *id, struct got_repository *repo, uint32_t seed,
+    int loose_obj_only, int *ncolored, int *nfound, int *ntrees,
     got_pack_progress_cb progress_cb, void *progress_arg,
     struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg)
 {
@@ -1123,7 +1124,7 @@ load_tag(int want_meta, struct got_object_idset *idset
 
 	err = add_object(want_meta, want_meta ? idset : idset_exclude,
 	    id, "", GOT_OBJ_TYPE_TAG,
-	    got_object_tag_get_tagger_time(tag), loose_obj_only, repo,
+	    got_object_tag_get_tagger_time(tag), seed, loose_obj_only, repo,
 	    ncolored, nfound, ntrees, progress_cb, progress_arg, rl);
 	if (err)
 		goto done;
@@ -1131,16 +1132,16 @@ load_tag(int want_meta, struct got_object_idset *idset
 	switch (got_object_tag_get_object_type(tag)) {
 	case GOT_OBJ_TYPE_COMMIT:
 		err = load_commit(want_meta, idset, idset_exclude,
-		    got_object_tag_get_object_id(tag), repo, loose_obj_only,
-		    ncolored, nfound, ntrees, progress_cb, progress_arg, rl,
-		    cancel_cb, cancel_arg);
+		    got_object_tag_get_object_id(tag), repo, seed,
+		    loose_obj_only, ncolored, nfound, ntrees,
+		    progress_cb, progress_arg, rl, cancel_cb, cancel_arg);
 		break;
 	case GOT_OBJ_TYPE_TREE:
 		err = load_tree(want_meta, idset, idset_exclude,
 		    got_object_tag_get_object_id(tag), "",
-		    got_object_tag_get_tagger_time(tag), repo, loose_obj_only,
-		    ncolored, nfound, ntrees, progress_cb, progress_arg, rl,
-		    cancel_cb, cancel_arg);
+		    got_object_tag_get_tagger_time(tag), seed, repo,
+		    loose_obj_only, ncolored, nfound, ntrees,
+		    progress_cb, progress_arg, rl, cancel_cb, cancel_arg);
 		break;
 	default:
 		break;
@@ -1451,8 +1452,9 @@ static const struct got_error *
 load_object_ids(int *ncolored, int *nfound, int *ntrees,
     struct got_object_idset *idset, struct got_object_id **theirs, int ntheirs,
     struct got_object_id **ours, int nours, struct got_repository *repo,
-    int loose_obj_only, got_pack_progress_cb progress_cb, void *progress_arg,
-    struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg)
+    uint32_t seed, int loose_obj_only, got_pack_progress_cb progress_cb,
+    void *progress_arg, struct got_ratelimit *rl, got_cancel_cb cancel_cb,
+    void *cancel_arg)
 {
 	const struct got_error *err = NULL;
 	struct got_object_id **ids = NULL;
@@ -1481,14 +1483,14 @@ load_object_ids(int *ncolored, int *nfound, int *ntree
 			return err;
 		if (obj_type == GOT_OBJ_TYPE_COMMIT) {
 			err = load_commit(0, idset, idset_exclude, id, repo,
-			    loose_obj_only, ncolored, nfound, ntrees,
+			    seed, loose_obj_only, ncolored, nfound, ntrees,
 			    progress_cb, progress_arg, rl,
 			    cancel_cb, cancel_arg);
 			if (err)
 				goto done;
 		} else if (obj_type == GOT_OBJ_TYPE_TAG) {
 			err = load_tag(0, idset, idset_exclude, id, repo,
-			    loose_obj_only, ncolored, nfound, ntrees,
+			    seed, loose_obj_only, ncolored, nfound, ntrees,
 			    progress_cb, progress_arg, rl,
 			    cancel_cb, cancel_arg);
 			if (err)
@@ -1497,8 +1499,8 @@ load_object_ids(int *ncolored, int *nfound, int *ntree
 	}
 
 	for (i = 0; i < nobj; i++) {
-		err = load_commit(1, idset, idset_exclude,
-		    ids[i], repo, loose_obj_only, ncolored, nfound, ntrees,
+		err = load_commit(1, idset, idset_exclude, ids[i], repo,
+		    seed, loose_obj_only, ncolored, nfound, ntrees,
 		    progress_cb, progress_arg, rl, cancel_cb, cancel_arg);
 		if (err)
 			goto done;
@@ -1519,7 +1521,7 @@ load_object_ids(int *ncolored, int *nfound, int *ntree
 		if (obj_type != GOT_OBJ_TYPE_TAG)
 			continue;
 		err = load_tag(1, idset, idset_exclude, id, repo,
-		    loose_obj_only, ncolored, nfound, ntrees,
+		    seed, loose_obj_only, ncolored, nfound, ntrees,
 		    progress_cb, progress_arg, rl, cancel_cb, cancel_arg);
 		if (err)
 			goto done;
@@ -1943,7 +1945,10 @@ got_pack_create(uint8_t *packsha1, FILE *packfile,
 	struct got_pack_metavec deltify, reuse;
 	int ncolored = 0, nfound = 0, ntrees = 0;
 	size_t ndeltify;
+	uint32_t seed;
 
+	seed = arc4random();
+
 	memset(&deltify, 0, sizeof(deltify));
 	memset(&reuse, 0, sizeof(reuse));
 
@@ -1954,7 +1959,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile,
 		return got_error_from_errno("got_object_idset_alloc");
 
 	err = load_object_ids(&ncolored, &nfound, &ntrees, idset, theirs,
-	    ntheirs, ours, nours, repo, loose_obj_only,
+	    ntheirs, ours, nours, repo, seed, loose_obj_only,
 	    progress_cb, progress_arg, &rl, cancel_cb, cancel_arg);
 	if (err)
 		goto done;