Download raw body.
fix path meta-data used for packing
Stefan Sperling <stsp@stsp.name> wrote: > On Fri, May 20, 2022 at 10:17:49AM +0200, Omar Polo wrote: > > > + m->path_hash = murmurhash2(path, strlen(path), 0xd70af26a); > > > > reusing the same seed on every run is to be deterministic and produce the > > same pack at every run? What are the disadvantages with, say, using a > > random seed freshly generated at every run? > > > > (a quick grep shows that we're always using hardcoded seeds for > > murmurhash2 also in other cases too so this is not an issue, just a > > question) > > Yes, we should probably be using a unique arc4random() seed per run. > The SipHash24 API encourages this as well. > > We can fix this later. with something like this? :) written on the go and only lightly tested, but regress passes. It creates a fresh seed in got_pack_create and in got_deltify. I can move the arc4random call one level higher probably, but these functions seems to be called only once. I've also changed bloom.c to create a fresh seed upon bloom_init: the change is very small but this "vendored" library so I'm not sure to change it. I don't see further calls to murmurhash2 with an hardcoded hash. This will require some work in -portable thought. I've seen what openssh (IIRC) does to have a bundled arc4random and it's not straightforward (even if we may be able to just steal their arc4random compat.) I'm happy to help with that however! diff refs/heads/main refs/heads/randseeds blob - 05e57b1f7053a73112d4713be722a197307866e2 blob + a66c49edad2dad5800dbe4eb0deb37ed923834ac --- lib/bloom.c +++ lib/bloom.c @@ -77,7 +77,7 @@ static int bloom_check_add(struct bloom * bloom, } int hits = 0; - register unsigned int a = murmurhash2(buffer, len, 0x9747b28c); + register unsigned int a = murmurhash2(buffer, len, bloom->seed); register unsigned int b = murmurhash2(buffer, len, a); register unsigned int x; register unsigned int i; @@ -111,6 +111,8 @@ int bloom_init(struct bloom * bloom, int entries, doub { bloom->ready = 0; + bloom->seed = arc4random(); + if (entries < 1000 || error == 0) { return 1; } blob - 28356e3721b9f7902763322750d5a1b87640cd4d blob + 51c848e8470bd02ed14a2bc377a173e39836239c --- lib/bloom.h +++ lib/bloom.h @@ -52,6 +52,7 @@ struct bloom int bits; int bytes; int hashes; + uint32_t seed; // Fields below are private to the implementation. These may go away or // change incompatibly at any moment. Client code MUST NOT access or rely blob - b3bbf0b189575c109c8b935c3519ade298fed3aa blob + 526099720694041b29211dc3b8f4852c0e20d398 --- lib/deltify.c +++ lib/deltify.c @@ -88,9 +88,9 @@ static const uint32_t geartab[256] = { }; static uint32_t -hashblk(const unsigned char *p, off_t n) +hashblk(const unsigned char *p, off_t n, uint32_t seed) { - return murmurhash2(p, n, 0x1d7c5ac3); + return murmurhash2(p, n, seed); } static const struct got_error * @@ -237,7 +237,8 @@ addblk_mem(struct got_delta_table *dt, uint8_t *data, static const struct got_error * lookupblk(struct got_delta_block **block, struct got_delta_table *dt, - unsigned char *p, off_t len, FILE *basefile, off_t basefile_offset0) + unsigned char *p, off_t len, uint32_t seed, FILE *basefile, + off_t basefile_offset0) { int i; uint32_t h; @@ -246,7 +247,7 @@ lookupblk(struct got_delta_block **block, struct got_d *block = NULL; - h = hashblk(p, len); + h = hashblk(p, len, seed); for (i = h % dt->nalloc; dt->blocks[i].len != 0; i = (i + 1) % dt->nalloc) { if (dt->blocks[i].hash != h || @@ -268,7 +269,8 @@ lookupblk(struct got_delta_block **block, struct got_d static const struct got_error * lookupblk_mem(struct got_delta_block **block, struct got_delta_table *dt, - unsigned char *p, off_t len, uint8_t *basedata, off_t basefile_offset0) + unsigned char *p, off_t len, uint32_t seed, uint8_t *basedata, + off_t basefile_offset0) { int i; uint32_t h; @@ -276,7 +278,7 @@ lookupblk_mem(struct got_delta_block **block, struct g *block = NULL; - h = hashblk(p, len); + h = hashblk(p, len, seed); for (i = h % dt->nalloc; dt->blocks[i].len != 0; i = (i + 1) % dt->nalloc) { if (dt->blocks[i].hash != h || @@ -355,7 +357,10 @@ got_deltify_init(struct got_delta_table **dt, FILE *f, const struct got_error *err = NULL; uint32_t h; const off_t offset0 = fileoffset; + uint32_t seed; + seed = arc4random(); + *dt = calloc(1, sizeof(**dt)); if (*dt == NULL) return got_error_from_errno("calloc"); @@ -379,7 +384,7 @@ got_deltify_init(struct got_delta_table **dt, FILE *f, goto done; if (blocklen == 0) break; - h = hashblk(buf, blocklen); + h = hashblk(buf, blocklen, seed); err = addblk(*dt, f, offset0, blocklen, fileoffset - offset0, h); if (err) @@ -405,7 +410,10 @@ got_deltify_init_mem(struct got_delta_table **dt, uint const struct got_error *err = NULL; uint32_t h; const off_t offset0 = fileoffset; + uint32_t seed; + seed = arc4random(); + *dt = calloc(1, sizeof(**dt)); if (*dt == NULL) return got_error_from_errno("calloc"); @@ -425,7 +433,7 @@ got_deltify_init_mem(struct got_delta_table **dt, uint goto done; if (blocklen == 0) break; - h = hashblk(data + fileoffset, blocklen); + h = hashblk(data + fileoffset, blocklen, seed); err = addblk_mem(*dt, data, offset0, blocklen, fileoffset - offset0, h); if (err) @@ -629,10 +637,13 @@ got_deltify(struct got_delta_instruction **deltas, int const off_t offset0 = fileoffset; size_t nalloc = 0; const size_t alloc_chunk_size = 64; + uint32_t seed; *deltas = NULL; *ndeltas = 0; + seed = arc4random(); + /* * offset0 indicates where data to be deltified begins. * For example, we want to avoid deltifying a Git object header at @@ -663,7 +674,7 @@ got_deltify(struct got_delta_instruction **deltas, int } break; } - err = lookupblk(&block, dt, buf, blocklen, basefile, + err = lookupblk(&block, dt, buf, blocklen, seed, basefile, basefile_offset0); if (err) break; @@ -716,10 +727,13 @@ got_deltify_file_mem(struct got_delta_instruction **de const off_t offset0 = fileoffset; size_t nalloc = 0; const size_t alloc_chunk_size = 64; + uint32_t seed; *deltas = NULL; *ndeltas = 0; + seed = arc4random(); + /* * offset0 indicates where data to be deltified begins. * For example, we want to avoid deltifying a Git object header at @@ -750,7 +764,7 @@ got_deltify_file_mem(struct got_delta_instruction **de } break; } - err = lookupblk_mem(&block, dt, buf, blocklen, basedata, + err = lookupblk_mem(&block, dt, buf, blocklen, seed, basedata, basefile_offset0); if (err) break; @@ -803,10 +817,13 @@ got_deltify_mem_file(struct got_delta_instruction **de const off_t offset0 = fileoffset; size_t nalloc = 0; const size_t alloc_chunk_size = 64; + uint32_t seed; *deltas = NULL; *ndeltas = 0; + seed = arc4random(); + *deltas = reallocarray(NULL, alloc_chunk_size, sizeof(struct got_delta_instruction)); if (*deltas == NULL) @@ -828,7 +845,7 @@ got_deltify_mem_file(struct got_delta_instruction **de } break; } - err = lookupblk(&block, dt, data + fileoffset, blocklen, + err = lookupblk(&block, dt, data + fileoffset, blocklen, seed, basefile, basefile_offset0); if (err) break; @@ -878,10 +895,13 @@ got_deltify_mem_mem(struct got_delta_instruction **del const off_t offset0 = fileoffset; size_t nalloc = 0; const size_t alloc_chunk_size = 64; + uint32_t seed; *deltas = NULL; *ndeltas = 0; + seed = arc4random(); + *deltas = reallocarray(NULL, alloc_chunk_size, sizeof(struct got_delta_instruction)); if (*deltas == NULL) @@ -904,7 +924,7 @@ got_deltify_mem_mem(struct got_delta_instruction **del break; } err = lookupblk_mem(&block, dt, data + fileoffset, blocklen, - basedata, basefile_offset0); + seed, basedata, basefile_offset0); if (err) break; if (block != NULL) { blob - d748288cff18ac9179407bbf2ecb771ad11452c2 blob + f92a4d04144e6c8cbea41506b7f00f552a1942ad --- lib/pack_create.c +++ lib/pack_create.c @@ -51,6 +51,7 @@ #include "got_lib_object_cache.h" #include "got_lib_deflate.h" #include "got_lib_pack.h" +#include "got_lib_pack_create.h" #include "got_lib_privsep.h" #include "got_lib_repository.h" #include "got_lib_ratelimit.h" @@ -104,7 +105,7 @@ struct got_pack_metavec { static const struct got_error * alloc_meta(struct got_pack_meta **new, struct got_object_id *id, - const char *path, int obj_type, time_t mtime) + const char *path, int obj_type, time_t mtime, uint32_t seed) { struct got_pack_meta *m; @@ -116,7 +117,7 @@ alloc_meta(struct got_pack_meta **new, struct got_obje memcpy(&m->id, id, sizeof(m->id)); - m->path_hash = murmurhash2(path, strlen(path), 0xd70af26a); + m->path_hash = murmurhash2(path, strlen(path), seed); m->obj_type = obj_type; m->mtime = mtime; *new = m; @@ -856,8 +857,8 @@ search_packidx(int *found, struct got_object_id *id, static const struct got_error * add_object(int want_meta, struct got_object_idset *idset, struct got_object_id *id, const char *path, int obj_type, - time_t mtime, int loose_obj_only, struct got_repository *repo, - int *ncolored, int *nfound, int *ntrees, + time_t mtime, uint32_t seed, int loose_obj_only, + struct got_repository *repo, int *ncolored, int *nfound, int *ntrees, got_pack_progress_cb progress_cb, void *progress_arg, struct got_ratelimit *rl) { @@ -874,7 +875,7 @@ add_object(int want_meta, struct got_object_idset *ids } if (want_meta) { - err = alloc_meta(&m, id, path, obj_type, mtime); + err = alloc_meta(&m, id, path, obj_type, mtime, seed); if (err) return err; @@ -900,7 +901,7 @@ static const struct got_error * load_tree_entries(struct got_object_id_queue *ids, int want_meta, struct got_object_idset *idset, struct got_object_idset *idset_exclude, struct got_object_id *tree_id, - const char *dpath, time_t mtime, struct got_repository *repo, + const char *dpath, time_t mtime, uint32_t seed, struct got_repository *repo, int loose_obj_only, int *ncolored, int *nfound, int *ntrees, got_pack_progress_cb progress_cb, void *progress_arg, struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg) @@ -953,8 +954,8 @@ load_tree_entries(struct got_object_id_queue *ids, int } else if (S_ISREG(mode) || S_ISLNK(mode)) { err = add_object(want_meta, want_meta ? idset : idset_exclude, id, p, - GOT_OBJ_TYPE_BLOB, mtime, loose_obj_only, repo, - ncolored, nfound, ntrees, + GOT_OBJ_TYPE_BLOB, mtime, seed, loose_obj_only, + repo, ncolored, nfound, ntrees, progress_cb, progress_arg, rl); if (err) break; @@ -975,7 +976,7 @@ static const struct got_error * load_tree(int want_meta, struct got_object_idset *idset, struct got_object_idset *idset_exclude, struct got_object_id *tree_id, const char *dpath, time_t mtime, - struct got_repository *repo, int loose_obj_only, + uint32_t seed, struct got_repository *repo, int loose_obj_only, int *ncolored, int *nfound, int *ntrees, got_pack_progress_cb progress_cb, void *progress_arg, struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg) @@ -1022,7 +1023,7 @@ load_tree(int want_meta, struct got_object_idset *idse err = add_object(want_meta, want_meta ? idset : idset_exclude, &qid->id, path, GOT_OBJ_TYPE_TREE, - mtime, loose_obj_only, repo, + mtime, seed, loose_obj_only, repo, ncolored, nfound, ntrees, progress_cb, progress_arg, rl); if (err) { free(qid->data); @@ -1032,7 +1033,7 @@ load_tree(int want_meta, struct got_object_idset *idse err = load_tree_entries(&tree_ids, want_meta, idset, idset_exclude, &qid->id, - path, mtime, repo, loose_obj_only, ncolored, nfound, + path, mtime, seed, repo, loose_obj_only, ncolored, nfound, ntrees, progress_cb, progress_arg, rl, cancel_cb, cancel_arg); free(qid->data); @@ -1050,8 +1051,8 @@ load_tree(int want_meta, struct got_object_idset *idse static const struct got_error * load_commit(int want_meta, struct got_object_idset *idset, struct got_object_idset *idset_exclude, - struct got_object_id *id, struct got_repository *repo, int loose_obj_only, - int *ncolored, int *nfound, int *ntrees, + struct got_object_id *id, struct got_repository *repo, uint32_t seed, + int loose_obj_only, int *ncolored, int *nfound, int *ntrees, got_pack_progress_cb progress_cb, void *progress_arg, struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg) { @@ -1077,7 +1078,7 @@ load_commit(int want_meta, struct got_object_idset *id err = add_object(want_meta, want_meta ? idset : idset_exclude, id, "", GOT_OBJ_TYPE_COMMIT, - got_object_commit_get_committer_time(commit), + got_object_commit_get_committer_time(commit), seed, loose_obj_only, repo, ncolored, nfound, ntrees, progress_cb, progress_arg, rl); if (err) @@ -1085,7 +1086,7 @@ load_commit(int want_meta, struct got_object_idset *id err = load_tree(want_meta, idset, idset_exclude, got_object_commit_get_tree_id(commit), - "", got_object_commit_get_committer_time(commit), + "", got_object_commit_get_committer_time(commit), seed, repo, loose_obj_only, ncolored, nfound, ntrees, progress_cb, progress_arg, rl, cancel_cb, cancel_arg); done: @@ -1096,8 +1097,8 @@ done: static const struct got_error * load_tag(int want_meta, struct got_object_idset *idset, struct got_object_idset *idset_exclude, - struct got_object_id *id, struct got_repository *repo, int loose_obj_only, - int *ncolored, int *nfound, int *ntrees, + struct got_object_id *id, struct got_repository *repo, uint32_t seed, + int loose_obj_only, int *ncolored, int *nfound, int *ntrees, got_pack_progress_cb progress_cb, void *progress_arg, struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg) { @@ -1123,7 +1124,7 @@ load_tag(int want_meta, struct got_object_idset *idset err = add_object(want_meta, want_meta ? idset : idset_exclude, id, "", GOT_OBJ_TYPE_TAG, - got_object_tag_get_tagger_time(tag), loose_obj_only, repo, + got_object_tag_get_tagger_time(tag), seed, loose_obj_only, repo, ncolored, nfound, ntrees, progress_cb, progress_arg, rl); if (err) goto done; @@ -1131,16 +1132,16 @@ load_tag(int want_meta, struct got_object_idset *idset switch (got_object_tag_get_object_type(tag)) { case GOT_OBJ_TYPE_COMMIT: err = load_commit(want_meta, idset, idset_exclude, - got_object_tag_get_object_id(tag), repo, loose_obj_only, - ncolored, nfound, ntrees, progress_cb, progress_arg, rl, - cancel_cb, cancel_arg); + got_object_tag_get_object_id(tag), repo, seed, + loose_obj_only, ncolored, nfound, ntrees, + progress_cb, progress_arg, rl, cancel_cb, cancel_arg); break; case GOT_OBJ_TYPE_TREE: err = load_tree(want_meta, idset, idset_exclude, got_object_tag_get_object_id(tag), "", - got_object_tag_get_tagger_time(tag), repo, loose_obj_only, - ncolored, nfound, ntrees, progress_cb, progress_arg, rl, - cancel_cb, cancel_arg); + got_object_tag_get_tagger_time(tag), seed, repo, + loose_obj_only, ncolored, nfound, ntrees, + progress_cb, progress_arg, rl, cancel_cb, cancel_arg); break; default: break; @@ -1451,8 +1452,9 @@ static const struct got_error * load_object_ids(int *ncolored, int *nfound, int *ntrees, struct got_object_idset *idset, struct got_object_id **theirs, int ntheirs, struct got_object_id **ours, int nours, struct got_repository *repo, - int loose_obj_only, got_pack_progress_cb progress_cb, void *progress_arg, - struct got_ratelimit *rl, got_cancel_cb cancel_cb, void *cancel_arg) + uint32_t seed, int loose_obj_only, got_pack_progress_cb progress_cb, + void *progress_arg, struct got_ratelimit *rl, got_cancel_cb cancel_cb, + void *cancel_arg) { const struct got_error *err = NULL; struct got_object_id **ids = NULL; @@ -1481,14 +1483,14 @@ load_object_ids(int *ncolored, int *nfound, int *ntree return err; if (obj_type == GOT_OBJ_TYPE_COMMIT) { err = load_commit(0, idset, idset_exclude, id, repo, - loose_obj_only, ncolored, nfound, ntrees, + seed, loose_obj_only, ncolored, nfound, ntrees, progress_cb, progress_arg, rl, cancel_cb, cancel_arg); if (err) goto done; } else if (obj_type == GOT_OBJ_TYPE_TAG) { err = load_tag(0, idset, idset_exclude, id, repo, - loose_obj_only, ncolored, nfound, ntrees, + seed, loose_obj_only, ncolored, nfound, ntrees, progress_cb, progress_arg, rl, cancel_cb, cancel_arg); if (err) @@ -1497,8 +1499,8 @@ load_object_ids(int *ncolored, int *nfound, int *ntree } for (i = 0; i < nobj; i++) { - err = load_commit(1, idset, idset_exclude, - ids[i], repo, loose_obj_only, ncolored, nfound, ntrees, + err = load_commit(1, idset, idset_exclude, ids[i], repo, + seed, loose_obj_only, ncolored, nfound, ntrees, progress_cb, progress_arg, rl, cancel_cb, cancel_arg); if (err) goto done; @@ -1519,7 +1521,7 @@ load_object_ids(int *ncolored, int *nfound, int *ntree if (obj_type != GOT_OBJ_TYPE_TAG) continue; err = load_tag(1, idset, idset_exclude, id, repo, - loose_obj_only, ncolored, nfound, ntrees, + seed, loose_obj_only, ncolored, nfound, ntrees, progress_cb, progress_arg, rl, cancel_cb, cancel_arg); if (err) goto done; @@ -1943,7 +1945,10 @@ got_pack_create(uint8_t *packsha1, FILE *packfile, struct got_pack_metavec deltify, reuse; int ncolored = 0, nfound = 0, ntrees = 0; size_t ndeltify; + uint32_t seed; + seed = arc4random(); + memset(&deltify, 0, sizeof(deltify)); memset(&reuse, 0, sizeof(reuse)); @@ -1954,7 +1959,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile, return got_error_from_errno("got_object_idset_alloc"); err = load_object_ids(&ncolored, &nfound, &ntrees, idset, theirs, - ntheirs, ours, nours, repo, loose_obj_only, + ntheirs, ours, nours, repo, seed, loose_obj_only, progress_cb, progress_arg, &rl, cancel_cb, cancel_arg); if (err) goto done;
fix path meta-data used for packing