reimplement object ID-set on top of a hash table

Stefan Sperling <stsp@stsp.name> wrote: > Implement the object ID-set on top of a chaining hash table instead > of an RB tree. > > This provides a slight speed advantage when packing a lot of objects: > > test case: 216541 commits colored; 2048137 objects found; 994231 trees scanned > RB tree: 3m59.31s real 3m44.61s user 0m42.55s system > hash table: 3m11.87s real 2m58.43s user 0m43.60s system > > jrick suggested siphash as the hash distribution function. > Initially I tried using 4 byte positions of the object ID array, but discovered > that murmurhash has a much better random distribution which results in shorter > chains. siphash seems to have similar performance to murmurhash and doesn't > suffer from collision attacks: > https://en.wikipedia.org/wiki/MurmurHash#Vulnerabilities > > The idset_add() operation no longer checks whether the element is already > present in the set. All callers are already performing this check before > adding elements, and it seems bad to do it twice. Lookups are now O(1) if > the element is missing, so letting callers check for existence before > insertion should be fine. > > The hash table can grow until it fails to allocate more hash buckets. > In theory we could allow the table to shrink, but so far I could not > get this working without a significant slowdown. When the number of > buckets changes all elements have to be copied over to a new table, > which is very expensive. it reads fine to me, just some comments inline: > ----------------------------------------------- > commit 2670093859bd13a721a818c4d8b2f0140a8dfc93 (idset-hash) > from: Stefan Sperling <stsp@stsp.name> > date: Fri Apr 15 14:21:05 2022 UTC > > reimplement object-ID set data structure on top of a hash table > > Siphash suggested by jrick as a better alternative to murmurhash > for this use case. > > diff 1b01a31979d2daafab1f160fcc263c8b7062a7f3 ba40aee1bfa885072a040a81da73cb49b5d40e08 > blob - 5346f7f2ce1559b966b7c6e2f1bde6da40d729a4 > blob + 750be4962021434fb27d2dcc95867ff2e333a776 > --- lib/got_lib_object_idset.h > +++ lib/got_lib_object_idset.h > @@ -31,13 +31,3 @@ const struct got_error *got_object_idset_for_each(stru > const struct got_error *(*cb)(struct got_object_id *, void *, void *), > void *); > int got_object_idset_num_elements(struct got_object_idset *); > - > -struct got_object_idset_element; > -struct got_object_idset_element *got_object_idset_get_element( > - struct got_object_idset *, struct got_object_id *); > -void *got_object_idset_get_element_data(struct got_object_idset_element *); > -const struct got_error *got_object_idset_for_each_element(struct got_object_idset *, > - const struct got_error *(*cb)(struct got_object_idset_element *, void *), void *); > -void got_object_idset_remove_element(struct got_object_idset *, > - struct got_object_idset_element *); > - > blob - 06f7347e22beeabe835c116ac99d59e61f87ea68 > blob + d188d887deea72878f96122fb9c3544c4b9da7de > --- lib/object_idset.c > +++ lib/object_idset.c > @@ -1,5 +1,5 @@ > /* > - * Copyright (c) 2018, 2019 Stefan Sperling <stsp@openbsd.org> > + * Copyright (c) 2022 Stefan Sperling <stsp@openbsd.org> > * > * Permission to use, copy, modify, and distribute this software for any > * purpose with or without fee is hereby granted, provided that the above > @@ -15,15 +15,17 @@ > */ > > #include <sys/queue.h> > -#include <sys/tree.h> > > #include <stdlib.h> > +#include <stdint.h> > #include <string.h> > #include <sha1.h> > #include <stdio.h> > #include <zlib.h> > #include <limits.h> > #include <time.h> > +#include <errno.h> > +#include <siphash.h> > > #include "got_object.h" > #include "got_error.h" > @@ -32,116 +34,190 @@ > #include "got_lib_inflate.h" > #include "got_lib_object.h" > #include "got_lib_object_idset.h" > +#include "got_lib_object_parse.h" > > -struct got_object_idset_element { > - RB_ENTRY(got_object_idset_element) entry; > - struct got_object_id id; > - void *data; /* API user data */ > -}; > +#define GOT_OBJECT_IDSET_MIN_BUCKETS 64 > > -RB_HEAD(got_object_idset_tree, got_object_idset_element); > - > -static int > -cmp_elements(const struct got_object_idset_element *e1, > - const struct got_object_idset_element *e2) > -{ > - return got_object_id_cmp(&e1->id, &e2->id); > -} > - > -RB_PROTOTYPE(got_object_idset_tree, got_object_idset_element, entry, > - cmp_elements); > - > struct got_object_idset { > - struct got_object_idset_tree entries; > - int totelem; > -#define GOT_OBJECT_IDSET_MAX_ELEM INT_MAX > + struct got_object_id_queue *ids; > + size_t nbuckets; > + unsigned int totelem; > + unsigned int flags; > +#define GOT_OBJECT_IDSET_F_TRAVERSAL 0x01 > +#define GOT_OBJECT_IDSET_F_NOMEM 0x02 > + SIPHASH_KEY key; > }; > > struct got_object_idset * > got_object_idset_alloc(void) > { > struct got_object_idset *set; > + int i; > > set = malloc(sizeof(*set)); > if (set == NULL) > return NULL; > > - RB_INIT(&set->entries); > + set->ids = calloc(sizeof(set->ids[0]), GOT_OBJECT_IDSET_MIN_BUCKETS); > + if (set->ids == NULL) { > + free(set); > + return NULL; > + } > + for (i = 0; i < GOT_OBJECT_IDSET_MIN_BUCKETS; i++) > + STAILQ_INIT(&set->ids[i]); > + > set->totelem = 0; > - > + set->nbuckets = GOT_OBJECT_IDSET_MIN_BUCKETS; > + set->flags = 0; > + arc4random_buf(&set->key, sizeof(set->key)); > return set; > } > > void > got_object_idset_free(struct got_object_idset *set) > { > - struct got_object_idset_element *entry; > + size_t i; > + struct got_object_qid *qid; > > - while ((entry = RB_MIN(got_object_idset_tree, &set->entries))) { > - RB_REMOVE(got_object_idset_tree, &set->entries, entry); > - /* User data should be freed by caller. */ > - free(entry); > + for (i = 0; i < set->nbuckets; i++) { > + while (!STAILQ_EMPTY(&set->ids[i])) { > + qid = STAILQ_FIRST(&set->ids[i]); > + STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry); > + got_object_qid_free(qid); > + } > } > - > + /* User data should be freed by caller. */ > + free(set->ids); > free(set); > } > > +static uint64_t > +idset_hash(struct got_object_idset *set, struct got_object_id *id) > +{ > + return SipHash24(&set->key, id->sha1, sizeof(id->sha1)); > +} > + > +static const struct got_error * > +idset_resize(struct got_object_idset *set, size_t nbuckets) > +{ > + struct got_object_id_queue *ids; > + unsigned int i; > + > + ids = calloc(nbuckets, sizeof(ids[0])); > + if (ids == NULL) { > + if (errno != ENOMEM) > + return got_error_from_errno("calloc"); > + /* Proceed with our current amount of hash buckets. */ > + set->flags |= GOT_OBJECT_IDSET_F_NOMEM; > + return NULL; > + } > + > + for (i = 0; i < nbuckets; i++) > + STAILQ_INIT(&ids[i]); > + > + arc4random_buf(&set->key, sizeof(set->key)); > + > + for (i = 0; i < set->nbuckets; i++) { got_object_idset_add enforces that we don't end up having more than UINT_MAX elements, so this is safe. However, I think it'd read better to declare i as size_t (and maybe even `totelem' in the struct) and avoid these comparisons between different sizes. (btw, got_object_idset_for_each uses `size_t i' to iterate) > + while (!STAILQ_EMPTY(&set->ids[i])) { > + struct got_object_qid *qid; > + uint64_t idx; > + qid = STAILQ_FIRST(&set->ids[i]); > + STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry); > + idx = idset_hash(set, qid->id) % nbuckets; > + STAILQ_INSERT_HEAD(&ids[idx], qid, entry); > + } > + } > + > + free(set->ids); > + set->ids = ids; > + set->nbuckets = nbuckets; > + return NULL; > +} > + > +static const struct got_error * > +idset_grow(struct got_object_idset *set) > +{ > + size_t nbuckets; > + > + if (set->flags & GOT_OBJECT_IDSET_F_NOMEM) > + return NULL; > + > + if (set->nbuckets >= UINT_MAX / 2) > + nbuckets = UINT_MAX; > + else > + nbuckets = set->nbuckets * 2; > + > + return idset_resize(set, nbuckets); > +} > + > const struct got_error * > got_object_idset_add(struct got_object_idset *set, struct got_object_id *id, > void *data) > { > - struct got_object_idset_element *new; > + const struct got_error *err; > + struct got_object_qid *qid; > + uint64_t idx; > + struct got_object_id_queue *head; > > - if (set->totelem >= GOT_OBJECT_IDSET_MAX_ELEM) > + /* This function may resize the set. */ > + if (set->flags & GOT_OBJECT_IDSET_F_TRAVERSAL) > + return got_error_msg(GOT_ERR_NOT_IMPL, > + "cannot add elements to idset during traversal"); > + > + if (set->totelem == UINT_MAX) > return got_error(GOT_ERR_NO_SPACE); > + > + err = got_object_qid_alloc_partial(&qid); > + if (err) > + return err; > + memcpy(qid->id, id, sizeof(*qid->id)); > + qid->data = data; > > - new = malloc(sizeof(*new)); > - if (new == NULL) > - return got_error_from_errno("malloc"); > - > - memcpy(&new->id, id, sizeof(new->id)); > - new->data = data; > - > - if (RB_INSERT(got_object_idset_tree, &set->entries, new) != NULL) { > - free(new); > - return got_error(GOT_ERR_OBJ_EXISTS); > - } > - > + idx = idset_hash(set, id) % set->nbuckets; > + head = &set->ids[idx]; > + STAILQ_INSERT_HEAD(head, qid, entry); > set->totelem++; > + > + if (set->nbuckets < set->totelem) { > + err = idset_grow(set); > + if (err) > + return err; these two are junk from previous drafts? > + idx = idset_hash(set, id) % set->nbuckets; > + head = &set->ids[idx]; > + } > + > return NULL; > } > > -static struct got_object_idset_element * > +static struct got_object_qid * > find_element(struct got_object_idset *set, struct got_object_id *id) > { > - struct got_object_idset_element *entry; > + uint64_t idx = idset_hash(set, id) % set->nbuckets; > + struct got_object_id_queue *head = &set->ids[idx]; > + struct got_object_qid *qid; > > - entry = RB_ROOT(&set->entries); > - while (entry) { > - int cmp = got_object_id_cmp(id, &entry->id); > - if (cmp < 0) > - entry = RB_LEFT(entry, entry); > - else if (cmp > 0) > - entry = RB_RIGHT(entry, entry); > - else > - break; > + STAILQ_FOREACH(qid, head, entry) { > + if (got_object_id_cmp(qid->id, id) == 0) > + return qid; > } > > - return entry; > + return NULL; > } > > void * > got_object_idset_get(struct got_object_idset *set, struct got_object_id *id) > { > - struct got_object_idset_element *entry = find_element(set, id); > - return entry ? entry->data : NULL; > + struct got_object_qid *qid = find_element(set, id); > + return qid ? qid->data : NULL; > } > > const struct got_error * > got_object_idset_remove(void **data, struct got_object_idset *set, > struct got_object_id *id) > { > - struct got_object_idset_element *entry; > + uint64_t idx; > + struct got_object_id_queue *head; > + struct got_object_qid *qid; > > if (data) > *data = NULL; > @@ -150,20 +226,30 @@ got_object_idset_remove(void **data, struct got_object > return got_error(GOT_ERR_NO_OBJ); > > if (id == NULL) { > - entry = RB_ROOT(&set->entries); > - if (entry == NULL) > - return got_error(GOT_ERR_NO_OBJ); > + /* Remove a "random" element. */ > + for (idx = 0; idx < set->nbuckets; idx++) { > + head = &set->ids[idx]; > + qid = STAILQ_FIRST(head); > + if (qid) > + break; > + } > } else { > - entry = find_element(set, id); > - if (entry == NULL) > + idx = idset_hash(set, id) % set->nbuckets; > + head = &set->ids[idx]; > + STAILQ_FOREACH(qid, head, entry) { > + if (got_object_id_cmp(qid->id, id) == 0) > + break; > + } > + if (qid == NULL) > return got_error_no_obj(id); > } > > - RB_REMOVE(got_object_idset_tree, &set->entries, entry); > if (data) > - *data = entry->data; > - free(entry); > + *data = qid->data; > + STAILQ_REMOVE(head, qid, got_object_qid, entry); > + got_object_qid_free(qid); > set->totelem--; > + > return NULL; > } > > @@ -171,8 +257,8 @@ int > got_object_idset_contains(struct got_object_idset *set, > struct got_object_id *id) > { > - struct got_object_idset_element *entry = find_element(set, id); > - return entry ? 1 : 0; > + struct got_object_qid *qid = find_element(set, id); > + return qid ? 1 : 0; > } > > const struct got_error * > @@ -180,15 +266,22 @@ got_object_idset_for_each(struct got_object_idset *set > const struct got_error *(*cb)(struct got_object_id *, void *, void *), > void *arg) > { > - const struct got_error *err; > - struct got_object_idset_element *entry, *tmp; > + const struct got_error *err = NULL; > + struct got_object_id_queue *head; > + struct got_object_qid *qid, *tmp; > + size_t i; > > - RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) { > - err = (*cb)(&entry->id, entry->data, arg); > - if (err) > - return err; > + set->flags |= GOT_OBJECT_IDSET_F_TRAVERSAL; missing a newline after the { > + for (i = 0; i < set->nbuckets; i++) { head = &set->ids[i]; > + STAILQ_FOREACH_SAFE(qid, head, entry, tmp) { > + err = (*cb)(qid->id, qid->data, arg); > + if (err) > + goto done; > + } > } > - return NULL; > +done: > + set->flags &= ~GOT_OBJECT_IDSET_F_TRAVERSAL; > + return err; > } > > int > @@ -196,43 +289,3 @@ got_object_idset_num_elements(struct got_object_idset > { > return set->totelem; > } > - > -struct got_object_idset_element * > -got_object_idset_get_element(struct got_object_idset *set, struct got_object_id *id) > -{ > - return find_element(set, id); > -} > - > -void * > -got_object_idset_get_element_data(struct got_object_idset_element *entry) > -{ > - return entry->data; > -} > - > -const struct got_error * > -got_object_idset_for_each_element(struct got_object_idset *set, > - const struct got_error *(*cb)(struct got_object_idset_element *, void *), > - void *arg) > -{ > - const struct got_error *err; > - struct got_object_idset_element *entry, *tmp; > - > - RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) { > - err = (*cb)(entry, arg); > - if (err) > - return err; > - } > - return NULL; > -} > - > -void > -got_object_idset_remove_element(struct got_object_idset *set, > - struct got_object_idset_element *entry) > -{ > - RB_REMOVE(got_object_idset_tree, &set->entries, entry); > - free(entry); > - set->totelem--; > -} > - > -RB_GENERATE(got_object_idset_tree, got_object_idset_element, entry, > - cmp_elements); > blob - fa7ea77d02fc1d6284318e12d67fc4c0742fde70 > blob + 81518d57e3cdb40c14904ec873d086182fea8b4c > --- lib/pack_create.c > +++ lib/pack_create.c > @@ -1777,25 +1777,24 @@ done: > } > > static const struct got_error * > -remove_unused_object(struct got_object_idset_element *entry, void *arg) > +remove_unused_object(struct got_object_id *id, void *data, void *arg) > { > struct got_object_idset *idset = arg; > > - if (got_object_idset_get_element_data(entry) == NULL) > - got_object_idset_remove_element(idset, entry); > + if (data == NULL) > + got_object_idset_remove(NULL, idset, id); > > return NULL; > } > > static const struct got_error * > -remove_reused_object(struct got_object_idset_element *entry, void *arg) > +remove_reused_object(struct got_object_id *id, void *data, void *arg) > { > struct got_object_idset *idset = arg; > - struct got_pack_meta *m; > + struct got_pack_meta *m = data; > > - m = got_object_idset_get_element_data(entry); > if (m->have_reused_delta) > - got_object_idset_remove_element(idset, entry); > + got_object_idset_remove(NULL, idset, id); > > return NULL; > } > @@ -1840,8 +1839,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile, > if (err) > return err; > > - err = got_object_idset_for_each_element(idset, > - remove_unused_object, idset); > + err = got_object_idset_for_each(idset, remove_unused_object, idset); > if (err) > goto done; > > @@ -1877,7 +1875,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile, > if (err) > goto done; > if (reuse.nmeta > 0) { > - err = got_object_idset_for_each_element(idset, > + err = got_object_idset_for_each(idset, > remove_reused_object, idset); > if (err) > goto done;

2022-04-15 14:32 Stefan Sperling:
reimplement object ID-set on top of a hash table
- 2022-04-19 12:04 Omar Polo:
  reimplement object ID-set on top of a hash table
- - 2022-04-19 20:09 Stefan Sperling:
    reimplement object ID-set on top of a hash table