From: Stefan Sperling Subject: reimplement object ID-set on top of a hash table To: gameoftrees@openbsd.org Date: Fri, 15 Apr 2022 16:32:33 +0200 Implement the object ID-set on top of a chaining hash table instead of an RB tree. This provides a slight speed advantage when packing a lot of objects: test case: 216541 commits colored; 2048137 objects found; 994231 trees scanned RB tree: 3m59.31s real 3m44.61s user 0m42.55s system hash table: 3m11.87s real 2m58.43s user 0m43.60s system jrick suggested siphash as the hash distribution function. Initially I tried using 4 byte positions of the object ID array, but discovered that murmurhash has a much better random distribution which results in shorter chains. siphash seems to have similar performance to murmurhash and doesn't suffer from collision attacks: https://en.wikipedia.org/wiki/MurmurHash#Vulnerabilities The idset_add() operation no longer checks whether the element is already present in the set. All callers are already performing this check before adding elements, and it seems bad to do it twice. Lookups are now O(1) if the element is missing, so letting callers check for existence before insertion should be fine. The hash table can grow until it fails to allocate more hash buckets. In theory we could allow the table to shrink, but so far I could not get this working without a significant slowdown. When the number of buckets changes all elements have to be copied over to a new table, which is very expensive. ----------------------------------------------- commit 2670093859bd13a721a818c4d8b2f0140a8dfc93 (idset-hash) from: Stefan Sperling date: Fri Apr 15 14:21:05 2022 UTC reimplement object-ID set data structure on top of a hash table Siphash suggested by jrick as a better alternative to murmurhash for this use case. diff 1b01a31979d2daafab1f160fcc263c8b7062a7f3 ba40aee1bfa885072a040a81da73cb49b5d40e08 blob - 5346f7f2ce1559b966b7c6e2f1bde6da40d729a4 blob + 750be4962021434fb27d2dcc95867ff2e333a776 --- lib/got_lib_object_idset.h +++ lib/got_lib_object_idset.h @@ -31,13 +31,3 @@ const struct got_error *got_object_idset_for_each(stru const struct got_error *(*cb)(struct got_object_id *, void *, void *), void *); int got_object_idset_num_elements(struct got_object_idset *); - -struct got_object_idset_element; -struct got_object_idset_element *got_object_idset_get_element( - struct got_object_idset *, struct got_object_id *); -void *got_object_idset_get_element_data(struct got_object_idset_element *); -const struct got_error *got_object_idset_for_each_element(struct got_object_idset *, - const struct got_error *(*cb)(struct got_object_idset_element *, void *), void *); -void got_object_idset_remove_element(struct got_object_idset *, - struct got_object_idset_element *); - blob - 06f7347e22beeabe835c116ac99d59e61f87ea68 blob + d188d887deea72878f96122fb9c3544c4b9da7de --- lib/object_idset.c +++ lib/object_idset.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2019 Stefan Sperling + * Copyright (c) 2022 Stefan Sperling * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -15,15 +15,17 @@ */ #include -#include #include +#include #include #include #include #include #include #include +#include +#include #include "got_object.h" #include "got_error.h" @@ -32,116 +34,190 @@ #include "got_lib_inflate.h" #include "got_lib_object.h" #include "got_lib_object_idset.h" +#include "got_lib_object_parse.h" -struct got_object_idset_element { - RB_ENTRY(got_object_idset_element) entry; - struct got_object_id id; - void *data; /* API user data */ -}; +#define GOT_OBJECT_IDSET_MIN_BUCKETS 64 -RB_HEAD(got_object_idset_tree, got_object_idset_element); - -static int -cmp_elements(const struct got_object_idset_element *e1, - const struct got_object_idset_element *e2) -{ - return got_object_id_cmp(&e1->id, &e2->id); -} - -RB_PROTOTYPE(got_object_idset_tree, got_object_idset_element, entry, - cmp_elements); - struct got_object_idset { - struct got_object_idset_tree entries; - int totelem; -#define GOT_OBJECT_IDSET_MAX_ELEM INT_MAX + struct got_object_id_queue *ids; + size_t nbuckets; + unsigned int totelem; + unsigned int flags; +#define GOT_OBJECT_IDSET_F_TRAVERSAL 0x01 +#define GOT_OBJECT_IDSET_F_NOMEM 0x02 + SIPHASH_KEY key; }; struct got_object_idset * got_object_idset_alloc(void) { struct got_object_idset *set; + int i; set = malloc(sizeof(*set)); if (set == NULL) return NULL; - RB_INIT(&set->entries); + set->ids = calloc(sizeof(set->ids[0]), GOT_OBJECT_IDSET_MIN_BUCKETS); + if (set->ids == NULL) { + free(set); + return NULL; + } + for (i = 0; i < GOT_OBJECT_IDSET_MIN_BUCKETS; i++) + STAILQ_INIT(&set->ids[i]); + set->totelem = 0; - + set->nbuckets = GOT_OBJECT_IDSET_MIN_BUCKETS; + set->flags = 0; + arc4random_buf(&set->key, sizeof(set->key)); return set; } void got_object_idset_free(struct got_object_idset *set) { - struct got_object_idset_element *entry; + size_t i; + struct got_object_qid *qid; - while ((entry = RB_MIN(got_object_idset_tree, &set->entries))) { - RB_REMOVE(got_object_idset_tree, &set->entries, entry); - /* User data should be freed by caller. */ - free(entry); + for (i = 0; i < set->nbuckets; i++) { + while (!STAILQ_EMPTY(&set->ids[i])) { + qid = STAILQ_FIRST(&set->ids[i]); + STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry); + got_object_qid_free(qid); + } } - + /* User data should be freed by caller. */ + free(set->ids); free(set); } +static uint64_t +idset_hash(struct got_object_idset *set, struct got_object_id *id) +{ + return SipHash24(&set->key, id->sha1, sizeof(id->sha1)); +} + +static const struct got_error * +idset_resize(struct got_object_idset *set, size_t nbuckets) +{ + struct got_object_id_queue *ids; + unsigned int i; + + ids = calloc(nbuckets, sizeof(ids[0])); + if (ids == NULL) { + if (errno != ENOMEM) + return got_error_from_errno("calloc"); + /* Proceed with our current amount of hash buckets. */ + set->flags |= GOT_OBJECT_IDSET_F_NOMEM; + return NULL; + } + + for (i = 0; i < nbuckets; i++) + STAILQ_INIT(&ids[i]); + + arc4random_buf(&set->key, sizeof(set->key)); + + for (i = 0; i < set->nbuckets; i++) { + while (!STAILQ_EMPTY(&set->ids[i])) { + struct got_object_qid *qid; + uint64_t idx; + qid = STAILQ_FIRST(&set->ids[i]); + STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry); + idx = idset_hash(set, qid->id) % nbuckets; + STAILQ_INSERT_HEAD(&ids[idx], qid, entry); + } + } + + free(set->ids); + set->ids = ids; + set->nbuckets = nbuckets; + return NULL; +} + +static const struct got_error * +idset_grow(struct got_object_idset *set) +{ + size_t nbuckets; + + if (set->flags & GOT_OBJECT_IDSET_F_NOMEM) + return NULL; + + if (set->nbuckets >= UINT_MAX / 2) + nbuckets = UINT_MAX; + else + nbuckets = set->nbuckets * 2; + + return idset_resize(set, nbuckets); +} + const struct got_error * got_object_idset_add(struct got_object_idset *set, struct got_object_id *id, void *data) { - struct got_object_idset_element *new; + const struct got_error *err; + struct got_object_qid *qid; + uint64_t idx; + struct got_object_id_queue *head; - if (set->totelem >= GOT_OBJECT_IDSET_MAX_ELEM) + /* This function may resize the set. */ + if (set->flags & GOT_OBJECT_IDSET_F_TRAVERSAL) + return got_error_msg(GOT_ERR_NOT_IMPL, + "cannot add elements to idset during traversal"); + + if (set->totelem == UINT_MAX) return got_error(GOT_ERR_NO_SPACE); + + err = got_object_qid_alloc_partial(&qid); + if (err) + return err; + memcpy(qid->id, id, sizeof(*qid->id)); + qid->data = data; - new = malloc(sizeof(*new)); - if (new == NULL) - return got_error_from_errno("malloc"); - - memcpy(&new->id, id, sizeof(new->id)); - new->data = data; - - if (RB_INSERT(got_object_idset_tree, &set->entries, new) != NULL) { - free(new); - return got_error(GOT_ERR_OBJ_EXISTS); - } - + idx = idset_hash(set, id) % set->nbuckets; + head = &set->ids[idx]; + STAILQ_INSERT_HEAD(head, qid, entry); set->totelem++; + + if (set->nbuckets < set->totelem) { + err = idset_grow(set); + if (err) + return err; + idx = idset_hash(set, id) % set->nbuckets; + head = &set->ids[idx]; + } + return NULL; } -static struct got_object_idset_element * +static struct got_object_qid * find_element(struct got_object_idset *set, struct got_object_id *id) { - struct got_object_idset_element *entry; + uint64_t idx = idset_hash(set, id) % set->nbuckets; + struct got_object_id_queue *head = &set->ids[idx]; + struct got_object_qid *qid; - entry = RB_ROOT(&set->entries); - while (entry) { - int cmp = got_object_id_cmp(id, &entry->id); - if (cmp < 0) - entry = RB_LEFT(entry, entry); - else if (cmp > 0) - entry = RB_RIGHT(entry, entry); - else - break; + STAILQ_FOREACH(qid, head, entry) { + if (got_object_id_cmp(qid->id, id) == 0) + return qid; } - return entry; + return NULL; } void * got_object_idset_get(struct got_object_idset *set, struct got_object_id *id) { - struct got_object_idset_element *entry = find_element(set, id); - return entry ? entry->data : NULL; + struct got_object_qid *qid = find_element(set, id); + return qid ? qid->data : NULL; } const struct got_error * got_object_idset_remove(void **data, struct got_object_idset *set, struct got_object_id *id) { - struct got_object_idset_element *entry; + uint64_t idx; + struct got_object_id_queue *head; + struct got_object_qid *qid; if (data) *data = NULL; @@ -150,20 +226,30 @@ got_object_idset_remove(void **data, struct got_object return got_error(GOT_ERR_NO_OBJ); if (id == NULL) { - entry = RB_ROOT(&set->entries); - if (entry == NULL) - return got_error(GOT_ERR_NO_OBJ); + /* Remove a "random" element. */ + for (idx = 0; idx < set->nbuckets; idx++) { + head = &set->ids[idx]; + qid = STAILQ_FIRST(head); + if (qid) + break; + } } else { - entry = find_element(set, id); - if (entry == NULL) + idx = idset_hash(set, id) % set->nbuckets; + head = &set->ids[idx]; + STAILQ_FOREACH(qid, head, entry) { + if (got_object_id_cmp(qid->id, id) == 0) + break; + } + if (qid == NULL) return got_error_no_obj(id); } - RB_REMOVE(got_object_idset_tree, &set->entries, entry); if (data) - *data = entry->data; - free(entry); + *data = qid->data; + STAILQ_REMOVE(head, qid, got_object_qid, entry); + got_object_qid_free(qid); set->totelem--; + return NULL; } @@ -171,8 +257,8 @@ int got_object_idset_contains(struct got_object_idset *set, struct got_object_id *id) { - struct got_object_idset_element *entry = find_element(set, id); - return entry ? 1 : 0; + struct got_object_qid *qid = find_element(set, id); + return qid ? 1 : 0; } const struct got_error * @@ -180,15 +266,22 @@ got_object_idset_for_each(struct got_object_idset *set const struct got_error *(*cb)(struct got_object_id *, void *, void *), void *arg) { - const struct got_error *err; - struct got_object_idset_element *entry, *tmp; + const struct got_error *err = NULL; + struct got_object_id_queue *head; + struct got_object_qid *qid, *tmp; + size_t i; - RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) { - err = (*cb)(&entry->id, entry->data, arg); - if (err) - return err; + set->flags |= GOT_OBJECT_IDSET_F_TRAVERSAL; + for (i = 0; i < set->nbuckets; i++) { head = &set->ids[i]; + STAILQ_FOREACH_SAFE(qid, head, entry, tmp) { + err = (*cb)(qid->id, qid->data, arg); + if (err) + goto done; + } } - return NULL; +done: + set->flags &= ~GOT_OBJECT_IDSET_F_TRAVERSAL; + return err; } int @@ -196,43 +289,3 @@ got_object_idset_num_elements(struct got_object_idset { return set->totelem; } - -struct got_object_idset_element * -got_object_idset_get_element(struct got_object_idset *set, struct got_object_id *id) -{ - return find_element(set, id); -} - -void * -got_object_idset_get_element_data(struct got_object_idset_element *entry) -{ - return entry->data; -} - -const struct got_error * -got_object_idset_for_each_element(struct got_object_idset *set, - const struct got_error *(*cb)(struct got_object_idset_element *, void *), - void *arg) -{ - const struct got_error *err; - struct got_object_idset_element *entry, *tmp; - - RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) { - err = (*cb)(entry, arg); - if (err) - return err; - } - return NULL; -} - -void -got_object_idset_remove_element(struct got_object_idset *set, - struct got_object_idset_element *entry) -{ - RB_REMOVE(got_object_idset_tree, &set->entries, entry); - free(entry); - set->totelem--; -} - -RB_GENERATE(got_object_idset_tree, got_object_idset_element, entry, - cmp_elements); blob - fa7ea77d02fc1d6284318e12d67fc4c0742fde70 blob + 81518d57e3cdb40c14904ec873d086182fea8b4c --- lib/pack_create.c +++ lib/pack_create.c @@ -1777,25 +1777,24 @@ done: } static const struct got_error * -remove_unused_object(struct got_object_idset_element *entry, void *arg) +remove_unused_object(struct got_object_id *id, void *data, void *arg) { struct got_object_idset *idset = arg; - if (got_object_idset_get_element_data(entry) == NULL) - got_object_idset_remove_element(idset, entry); + if (data == NULL) + got_object_idset_remove(NULL, idset, id); return NULL; } static const struct got_error * -remove_reused_object(struct got_object_idset_element *entry, void *arg) +remove_reused_object(struct got_object_id *id, void *data, void *arg) { struct got_object_idset *idset = arg; - struct got_pack_meta *m; + struct got_pack_meta *m = data; - m = got_object_idset_get_element_data(entry); if (m->have_reused_delta) - got_object_idset_remove_element(idset, entry); + got_object_idset_remove(NULL, idset, id); return NULL; } @@ -1840,8 +1839,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile, if (err) return err; - err = got_object_idset_for_each_element(idset, - remove_unused_object, idset); + err = got_object_idset_for_each(idset, remove_unused_object, idset); if (err) goto done; @@ -1877,7 +1875,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile, if (err) goto done; if (reuse.nmeta > 0) { - err = got_object_idset_for_each_element(idset, + err = got_object_idset_for_each(idset, remove_reused_object, idset); if (err) goto done;