Download raw body.
reimplement object ID-set on top of a hash table
Stefan Sperling <stsp@stsp.name> wrote:
> Implement the object ID-set on top of a chaining hash table instead
> of an RB tree.
>
> This provides a slight speed advantage when packing a lot of objects:
>
> test case: 216541 commits colored; 2048137 objects found; 994231 trees scanned
> RB tree: 3m59.31s real 3m44.61s user 0m42.55s system
> hash table: 3m11.87s real 2m58.43s user 0m43.60s system
>
> jrick suggested siphash as the hash distribution function.
> Initially I tried using 4 byte positions of the object ID array, but discovered
> that murmurhash has a much better random distribution which results in shorter
> chains. siphash seems to have similar performance to murmurhash and doesn't
> suffer from collision attacks:
> https://en.wikipedia.org/wiki/MurmurHash#Vulnerabilities
>
> The idset_add() operation no longer checks whether the element is already
> present in the set. All callers are already performing this check before
> adding elements, and it seems bad to do it twice. Lookups are now O(1) if
> the element is missing, so letting callers check for existence before
> insertion should be fine.
>
> The hash table can grow until it fails to allocate more hash buckets.
> In theory we could allow the table to shrink, but so far I could not
> get this working without a significant slowdown. When the number of
> buckets changes all elements have to be copied over to a new table,
> which is very expensive.
it reads fine to me, just some comments inline:
> -----------------------------------------------
> commit 2670093859bd13a721a818c4d8b2f0140a8dfc93 (idset-hash)
> from: Stefan Sperling <stsp@stsp.name>
> date: Fri Apr 15 14:21:05 2022 UTC
>
> reimplement object-ID set data structure on top of a hash table
>
> Siphash suggested by jrick as a better alternative to murmurhash
> for this use case.
>
> diff 1b01a31979d2daafab1f160fcc263c8b7062a7f3 ba40aee1bfa885072a040a81da73cb49b5d40e08
> blob - 5346f7f2ce1559b966b7c6e2f1bde6da40d729a4
> blob + 750be4962021434fb27d2dcc95867ff2e333a776
> --- lib/got_lib_object_idset.h
> +++ lib/got_lib_object_idset.h
> @@ -31,13 +31,3 @@ const struct got_error *got_object_idset_for_each(stru
> const struct got_error *(*cb)(struct got_object_id *, void *, void *),
> void *);
> int got_object_idset_num_elements(struct got_object_idset *);
> -
> -struct got_object_idset_element;
> -struct got_object_idset_element *got_object_idset_get_element(
> - struct got_object_idset *, struct got_object_id *);
> -void *got_object_idset_get_element_data(struct got_object_idset_element *);
> -const struct got_error *got_object_idset_for_each_element(struct got_object_idset *,
> - const struct got_error *(*cb)(struct got_object_idset_element *, void *), void *);
> -void got_object_idset_remove_element(struct got_object_idset *,
> - struct got_object_idset_element *);
> -
> blob - 06f7347e22beeabe835c116ac99d59e61f87ea68
> blob + d188d887deea72878f96122fb9c3544c4b9da7de
> --- lib/object_idset.c
> +++ lib/object_idset.c
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2018, 2019 Stefan Sperling <stsp@openbsd.org>
> + * Copyright (c) 2022 Stefan Sperling <stsp@openbsd.org>
> *
> * Permission to use, copy, modify, and distribute this software for any
> * purpose with or without fee is hereby granted, provided that the above
> @@ -15,15 +15,17 @@
> */
>
> #include <sys/queue.h>
> -#include <sys/tree.h>
>
> #include <stdlib.h>
> +#include <stdint.h>
> #include <string.h>
> #include <sha1.h>
> #include <stdio.h>
> #include <zlib.h>
> #include <limits.h>
> #include <time.h>
> +#include <errno.h>
> +#include <siphash.h>
>
> #include "got_object.h"
> #include "got_error.h"
> @@ -32,116 +34,190 @@
> #include "got_lib_inflate.h"
> #include "got_lib_object.h"
> #include "got_lib_object_idset.h"
> +#include "got_lib_object_parse.h"
>
> -struct got_object_idset_element {
> - RB_ENTRY(got_object_idset_element) entry;
> - struct got_object_id id;
> - void *data; /* API user data */
> -};
> +#define GOT_OBJECT_IDSET_MIN_BUCKETS 64
>
> -RB_HEAD(got_object_idset_tree, got_object_idset_element);
> -
> -static int
> -cmp_elements(const struct got_object_idset_element *e1,
> - const struct got_object_idset_element *e2)
> -{
> - return got_object_id_cmp(&e1->id, &e2->id);
> -}
> -
> -RB_PROTOTYPE(got_object_idset_tree, got_object_idset_element, entry,
> - cmp_elements);
> -
> struct got_object_idset {
> - struct got_object_idset_tree entries;
> - int totelem;
> -#define GOT_OBJECT_IDSET_MAX_ELEM INT_MAX
> + struct got_object_id_queue *ids;
> + size_t nbuckets;
> + unsigned int totelem;
> + unsigned int flags;
> +#define GOT_OBJECT_IDSET_F_TRAVERSAL 0x01
> +#define GOT_OBJECT_IDSET_F_NOMEM 0x02
> + SIPHASH_KEY key;
> };
>
> struct got_object_idset *
> got_object_idset_alloc(void)
> {
> struct got_object_idset *set;
> + int i;
>
> set = malloc(sizeof(*set));
> if (set == NULL)
> return NULL;
>
> - RB_INIT(&set->entries);
> + set->ids = calloc(sizeof(set->ids[0]), GOT_OBJECT_IDSET_MIN_BUCKETS);
> + if (set->ids == NULL) {
> + free(set);
> + return NULL;
> + }
> + for (i = 0; i < GOT_OBJECT_IDSET_MIN_BUCKETS; i++)
> + STAILQ_INIT(&set->ids[i]);
> +
> set->totelem = 0;
> -
> + set->nbuckets = GOT_OBJECT_IDSET_MIN_BUCKETS;
> + set->flags = 0;
> + arc4random_buf(&set->key, sizeof(set->key));
> return set;
> }
>
> void
> got_object_idset_free(struct got_object_idset *set)
> {
> - struct got_object_idset_element *entry;
> + size_t i;
> + struct got_object_qid *qid;
>
> - while ((entry = RB_MIN(got_object_idset_tree, &set->entries))) {
> - RB_REMOVE(got_object_idset_tree, &set->entries, entry);
> - /* User data should be freed by caller. */
> - free(entry);
> + for (i = 0; i < set->nbuckets; i++) {
> + while (!STAILQ_EMPTY(&set->ids[i])) {
> + qid = STAILQ_FIRST(&set->ids[i]);
> + STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry);
> + got_object_qid_free(qid);
> + }
> }
> -
> + /* User data should be freed by caller. */
> + free(set->ids);
> free(set);
> }
>
> +static uint64_t
> +idset_hash(struct got_object_idset *set, struct got_object_id *id)
> +{
> + return SipHash24(&set->key, id->sha1, sizeof(id->sha1));
> +}
> +
> +static const struct got_error *
> +idset_resize(struct got_object_idset *set, size_t nbuckets)
> +{
> + struct got_object_id_queue *ids;
> + unsigned int i;
> +
> + ids = calloc(nbuckets, sizeof(ids[0]));
> + if (ids == NULL) {
> + if (errno != ENOMEM)
> + return got_error_from_errno("calloc");
> + /* Proceed with our current amount of hash buckets. */
> + set->flags |= GOT_OBJECT_IDSET_F_NOMEM;
> + return NULL;
> + }
> +
> + for (i = 0; i < nbuckets; i++)
> + STAILQ_INIT(&ids[i]);
> +
> + arc4random_buf(&set->key, sizeof(set->key));
> +
> + for (i = 0; i < set->nbuckets; i++) {
got_object_idset_add enforces that we don't end up having more than
UINT_MAX elements, so this is safe. However, I think it'd read better
to declare i as size_t (and maybe even `totelem' in the struct) and
avoid these comparisons between different sizes.
(btw, got_object_idset_for_each uses `size_t i' to iterate)
> + while (!STAILQ_EMPTY(&set->ids[i])) {
> + struct got_object_qid *qid;
> + uint64_t idx;
> + qid = STAILQ_FIRST(&set->ids[i]);
> + STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry);
> + idx = idset_hash(set, qid->id) % nbuckets;
> + STAILQ_INSERT_HEAD(&ids[idx], qid, entry);
> + }
> + }
> +
> + free(set->ids);
> + set->ids = ids;
> + set->nbuckets = nbuckets;
> + return NULL;
> +}
> +
> +static const struct got_error *
> +idset_grow(struct got_object_idset *set)
> +{
> + size_t nbuckets;
> +
> + if (set->flags & GOT_OBJECT_IDSET_F_NOMEM)
> + return NULL;
> +
> + if (set->nbuckets >= UINT_MAX / 2)
> + nbuckets = UINT_MAX;
> + else
> + nbuckets = set->nbuckets * 2;
> +
> + return idset_resize(set, nbuckets);
> +}
> +
> const struct got_error *
> got_object_idset_add(struct got_object_idset *set, struct got_object_id *id,
> void *data)
> {
> - struct got_object_idset_element *new;
> + const struct got_error *err;
> + struct got_object_qid *qid;
> + uint64_t idx;
> + struct got_object_id_queue *head;
>
> - if (set->totelem >= GOT_OBJECT_IDSET_MAX_ELEM)
> + /* This function may resize the set. */
> + if (set->flags & GOT_OBJECT_IDSET_F_TRAVERSAL)
> + return got_error_msg(GOT_ERR_NOT_IMPL,
> + "cannot add elements to idset during traversal");
> +
> + if (set->totelem == UINT_MAX)
> return got_error(GOT_ERR_NO_SPACE);
> +
> + err = got_object_qid_alloc_partial(&qid);
> + if (err)
> + return err;
> + memcpy(qid->id, id, sizeof(*qid->id));
> + qid->data = data;
>
> - new = malloc(sizeof(*new));
> - if (new == NULL)
> - return got_error_from_errno("malloc");
> -
> - memcpy(&new->id, id, sizeof(new->id));
> - new->data = data;
> -
> - if (RB_INSERT(got_object_idset_tree, &set->entries, new) != NULL) {
> - free(new);
> - return got_error(GOT_ERR_OBJ_EXISTS);
> - }
> -
> + idx = idset_hash(set, id) % set->nbuckets;
> + head = &set->ids[idx];
> + STAILQ_INSERT_HEAD(head, qid, entry);
> set->totelem++;
> +
> + if (set->nbuckets < set->totelem) {
> + err = idset_grow(set);
> + if (err)
> + return err;
these two are junk from previous drafts?
> + idx = idset_hash(set, id) % set->nbuckets;
> + head = &set->ids[idx];
> + }
> +
> return NULL;
> }
>
> -static struct got_object_idset_element *
> +static struct got_object_qid *
> find_element(struct got_object_idset *set, struct got_object_id *id)
> {
> - struct got_object_idset_element *entry;
> + uint64_t idx = idset_hash(set, id) % set->nbuckets;
> + struct got_object_id_queue *head = &set->ids[idx];
> + struct got_object_qid *qid;
>
> - entry = RB_ROOT(&set->entries);
> - while (entry) {
> - int cmp = got_object_id_cmp(id, &entry->id);
> - if (cmp < 0)
> - entry = RB_LEFT(entry, entry);
> - else if (cmp > 0)
> - entry = RB_RIGHT(entry, entry);
> - else
> - break;
> + STAILQ_FOREACH(qid, head, entry) {
> + if (got_object_id_cmp(qid->id, id) == 0)
> + return qid;
> }
>
> - return entry;
> + return NULL;
> }
>
> void *
> got_object_idset_get(struct got_object_idset *set, struct got_object_id *id)
> {
> - struct got_object_idset_element *entry = find_element(set, id);
> - return entry ? entry->data : NULL;
> + struct got_object_qid *qid = find_element(set, id);
> + return qid ? qid->data : NULL;
> }
>
> const struct got_error *
> got_object_idset_remove(void **data, struct got_object_idset *set,
> struct got_object_id *id)
> {
> - struct got_object_idset_element *entry;
> + uint64_t idx;
> + struct got_object_id_queue *head;
> + struct got_object_qid *qid;
>
> if (data)
> *data = NULL;
> @@ -150,20 +226,30 @@ got_object_idset_remove(void **data, struct got_object
> return got_error(GOT_ERR_NO_OBJ);
>
> if (id == NULL) {
> - entry = RB_ROOT(&set->entries);
> - if (entry == NULL)
> - return got_error(GOT_ERR_NO_OBJ);
> + /* Remove a "random" element. */
> + for (idx = 0; idx < set->nbuckets; idx++) {
> + head = &set->ids[idx];
> + qid = STAILQ_FIRST(head);
> + if (qid)
> + break;
> + }
> } else {
> - entry = find_element(set, id);
> - if (entry == NULL)
> + idx = idset_hash(set, id) % set->nbuckets;
> + head = &set->ids[idx];
> + STAILQ_FOREACH(qid, head, entry) {
> + if (got_object_id_cmp(qid->id, id) == 0)
> + break;
> + }
> + if (qid == NULL)
> return got_error_no_obj(id);
> }
>
> - RB_REMOVE(got_object_idset_tree, &set->entries, entry);
> if (data)
> - *data = entry->data;
> - free(entry);
> + *data = qid->data;
> + STAILQ_REMOVE(head, qid, got_object_qid, entry);
> + got_object_qid_free(qid);
> set->totelem--;
> +
> return NULL;
> }
>
> @@ -171,8 +257,8 @@ int
> got_object_idset_contains(struct got_object_idset *set,
> struct got_object_id *id)
> {
> - struct got_object_idset_element *entry = find_element(set, id);
> - return entry ? 1 : 0;
> + struct got_object_qid *qid = find_element(set, id);
> + return qid ? 1 : 0;
> }
>
> const struct got_error *
> @@ -180,15 +266,22 @@ got_object_idset_for_each(struct got_object_idset *set
> const struct got_error *(*cb)(struct got_object_id *, void *, void *),
> void *arg)
> {
> - const struct got_error *err;
> - struct got_object_idset_element *entry, *tmp;
> + const struct got_error *err = NULL;
> + struct got_object_id_queue *head;
> + struct got_object_qid *qid, *tmp;
> + size_t i;
>
> - RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) {
> - err = (*cb)(&entry->id, entry->data, arg);
> - if (err)
> - return err;
> + set->flags |= GOT_OBJECT_IDSET_F_TRAVERSAL;
missing a newline after the {
> + for (i = 0; i < set->nbuckets; i++) { head = &set->ids[i];
> + STAILQ_FOREACH_SAFE(qid, head, entry, tmp) {
> + err = (*cb)(qid->id, qid->data, arg);
> + if (err)
> + goto done;
> + }
> }
> - return NULL;
> +done:
> + set->flags &= ~GOT_OBJECT_IDSET_F_TRAVERSAL;
> + return err;
> }
>
> int
> @@ -196,43 +289,3 @@ got_object_idset_num_elements(struct got_object_idset
> {
> return set->totelem;
> }
> -
> -struct got_object_idset_element *
> -got_object_idset_get_element(struct got_object_idset *set, struct got_object_id *id)
> -{
> - return find_element(set, id);
> -}
> -
> -void *
> -got_object_idset_get_element_data(struct got_object_idset_element *entry)
> -{
> - return entry->data;
> -}
> -
> -const struct got_error *
> -got_object_idset_for_each_element(struct got_object_idset *set,
> - const struct got_error *(*cb)(struct got_object_idset_element *, void *),
> - void *arg)
> -{
> - const struct got_error *err;
> - struct got_object_idset_element *entry, *tmp;
> -
> - RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) {
> - err = (*cb)(entry, arg);
> - if (err)
> - return err;
> - }
> - return NULL;
> -}
> -
> -void
> -got_object_idset_remove_element(struct got_object_idset *set,
> - struct got_object_idset_element *entry)
> -{
> - RB_REMOVE(got_object_idset_tree, &set->entries, entry);
> - free(entry);
> - set->totelem--;
> -}
> -
> -RB_GENERATE(got_object_idset_tree, got_object_idset_element, entry,
> - cmp_elements);
> blob - fa7ea77d02fc1d6284318e12d67fc4c0742fde70
> blob + 81518d57e3cdb40c14904ec873d086182fea8b4c
> --- lib/pack_create.c
> +++ lib/pack_create.c
> @@ -1777,25 +1777,24 @@ done:
> }
>
> static const struct got_error *
> -remove_unused_object(struct got_object_idset_element *entry, void *arg)
> +remove_unused_object(struct got_object_id *id, void *data, void *arg)
> {
> struct got_object_idset *idset = arg;
>
> - if (got_object_idset_get_element_data(entry) == NULL)
> - got_object_idset_remove_element(idset, entry);
> + if (data == NULL)
> + got_object_idset_remove(NULL, idset, id);
>
> return NULL;
> }
>
> static const struct got_error *
> -remove_reused_object(struct got_object_idset_element *entry, void *arg)
> +remove_reused_object(struct got_object_id *id, void *data, void *arg)
> {
> struct got_object_idset *idset = arg;
> - struct got_pack_meta *m;
> + struct got_pack_meta *m = data;
>
> - m = got_object_idset_get_element_data(entry);
> if (m->have_reused_delta)
> - got_object_idset_remove_element(idset, entry);
> + got_object_idset_remove(NULL, idset, id);
>
> return NULL;
> }
> @@ -1840,8 +1839,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile,
> if (err)
> return err;
>
> - err = got_object_idset_for_each_element(idset,
> - remove_unused_object, idset);
> + err = got_object_idset_for_each(idset, remove_unused_object, idset);
> if (err)
> goto done;
>
> @@ -1877,7 +1875,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile,
> if (err)
> goto done;
> if (reuse.nmeta > 0) {
> - err = got_object_idset_for_each_element(idset,
> + err = got_object_idset_for_each(idset,
> remove_reused_object, idset);
> if (err)
> goto done;
reimplement object ID-set on top of a hash table