"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Omar Polo <op@omarpolo.com>
Subject:
Re: reimplement object ID-set on top of a hash table
To:
Stefan Sperling <stsp@stsp.name>
Cc:
gameoftrees@openbsd.org
Date:
Tue, 19 Apr 2022 14:04:18 +0200

Download raw body.

Thread
Stefan Sperling <stsp@stsp.name> wrote:
> Implement the object ID-set on top of a chaining hash table instead
> of an RB tree.
> 
> This provides a slight speed advantage when packing a lot of objects:
> 
> test case: 216541 commits colored; 2048137 objects found; 994231 trees scanned
> RB tree: 3m59.31s real     3m44.61s user     0m42.55s system
> hash table: 3m11.87s real     2m58.43s user     0m43.60s system
> 
> jrick suggested siphash as the hash distribution function.
> Initially I tried using 4 byte positions of the object ID array, but discovered
> that murmurhash has a much better random distribution which results in shorter
> chains. siphash seems to have similar performance to murmurhash and doesn't
> suffer from collision attacks:
> https://en.wikipedia.org/wiki/MurmurHash#Vulnerabilities
> 
> The idset_add() operation no longer checks whether the element is already
> present in the set. All callers are already performing this check before
> adding elements, and it seems bad to do it twice. Lookups are now O(1) if
> the element is missing, so letting callers check for existence before
> insertion should be fine.
> 
> The hash table can grow until it fails to allocate more hash buckets.
> In theory we could allow the table to shrink, but so far I could not
> get this working without a significant slowdown. When the number of
> buckets changes all elements have to be copied over to a new table,
> which is very expensive.

it reads fine to me, just some comments inline:

> -----------------------------------------------
> commit 2670093859bd13a721a818c4d8b2f0140a8dfc93 (idset-hash)
> from: Stefan Sperling <stsp@stsp.name>
> date: Fri Apr 15 14:21:05 2022 UTC
>  
>  reimplement object-ID set data structure on top of a hash table
>  
>  Siphash suggested by jrick as a better alternative to murmurhash
>  for this use case.
>  
> diff 1b01a31979d2daafab1f160fcc263c8b7062a7f3 ba40aee1bfa885072a040a81da73cb49b5d40e08
> blob - 5346f7f2ce1559b966b7c6e2f1bde6da40d729a4
> blob + 750be4962021434fb27d2dcc95867ff2e333a776
> --- lib/got_lib_object_idset.h
> +++ lib/got_lib_object_idset.h
> @@ -31,13 +31,3 @@ const struct got_error *got_object_idset_for_each(stru
>      const struct got_error *(*cb)(struct got_object_id *, void *, void *),
>      void *);
>  int got_object_idset_num_elements(struct got_object_idset *);
> -
> -struct got_object_idset_element;
> -struct got_object_idset_element *got_object_idset_get_element(
> -    struct got_object_idset *, struct got_object_id *);
> -void *got_object_idset_get_element_data(struct got_object_idset_element *);
> -const struct got_error *got_object_idset_for_each_element(struct got_object_idset *,
> -    const struct got_error *(*cb)(struct got_object_idset_element *, void *), void *);
> -void got_object_idset_remove_element(struct got_object_idset *,
> -    struct got_object_idset_element *);
> -
> blob - 06f7347e22beeabe835c116ac99d59e61f87ea68
> blob + d188d887deea72878f96122fb9c3544c4b9da7de
> --- lib/object_idset.c
> +++ lib/object_idset.c
> @@ -1,5 +1,5 @@
>  /*
> - * Copyright (c) 2018, 2019 Stefan Sperling <stsp@openbsd.org>
> + * Copyright (c) 2022 Stefan Sperling <stsp@openbsd.org>
>   *
>   * Permission to use, copy, modify, and distribute this software for any
>   * purpose with or without fee is hereby granted, provided that the above
> @@ -15,15 +15,17 @@
>   */
>  
>  #include <sys/queue.h>
> -#include <sys/tree.h>
>  
>  #include <stdlib.h>
> +#include <stdint.h>
>  #include <string.h>
>  #include <sha1.h>
>  #include <stdio.h>
>  #include <zlib.h>
>  #include <limits.h>
>  #include <time.h>
> +#include <errno.h>
> +#include <siphash.h>
>  
>  #include "got_object.h"
>  #include "got_error.h"
> @@ -32,116 +34,190 @@
>  #include "got_lib_inflate.h"
>  #include "got_lib_object.h"
>  #include "got_lib_object_idset.h"
> +#include "got_lib_object_parse.h"
>  
> -struct got_object_idset_element {
> -	RB_ENTRY(got_object_idset_element)	entry;
> -	struct got_object_id id;
> -	void *data;	/* API user data */
> -};
> +#define GOT_OBJECT_IDSET_MIN_BUCKETS	64
>  
> -RB_HEAD(got_object_idset_tree, got_object_idset_element);
> -
> -static int
> -cmp_elements(const struct got_object_idset_element *e1,
> -    const struct got_object_idset_element *e2)
> -{
> -	return got_object_id_cmp(&e1->id, &e2->id);
> -}
> -
> -RB_PROTOTYPE(got_object_idset_tree, got_object_idset_element, entry,
> -    cmp_elements);
> -
>  struct got_object_idset {
> -	struct got_object_idset_tree entries;
> -	int totelem;
> -#define GOT_OBJECT_IDSET_MAX_ELEM INT_MAX
> +	struct got_object_id_queue *ids;
> +	size_t nbuckets;
> +	unsigned int totelem;
> +	unsigned int flags;
> +#define GOT_OBJECT_IDSET_F_TRAVERSAL	0x01
> +#define GOT_OBJECT_IDSET_F_NOMEM	0x02
> +	SIPHASH_KEY key;
>  };
>  
>  struct got_object_idset *
>  got_object_idset_alloc(void)
>  {
>  	struct got_object_idset *set;
> +	int i;
>  
>  	set = malloc(sizeof(*set));
>  	if (set == NULL)
>  		return NULL;
>  
> -	RB_INIT(&set->entries);
> +	set->ids = calloc(sizeof(set->ids[0]), GOT_OBJECT_IDSET_MIN_BUCKETS);
> +	if (set->ids == NULL) {
> +		free(set);
> +		return NULL;
> +	}
> +	for (i = 0; i < GOT_OBJECT_IDSET_MIN_BUCKETS; i++)
> +		STAILQ_INIT(&set->ids[i]);
> +
>  	set->totelem = 0;
> -
> +	set->nbuckets = GOT_OBJECT_IDSET_MIN_BUCKETS;
> +	set->flags = 0;
> +	arc4random_buf(&set->key, sizeof(set->key));
>  	return set;
>  }
>  
>  void
>  got_object_idset_free(struct got_object_idset *set)
>  {
> -	struct got_object_idset_element *entry;
> +	size_t i;
> +	struct got_object_qid *qid;
>  
> -	while ((entry = RB_MIN(got_object_idset_tree, &set->entries))) {
> -		RB_REMOVE(got_object_idset_tree, &set->entries, entry);
> -		/* User data should be freed by caller. */
> -		free(entry);
> +	for (i = 0; i < set->nbuckets; i++) {
> +		while (!STAILQ_EMPTY(&set->ids[i])) {
> +			qid = STAILQ_FIRST(&set->ids[i]);
> +			STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry);
> +			got_object_qid_free(qid);
> +		}
>  	}
> -
> +	/* User data should be freed by caller. */
> +	free(set->ids);
>  	free(set);
>  }
>  
> +static uint64_t
> +idset_hash(struct got_object_idset *set, struct got_object_id *id)
> +{
> +	return SipHash24(&set->key, id->sha1, sizeof(id->sha1));
> +}
> +
> +static const struct got_error *
> +idset_resize(struct got_object_idset *set, size_t nbuckets)
> +{
> +	struct got_object_id_queue *ids;
> +	unsigned int i;
> +
> +	ids = calloc(nbuckets, sizeof(ids[0]));
> +	if (ids == NULL) {
> +		if (errno != ENOMEM)
> +			return got_error_from_errno("calloc");
> +		/* Proceed with our current amount of hash buckets. */
> +		set->flags |= GOT_OBJECT_IDSET_F_NOMEM;
> +		return NULL;
> +	}
> +
> +	for (i = 0; i < nbuckets; i++)
> +		STAILQ_INIT(&ids[i]);
> +
> +	arc4random_buf(&set->key, sizeof(set->key));
> +
> +	for (i = 0; i < set->nbuckets; i++) {

got_object_idset_add enforces that we don't end up having more than
UINT_MAX elements, so this is safe.  However, I think it'd read better
to declare i as size_t (and maybe even `totelem' in the struct) and
avoid these comparisons between different sizes.

(btw, got_object_idset_for_each uses `size_t i' to iterate)

> +		while (!STAILQ_EMPTY(&set->ids[i])) {
> +			struct got_object_qid *qid;
> +			uint64_t idx;
> +			qid = STAILQ_FIRST(&set->ids[i]);
> +			STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry);
> +			idx = idset_hash(set, qid->id) % nbuckets;
> +			STAILQ_INSERT_HEAD(&ids[idx], qid, entry);
> +		}
> +	}
> +
> +	free(set->ids);
> +	set->ids = ids;
> +	set->nbuckets = nbuckets;
> +	return NULL;
> +}
> +
> +static const struct got_error *
> +idset_grow(struct got_object_idset *set)
> +{
> +	size_t nbuckets;
> +
> +	if (set->flags & GOT_OBJECT_IDSET_F_NOMEM)
> +		return NULL;
> +
> +	if (set->nbuckets >= UINT_MAX / 2)
> +		nbuckets = UINT_MAX;
> +	else
> +		nbuckets = set->nbuckets * 2;
> +
> +	return idset_resize(set, nbuckets);
> +}
> +
>  const struct got_error *
>  got_object_idset_add(struct got_object_idset *set, struct got_object_id *id,
>      void *data)
>  {
> -	struct got_object_idset_element *new;
> +	const struct got_error *err;
> +	struct got_object_qid *qid;
> +	uint64_t idx;
> +	struct got_object_id_queue *head;
>  
> -	if (set->totelem >= GOT_OBJECT_IDSET_MAX_ELEM)
> +	/* This function may resize the set. */
> +	if (set->flags & GOT_OBJECT_IDSET_F_TRAVERSAL)
> +		return got_error_msg(GOT_ERR_NOT_IMPL,
> +		    "cannot add elements to idset during traversal");
> +
> +	if (set->totelem == UINT_MAX)
>  		return got_error(GOT_ERR_NO_SPACE);
> +		
> +	err = got_object_qid_alloc_partial(&qid);
> +	if (err)
> +		return err;
> +	memcpy(qid->id, id, sizeof(*qid->id));
> +	qid->data = data;
>  
> -	new = malloc(sizeof(*new));
> -	if (new == NULL)
> -		return got_error_from_errno("malloc");
> -
> -	memcpy(&new->id, id, sizeof(new->id));
> -	new->data = data;
> -
> -	if (RB_INSERT(got_object_idset_tree, &set->entries, new) != NULL) {
> -		free(new);
> -		return got_error(GOT_ERR_OBJ_EXISTS);
> -	}
> -
> +	idx = idset_hash(set, id) % set->nbuckets;
> +	head = &set->ids[idx];
> +	STAILQ_INSERT_HEAD(head, qid, entry);
>  	set->totelem++;
> +
> +	if (set->nbuckets < set->totelem) {
> +		err = idset_grow(set);
> +		if (err)
> +			return err;

these two are junk from previous drafts?

> +		idx = idset_hash(set, id) % set->nbuckets;
> +		head = &set->ids[idx];
> +	}
> +
>  	return NULL;
>  }
>  
> -static struct got_object_idset_element *
> +static struct got_object_qid *
>  find_element(struct got_object_idset *set, struct got_object_id *id)
>  {
> -	struct got_object_idset_element *entry;
> +	uint64_t idx = idset_hash(set, id) % set->nbuckets;
> +	struct got_object_id_queue *head = &set->ids[idx];
> +	struct got_object_qid *qid;
>  
> -	entry = RB_ROOT(&set->entries);
> -	while (entry) {
> -		int cmp = got_object_id_cmp(id, &entry->id);
> -		if (cmp < 0)
> -			entry = RB_LEFT(entry, entry);
> -		else if (cmp > 0)
> -			entry = RB_RIGHT(entry, entry);
> -		else
> -			break;
> +	STAILQ_FOREACH(qid, head, entry) {
> +		if (got_object_id_cmp(qid->id, id) == 0)
> +			return qid;
>  	}
>  
> -	return entry;
> +	return NULL;
>  }
>  
>  void *
>  got_object_idset_get(struct got_object_idset *set, struct got_object_id *id)
>  {
> -	struct got_object_idset_element *entry = find_element(set, id);
> -	return entry ? entry->data : NULL;
> +	struct got_object_qid *qid = find_element(set, id);
> +	return qid ? qid->data : NULL;
>  }
>  
>  const struct got_error *
>  got_object_idset_remove(void **data, struct got_object_idset *set,
>      struct got_object_id *id)
>  {
> -	struct got_object_idset_element *entry;
> +	uint64_t idx;
> +	struct got_object_id_queue *head;
> +	struct got_object_qid *qid;
>  
>  	if (data)
>  		*data = NULL;
> @@ -150,20 +226,30 @@ got_object_idset_remove(void **data, struct got_object
>  		return got_error(GOT_ERR_NO_OBJ);
>  
>  	if (id == NULL) {
> -		entry = RB_ROOT(&set->entries);
> -		if (entry == NULL)
> -			return got_error(GOT_ERR_NO_OBJ);
> +		/* Remove a "random" element. */
> +		for (idx = 0; idx < set->nbuckets; idx++) {
> +			head = &set->ids[idx];
> +			qid = STAILQ_FIRST(head);
> +			if (qid)
> +				break;
> +		}
>  	} else {
> -		entry = find_element(set, id);
> -		if (entry == NULL)
> +		idx = idset_hash(set, id) % set->nbuckets;
> +		head = &set->ids[idx];
> +		STAILQ_FOREACH(qid, head, entry) {
> +			if (got_object_id_cmp(qid->id, id) == 0)
> +				break;
> +		}
> +		if (qid == NULL)
>  			return got_error_no_obj(id);
>  	}
>  
> -	RB_REMOVE(got_object_idset_tree, &set->entries, entry);
>  	if (data)
> -		*data = entry->data;
> -	free(entry);
> +		*data = qid->data;
> +	STAILQ_REMOVE(head, qid, got_object_qid, entry);
> +	got_object_qid_free(qid);
>  	set->totelem--;
> +
>  	return NULL;
>  }
>  
> @@ -171,8 +257,8 @@ int
>  got_object_idset_contains(struct got_object_idset *set,
>      struct got_object_id *id)
>  {
> -	struct got_object_idset_element *entry = find_element(set, id);
> -	return entry ? 1 : 0;
> +	struct got_object_qid *qid = find_element(set, id);
> +	return qid ? 1 : 0;
>  }
>  
>  const struct got_error *
> @@ -180,15 +266,22 @@ got_object_idset_for_each(struct got_object_idset *set
>      const struct got_error *(*cb)(struct got_object_id *, void *, void *),
>      void *arg)
>  {
> -	const struct got_error *err;
> -	struct got_object_idset_element *entry, *tmp;
> +	const struct got_error *err = NULL;
> +	struct got_object_id_queue *head;
> +	struct got_object_qid *qid, *tmp;
> +	size_t i;
>  
> -	RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) {
> -		err = (*cb)(&entry->id, entry->data, arg);
> -		if (err)
> -			return err;
> +	set->flags |= GOT_OBJECT_IDSET_F_TRAVERSAL;

missing a newline after the {

> +	for (i = 0; i < set->nbuckets; i++) { head = &set->ids[i];
> +		STAILQ_FOREACH_SAFE(qid, head, entry, tmp) {
> +			err = (*cb)(qid->id, qid->data, arg);
> +			if (err)
> +				goto done;
> +		}
>  	}
> -	return NULL;
> +done:
> +	set->flags &= ~GOT_OBJECT_IDSET_F_TRAVERSAL;
> +	return err;
>  }
>  
>  int
> @@ -196,43 +289,3 @@ got_object_idset_num_elements(struct got_object_idset 
>  {
>  	return set->totelem;
>  }
> -
> -struct got_object_idset_element *
> -got_object_idset_get_element(struct got_object_idset *set, struct got_object_id *id)
> -{
> -	return find_element(set, id);
> -}
> -
> -void *
> -got_object_idset_get_element_data(struct got_object_idset_element *entry)
> -{
> -	return entry->data;
> -}
> -
> -const struct got_error *
> -got_object_idset_for_each_element(struct got_object_idset *set,
> -    const struct got_error *(*cb)(struct got_object_idset_element *, void *),
> -    void *arg)
> -{
> -	const struct got_error *err;
> -	struct got_object_idset_element *entry, *tmp;
> -
> -	RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) {
> -		err = (*cb)(entry, arg);
> -		if (err)
> -			return err;
> -	}
> -	return NULL;
> -}
> -
> -void
> -got_object_idset_remove_element(struct got_object_idset *set,
> -    struct got_object_idset_element *entry)
> -{
> -	RB_REMOVE(got_object_idset_tree, &set->entries, entry);
> -	free(entry);
> -	set->totelem--;
> -}
> -
> -RB_GENERATE(got_object_idset_tree, got_object_idset_element, entry,
> -    cmp_elements);
> blob - fa7ea77d02fc1d6284318e12d67fc4c0742fde70
> blob + 81518d57e3cdb40c14904ec873d086182fea8b4c
> --- lib/pack_create.c
> +++ lib/pack_create.c
> @@ -1777,25 +1777,24 @@ done:
>  }
>  
>  static const struct got_error *
> -remove_unused_object(struct got_object_idset_element *entry, void *arg)
> +remove_unused_object(struct got_object_id *id, void *data, void *arg)
>  {
>  	struct got_object_idset *idset = arg;
>  
> -	if (got_object_idset_get_element_data(entry) == NULL)
> -		got_object_idset_remove_element(idset, entry);
> +	if (data == NULL)
> +		got_object_idset_remove(NULL, idset, id);
>  
>  	return NULL;
>  }
>  
>  static const struct got_error *
> -remove_reused_object(struct got_object_idset_element *entry, void *arg)
> +remove_reused_object(struct got_object_id *id, void *data, void *arg)
>  {
>  	struct got_object_idset *idset = arg;
> -	struct got_pack_meta *m;
> +	struct got_pack_meta *m = data;
>  
> -	m = got_object_idset_get_element_data(entry);
>  	if (m->have_reused_delta)
> -		got_object_idset_remove_element(idset, entry);
> +		got_object_idset_remove(NULL, idset, id);
>  
>  	return NULL;
>  }
> @@ -1840,8 +1839,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile,
>  	if (err)
>  		return err;
>  
> -	err = got_object_idset_for_each_element(idset,
> -	    remove_unused_object, idset);
> +	err = got_object_idset_for_each(idset, remove_unused_object, idset);
>  	if (err)
>  		goto done;
>  
> @@ -1877,7 +1875,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile,
>  	if (err)
>  		goto done;
>  	if (reuse.nmeta > 0) {
> -		err = got_object_idset_for_each_element(idset,
> +		err = got_object_idset_for_each(idset,
>  		    remove_reused_object, idset);
>  		if (err)
>  			goto done;