"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
Re: reimplement object ID-set on top of a hash table
To:
Omar Polo <op@omarpolo.com>
Cc:
gameoftrees@openbsd.org
Date:
Tue, 19 Apr 2022 22:09:28 +0200

Download raw body.

Thread
On Tue, Apr 19, 2022 at 02:04:18PM +0200, Omar Polo wrote:
> > +	for (i = 0; i < set->nbuckets; i++) {
> 
> got_object_idset_add enforces that we don't end up having more than
> UINT_MAX elements, so this is safe.  However, I think it'd read better
> to declare i as size_t (and maybe even `totelem' in the struct) and
> avoid these comparisons between different sizes.
> 

Indeed, fixed in patch below.

> > +	if (set->nbuckets < set->totelem) {
> > +		err = idset_grow(set);
> > +		if (err)
> > +			return err;
> 
> these two are junk from previous drafts?

Oh, indeed.
I had the call to grow() further up in an earlier version.

> > +		idx = idset_hash(set, id) % set->nbuckets;
> > +		head = &set->ids[idx];
> > +	}

> missing a newline after the {

Thanks, fixed as well.

> > +	for (i = 0; i < set->nbuckets; i++) { head = &set->ids[i];
> > +		STAILQ_FOREACH_SAFE(qid, head, entry, tmp) {
> > +			err = (*cb)(qid->id, qid->data, arg);
> > +			if (err)
> > +				goto done;
> > +		}
> >  	}


diff 765bfda82dee49bd1575eb87e64920ab3d5c3b16 b0e81243e25a163bfb4d0b06b6cc6e69ca030bcd
blob - 5346f7f2ce1559b966b7c6e2f1bde6da40d729a4
blob + 750be4962021434fb27d2dcc95867ff2e333a776
--- lib/got_lib_object_idset.h
+++ lib/got_lib_object_idset.h
@@ -31,13 +31,3 @@ const struct got_error *got_object_idset_for_each(stru
     const struct got_error *(*cb)(struct got_object_id *, void *, void *),
     void *);
 int got_object_idset_num_elements(struct got_object_idset *);
-
-struct got_object_idset_element;
-struct got_object_idset_element *got_object_idset_get_element(
-    struct got_object_idset *, struct got_object_id *);
-void *got_object_idset_get_element_data(struct got_object_idset_element *);
-const struct got_error *got_object_idset_for_each_element(struct got_object_idset *,
-    const struct got_error *(*cb)(struct got_object_idset_element *, void *), void *);
-void got_object_idset_remove_element(struct got_object_idset *,
-    struct got_object_idset_element *);
-
blob - 06f7347e22beeabe835c116ac99d59e61f87ea68
blob + e27d30ce0887da3caae5375fdd367eb524ddac3e
--- lib/object_idset.c
+++ lib/object_idset.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, 2019 Stefan Sperling <stsp@openbsd.org>
+ * Copyright (c) 2022 Stefan Sperling <stsp@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -15,15 +15,17 @@
  */
 
 #include <sys/queue.h>
-#include <sys/tree.h>
 
 #include <stdlib.h>
+#include <stdint.h>
 #include <string.h>
 #include <sha1.h>
 #include <stdio.h>
 #include <zlib.h>
 #include <limits.h>
 #include <time.h>
+#include <errno.h>
+#include <siphash.h>
 
 #include "got_object.h"
 #include "got_error.h"
@@ -32,116 +34,185 @@
 #include "got_lib_inflate.h"
 #include "got_lib_object.h"
 #include "got_lib_object_idset.h"
+#include "got_lib_object_parse.h"
 
-struct got_object_idset_element {
-	RB_ENTRY(got_object_idset_element)	entry;
-	struct got_object_id id;
-	void *data;	/* API user data */
-};
+#define GOT_OBJECT_IDSET_MIN_BUCKETS	64
 
-RB_HEAD(got_object_idset_tree, got_object_idset_element);
-
-static int
-cmp_elements(const struct got_object_idset_element *e1,
-    const struct got_object_idset_element *e2)
-{
-	return got_object_id_cmp(&e1->id, &e2->id);
-}
-
-RB_PROTOTYPE(got_object_idset_tree, got_object_idset_element, entry,
-    cmp_elements);
-
 struct got_object_idset {
-	struct got_object_idset_tree entries;
-	int totelem;
-#define GOT_OBJECT_IDSET_MAX_ELEM INT_MAX
+	struct got_object_id_queue *ids;
+	size_t nbuckets;
+	unsigned int totelem;
+	unsigned int flags;
+#define GOT_OBJECT_IDSET_F_TRAVERSAL	0x01
+#define GOT_OBJECT_IDSET_F_NOMEM	0x02
+	SIPHASH_KEY key;
 };
 
 struct got_object_idset *
 got_object_idset_alloc(void)
 {
 	struct got_object_idset *set;
+	int i;
 
 	set = malloc(sizeof(*set));
 	if (set == NULL)
 		return NULL;
 
-	RB_INIT(&set->entries);
+	set->ids = calloc(sizeof(set->ids[0]), GOT_OBJECT_IDSET_MIN_BUCKETS);
+	if (set->ids == NULL) {
+		free(set);
+		return NULL;
+	}
+	for (i = 0; i < GOT_OBJECT_IDSET_MIN_BUCKETS; i++)
+		STAILQ_INIT(&set->ids[i]);
+
 	set->totelem = 0;
-
+	set->nbuckets = GOT_OBJECT_IDSET_MIN_BUCKETS;
+	set->flags = 0;
+	arc4random_buf(&set->key, sizeof(set->key));
 	return set;
 }
 
 void
 got_object_idset_free(struct got_object_idset *set)
 {
-	struct got_object_idset_element *entry;
+	size_t i;
+	struct got_object_qid *qid;
 
-	while ((entry = RB_MIN(got_object_idset_tree, &set->entries))) {
-		RB_REMOVE(got_object_idset_tree, &set->entries, entry);
-		/* User data should be freed by caller. */
-		free(entry);
+	for (i = 0; i < set->nbuckets; i++) {
+		while (!STAILQ_EMPTY(&set->ids[i])) {
+			qid = STAILQ_FIRST(&set->ids[i]);
+			STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry);
+			got_object_qid_free(qid);
+		}
 	}
-
+	/* User data should be freed by caller. */
+	free(set->ids);
 	free(set);
 }
 
+static uint64_t
+idset_hash(struct got_object_idset *set, struct got_object_id *id)
+{
+	return SipHash24(&set->key, id->sha1, sizeof(id->sha1));
+}
+
+static const struct got_error *
+idset_resize(struct got_object_idset *set, size_t nbuckets)
+{
+	struct got_object_id_queue *ids;
+	size_t i;
+
+	ids = calloc(nbuckets, sizeof(ids[0]));
+	if (ids == NULL) {
+		if (errno != ENOMEM)
+			return got_error_from_errno("calloc");
+		/* Proceed with our current amount of hash buckets. */
+		set->flags |= GOT_OBJECT_IDSET_F_NOMEM;
+		return NULL;
+	}
+
+	for (i = 0; i < nbuckets; i++)
+		STAILQ_INIT(&ids[i]);
+
+	arc4random_buf(&set->key, sizeof(set->key));
+
+	for (i = 0; i < set->nbuckets; i++) {
+		while (!STAILQ_EMPTY(&set->ids[i])) {
+			struct got_object_qid *qid;
+			uint64_t idx;
+			qid = STAILQ_FIRST(&set->ids[i]);
+			STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry);
+			idx = idset_hash(set, qid->id) % nbuckets;
+			STAILQ_INSERT_HEAD(&ids[idx], qid, entry);
+		}
+	}
+
+	free(set->ids);
+	set->ids = ids;
+	set->nbuckets = nbuckets;
+	return NULL;
+}
+
+static const struct got_error *
+idset_grow(struct got_object_idset *set)
+{
+	size_t nbuckets;
+
+	if (set->flags & GOT_OBJECT_IDSET_F_NOMEM)
+		return NULL;
+
+	if (set->nbuckets >= UINT_MAX / 2)
+		nbuckets = UINT_MAX;
+	else
+		nbuckets = set->nbuckets * 2;
+
+	return idset_resize(set, nbuckets);
+}
+
 const struct got_error *
 got_object_idset_add(struct got_object_idset *set, struct got_object_id *id,
     void *data)
 {
-	struct got_object_idset_element *new;
+	const struct got_error *err;
+	struct got_object_qid *qid;
+	uint64_t idx;
+	struct got_object_id_queue *head;
 
-	if (set->totelem >= GOT_OBJECT_IDSET_MAX_ELEM)
+	/* This function may resize the set. */
+	if (set->flags & GOT_OBJECT_IDSET_F_TRAVERSAL)
+		return got_error_msg(GOT_ERR_NOT_IMPL,
+		    "cannot add elements to idset during traversal");
+
+	if (set->totelem == UINT_MAX)
 		return got_error(GOT_ERR_NO_SPACE);
+		
+	err = got_object_qid_alloc_partial(&qid);
+	if (err)
+		return err;
+	memcpy(qid->id, id, sizeof(*qid->id));
+	qid->data = data;
 
-	new = malloc(sizeof(*new));
-	if (new == NULL)
-		return got_error_from_errno("malloc");
-
-	memcpy(&new->id, id, sizeof(new->id));
-	new->data = data;
-
-	if (RB_INSERT(got_object_idset_tree, &set->entries, new) != NULL) {
-		free(new);
-		return got_error(GOT_ERR_OBJ_EXISTS);
-	}
-
+	idx = idset_hash(set, id) % set->nbuckets;
+	head = &set->ids[idx];
+	STAILQ_INSERT_HEAD(head, qid, entry);
 	set->totelem++;
-	return NULL;
+
+	if (set->nbuckets < set->totelem)
+		err = idset_grow(set);
+
+	return err;
 }
 
-static struct got_object_idset_element *
+static struct got_object_qid *
 find_element(struct got_object_idset *set, struct got_object_id *id)
 {
-	struct got_object_idset_element *entry;
+	uint64_t idx = idset_hash(set, id) % set->nbuckets;
+	struct got_object_id_queue *head = &set->ids[idx];
+	struct got_object_qid *qid;
 
-	entry = RB_ROOT(&set->entries);
-	while (entry) {
-		int cmp = got_object_id_cmp(id, &entry->id);
-		if (cmp < 0)
-			entry = RB_LEFT(entry, entry);
-		else if (cmp > 0)
-			entry = RB_RIGHT(entry, entry);
-		else
-			break;
+	STAILQ_FOREACH(qid, head, entry) {
+		if (got_object_id_cmp(qid->id, id) == 0)
+			return qid;
 	}
 
-	return entry;
+	return NULL;
 }
 
 void *
 got_object_idset_get(struct got_object_idset *set, struct got_object_id *id)
 {
-	struct got_object_idset_element *entry = find_element(set, id);
-	return entry ? entry->data : NULL;
+	struct got_object_qid *qid = find_element(set, id);
+	return qid ? qid->data : NULL;
 }
 
 const struct got_error *
 got_object_idset_remove(void **data, struct got_object_idset *set,
     struct got_object_id *id)
 {
-	struct got_object_idset_element *entry;
+	uint64_t idx;
+	struct got_object_id_queue *head;
+	struct got_object_qid *qid;
 
 	if (data)
 		*data = NULL;
@@ -150,20 +221,30 @@ got_object_idset_remove(void **data, struct got_object
 		return got_error(GOT_ERR_NO_OBJ);
 
 	if (id == NULL) {
-		entry = RB_ROOT(&set->entries);
-		if (entry == NULL)
-			return got_error(GOT_ERR_NO_OBJ);
+		/* Remove a "random" element. */
+		for (idx = 0; idx < set->nbuckets; idx++) {
+			head = &set->ids[idx];
+			qid = STAILQ_FIRST(head);
+			if (qid)
+				break;
+		}
 	} else {
-		entry = find_element(set, id);
-		if (entry == NULL)
+		idx = idset_hash(set, id) % set->nbuckets;
+		head = &set->ids[idx];
+		STAILQ_FOREACH(qid, head, entry) {
+			if (got_object_id_cmp(qid->id, id) == 0)
+				break;
+		}
+		if (qid == NULL)
 			return got_error_no_obj(id);
 	}
 
-	RB_REMOVE(got_object_idset_tree, &set->entries, entry);
 	if (data)
-		*data = entry->data;
-	free(entry);
+		*data = qid->data;
+	STAILQ_REMOVE(head, qid, got_object_qid, entry);
+	got_object_qid_free(qid);
 	set->totelem--;
+
 	return NULL;
 }
 
@@ -171,8 +252,8 @@ int
 got_object_idset_contains(struct got_object_idset *set,
     struct got_object_id *id)
 {
-	struct got_object_idset_element *entry = find_element(set, id);
-	return entry ? 1 : 0;
+	struct got_object_qid *qid = find_element(set, id);
+	return qid ? 1 : 0;
 }
 
 const struct got_error *
@@ -180,15 +261,23 @@ got_object_idset_for_each(struct got_object_idset *set
     const struct got_error *(*cb)(struct got_object_id *, void *, void *),
     void *arg)
 {
-	const struct got_error *err;
-	struct got_object_idset_element *entry, *tmp;
+	const struct got_error *err = NULL;
+	struct got_object_id_queue *head;
+	struct got_object_qid *qid, *tmp;
+	size_t i;
 
-	RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) {
-		err = (*cb)(&entry->id, entry->data, arg);
-		if (err)
-			return err;
+	set->flags |= GOT_OBJECT_IDSET_F_TRAVERSAL;
+	for (i = 0; i < set->nbuckets; i++) {
+		head = &set->ids[i];
+		STAILQ_FOREACH_SAFE(qid, head, entry, tmp) {
+			err = (*cb)(qid->id, qid->data, arg);
+			if (err)
+				goto done;
+		}
 	}
-	return NULL;
+done:
+	set->flags &= ~GOT_OBJECT_IDSET_F_TRAVERSAL;
+	return err;
 }
 
 int
@@ -196,43 +285,3 @@ got_object_idset_num_elements(struct got_object_idset 
 {
 	return set->totelem;
 }
-
-struct got_object_idset_element *
-got_object_idset_get_element(struct got_object_idset *set, struct got_object_id *id)
-{
-	return find_element(set, id);
-}
-
-void *
-got_object_idset_get_element_data(struct got_object_idset_element *entry)
-{
-	return entry->data;
-}
-
-const struct got_error *
-got_object_idset_for_each_element(struct got_object_idset *set,
-    const struct got_error *(*cb)(struct got_object_idset_element *, void *),
-    void *arg)
-{
-	const struct got_error *err;
-	struct got_object_idset_element *entry, *tmp;
-
-	RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) {
-		err = (*cb)(entry, arg);
-		if (err)
-			return err;
-	}
-	return NULL;
-}
-
-void
-got_object_idset_remove_element(struct got_object_idset *set,
-    struct got_object_idset_element *entry)
-{
-	RB_REMOVE(got_object_idset_tree, &set->entries, entry);
-	free(entry);
-	set->totelem--;
-}
-
-RB_GENERATE(got_object_idset_tree, got_object_idset_element, entry,
-    cmp_elements);
blob - fa7ea77d02fc1d6284318e12d67fc4c0742fde70
blob + 81518d57e3cdb40c14904ec873d086182fea8b4c
--- lib/pack_create.c
+++ lib/pack_create.c
@@ -1777,25 +1777,24 @@ done:
 }
 
 static const struct got_error *
-remove_unused_object(struct got_object_idset_element *entry, void *arg)
+remove_unused_object(struct got_object_id *id, void *data, void *arg)
 {
 	struct got_object_idset *idset = arg;
 
-	if (got_object_idset_get_element_data(entry) == NULL)
-		got_object_idset_remove_element(idset, entry);
+	if (data == NULL)
+		got_object_idset_remove(NULL, idset, id);
 
 	return NULL;
 }
 
 static const struct got_error *
-remove_reused_object(struct got_object_idset_element *entry, void *arg)
+remove_reused_object(struct got_object_id *id, void *data, void *arg)
 {
 	struct got_object_idset *idset = arg;
-	struct got_pack_meta *m;
+	struct got_pack_meta *m = data;
 
-	m = got_object_idset_get_element_data(entry);
 	if (m->have_reused_delta)
-		got_object_idset_remove_element(idset, entry);
+		got_object_idset_remove(NULL, idset, id);
 
 	return NULL;
 }
@@ -1840,8 +1839,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile,
 	if (err)
 		return err;
 
-	err = got_object_idset_for_each_element(idset,
-	    remove_unused_object, idset);
+	err = got_object_idset_for_each(idset, remove_unused_object, idset);
 	if (err)
 		goto done;
 
@@ -1877,7 +1875,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile,
 	if (err)
 		goto done;
 	if (reuse.nmeta > 0) {
-		err = got_object_idset_for_each_element(idset,
+		err = got_object_idset_for_each(idset,
 		    remove_reused_object, idset);
 		if (err)
 			goto done;