"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
reimplement object ID-set on top of a hash table
To:
gameoftrees@openbsd.org
Date:
Fri, 15 Apr 2022 16:32:33 +0200

Download raw body.

Thread
Implement the object ID-set on top of a chaining hash table instead
of an RB tree.

This provides a slight speed advantage when packing a lot of objects:

test case: 216541 commits colored; 2048137 objects found; 994231 trees scanned
RB tree: 3m59.31s real     3m44.61s user     0m42.55s system
hash table: 3m11.87s real     2m58.43s user     0m43.60s system

jrick suggested siphash as the hash distribution function.
Initially I tried using 4 byte positions of the object ID array, but discovered
that murmurhash has a much better random distribution which results in shorter
chains. siphash seems to have similar performance to murmurhash and doesn't
suffer from collision attacks:
https://en.wikipedia.org/wiki/MurmurHash#Vulnerabilities

The idset_add() operation no longer checks whether the element is already
present in the set. All callers are already performing this check before
adding elements, and it seems bad to do it twice. Lookups are now O(1) if
the element is missing, so letting callers check for existence before
insertion should be fine.

The hash table can grow until it fails to allocate more hash buckets.
In theory we could allow the table to shrink, but so far I could not
get this working without a significant slowdown. When the number of
buckets changes all elements have to be copied over to a new table,
which is very expensive.

-----------------------------------------------
commit 2670093859bd13a721a818c4d8b2f0140a8dfc93 (idset-hash)
from: Stefan Sperling <stsp@stsp.name>
date: Fri Apr 15 14:21:05 2022 UTC
 
 reimplement object-ID set data structure on top of a hash table
 
 Siphash suggested by jrick as a better alternative to murmurhash
 for this use case.
 
diff 1b01a31979d2daafab1f160fcc263c8b7062a7f3 ba40aee1bfa885072a040a81da73cb49b5d40e08
blob - 5346f7f2ce1559b966b7c6e2f1bde6da40d729a4
blob + 750be4962021434fb27d2dcc95867ff2e333a776
--- lib/got_lib_object_idset.h
+++ lib/got_lib_object_idset.h
@@ -31,13 +31,3 @@ const struct got_error *got_object_idset_for_each(stru
     const struct got_error *(*cb)(struct got_object_id *, void *, void *),
     void *);
 int got_object_idset_num_elements(struct got_object_idset *);
-
-struct got_object_idset_element;
-struct got_object_idset_element *got_object_idset_get_element(
-    struct got_object_idset *, struct got_object_id *);
-void *got_object_idset_get_element_data(struct got_object_idset_element *);
-const struct got_error *got_object_idset_for_each_element(struct got_object_idset *,
-    const struct got_error *(*cb)(struct got_object_idset_element *, void *), void *);
-void got_object_idset_remove_element(struct got_object_idset *,
-    struct got_object_idset_element *);
-
blob - 06f7347e22beeabe835c116ac99d59e61f87ea68
blob + d188d887deea72878f96122fb9c3544c4b9da7de
--- lib/object_idset.c
+++ lib/object_idset.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, 2019 Stefan Sperling <stsp@openbsd.org>
+ * Copyright (c) 2022 Stefan Sperling <stsp@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -15,15 +15,17 @@
  */
 
 #include <sys/queue.h>
-#include <sys/tree.h>
 
 #include <stdlib.h>
+#include <stdint.h>
 #include <string.h>
 #include <sha1.h>
 #include <stdio.h>
 #include <zlib.h>
 #include <limits.h>
 #include <time.h>
+#include <errno.h>
+#include <siphash.h>
 
 #include "got_object.h"
 #include "got_error.h"
@@ -32,116 +34,190 @@
 #include "got_lib_inflate.h"
 #include "got_lib_object.h"
 #include "got_lib_object_idset.h"
+#include "got_lib_object_parse.h"
 
-struct got_object_idset_element {
-	RB_ENTRY(got_object_idset_element)	entry;
-	struct got_object_id id;
-	void *data;	/* API user data */
-};
+#define GOT_OBJECT_IDSET_MIN_BUCKETS	64
 
-RB_HEAD(got_object_idset_tree, got_object_idset_element);
-
-static int
-cmp_elements(const struct got_object_idset_element *e1,
-    const struct got_object_idset_element *e2)
-{
-	return got_object_id_cmp(&e1->id, &e2->id);
-}
-
-RB_PROTOTYPE(got_object_idset_tree, got_object_idset_element, entry,
-    cmp_elements);
-
 struct got_object_idset {
-	struct got_object_idset_tree entries;
-	int totelem;
-#define GOT_OBJECT_IDSET_MAX_ELEM INT_MAX
+	struct got_object_id_queue *ids;
+	size_t nbuckets;
+	unsigned int totelem;
+	unsigned int flags;
+#define GOT_OBJECT_IDSET_F_TRAVERSAL	0x01
+#define GOT_OBJECT_IDSET_F_NOMEM	0x02
+	SIPHASH_KEY key;
 };
 
 struct got_object_idset *
 got_object_idset_alloc(void)
 {
 	struct got_object_idset *set;
+	int i;
 
 	set = malloc(sizeof(*set));
 	if (set == NULL)
 		return NULL;
 
-	RB_INIT(&set->entries);
+	set->ids = calloc(sizeof(set->ids[0]), GOT_OBJECT_IDSET_MIN_BUCKETS);
+	if (set->ids == NULL) {
+		free(set);
+		return NULL;
+	}
+	for (i = 0; i < GOT_OBJECT_IDSET_MIN_BUCKETS; i++)
+		STAILQ_INIT(&set->ids[i]);
+
 	set->totelem = 0;
-
+	set->nbuckets = GOT_OBJECT_IDSET_MIN_BUCKETS;
+	set->flags = 0;
+	arc4random_buf(&set->key, sizeof(set->key));
 	return set;
 }
 
 void
 got_object_idset_free(struct got_object_idset *set)
 {
-	struct got_object_idset_element *entry;
+	size_t i;
+	struct got_object_qid *qid;
 
-	while ((entry = RB_MIN(got_object_idset_tree, &set->entries))) {
-		RB_REMOVE(got_object_idset_tree, &set->entries, entry);
-		/* User data should be freed by caller. */
-		free(entry);
+	for (i = 0; i < set->nbuckets; i++) {
+		while (!STAILQ_EMPTY(&set->ids[i])) {
+			qid = STAILQ_FIRST(&set->ids[i]);
+			STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry);
+			got_object_qid_free(qid);
+		}
 	}
-
+	/* User data should be freed by caller. */
+	free(set->ids);
 	free(set);
 }
 
+static uint64_t
+idset_hash(struct got_object_idset *set, struct got_object_id *id)
+{
+	return SipHash24(&set->key, id->sha1, sizeof(id->sha1));
+}
+
+static const struct got_error *
+idset_resize(struct got_object_idset *set, size_t nbuckets)
+{
+	struct got_object_id_queue *ids;
+	unsigned int i;
+
+	ids = calloc(nbuckets, sizeof(ids[0]));
+	if (ids == NULL) {
+		if (errno != ENOMEM)
+			return got_error_from_errno("calloc");
+		/* Proceed with our current amount of hash buckets. */
+		set->flags |= GOT_OBJECT_IDSET_F_NOMEM;
+		return NULL;
+	}
+
+	for (i = 0; i < nbuckets; i++)
+		STAILQ_INIT(&ids[i]);
+
+	arc4random_buf(&set->key, sizeof(set->key));
+
+	for (i = 0; i < set->nbuckets; i++) {
+		while (!STAILQ_EMPTY(&set->ids[i])) {
+			struct got_object_qid *qid;
+			uint64_t idx;
+			qid = STAILQ_FIRST(&set->ids[i]);
+			STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry);
+			idx = idset_hash(set, qid->id) % nbuckets;
+			STAILQ_INSERT_HEAD(&ids[idx], qid, entry);
+		}
+	}
+
+	free(set->ids);
+	set->ids = ids;
+	set->nbuckets = nbuckets;
+	return NULL;
+}
+
+static const struct got_error *
+idset_grow(struct got_object_idset *set)
+{
+	size_t nbuckets;
+
+	if (set->flags & GOT_OBJECT_IDSET_F_NOMEM)
+		return NULL;
+
+	if (set->nbuckets >= UINT_MAX / 2)
+		nbuckets = UINT_MAX;
+	else
+		nbuckets = set->nbuckets * 2;
+
+	return idset_resize(set, nbuckets);
+}
+
 const struct got_error *
 got_object_idset_add(struct got_object_idset *set, struct got_object_id *id,
     void *data)
 {
-	struct got_object_idset_element *new;
+	const struct got_error *err;
+	struct got_object_qid *qid;
+	uint64_t idx;
+	struct got_object_id_queue *head;
 
-	if (set->totelem >= GOT_OBJECT_IDSET_MAX_ELEM)
+	/* This function may resize the set. */
+	if (set->flags & GOT_OBJECT_IDSET_F_TRAVERSAL)
+		return got_error_msg(GOT_ERR_NOT_IMPL,
+		    "cannot add elements to idset during traversal");
+
+	if (set->totelem == UINT_MAX)
 		return got_error(GOT_ERR_NO_SPACE);
+		
+	err = got_object_qid_alloc_partial(&qid);
+	if (err)
+		return err;
+	memcpy(qid->id, id, sizeof(*qid->id));
+	qid->data = data;
 
-	new = malloc(sizeof(*new));
-	if (new == NULL)
-		return got_error_from_errno("malloc");
-
-	memcpy(&new->id, id, sizeof(new->id));
-	new->data = data;
-
-	if (RB_INSERT(got_object_idset_tree, &set->entries, new) != NULL) {
-		free(new);
-		return got_error(GOT_ERR_OBJ_EXISTS);
-	}
-
+	idx = idset_hash(set, id) % set->nbuckets;
+	head = &set->ids[idx];
+	STAILQ_INSERT_HEAD(head, qid, entry);
 	set->totelem++;
+
+	if (set->nbuckets < set->totelem) {
+		err = idset_grow(set);
+		if (err)
+			return err;
+		idx = idset_hash(set, id) % set->nbuckets;
+		head = &set->ids[idx];
+	}
+
 	return NULL;
 }
 
-static struct got_object_idset_element *
+static struct got_object_qid *
 find_element(struct got_object_idset *set, struct got_object_id *id)
 {
-	struct got_object_idset_element *entry;
+	uint64_t idx = idset_hash(set, id) % set->nbuckets;
+	struct got_object_id_queue *head = &set->ids[idx];
+	struct got_object_qid *qid;
 
-	entry = RB_ROOT(&set->entries);
-	while (entry) {
-		int cmp = got_object_id_cmp(id, &entry->id);
-		if (cmp < 0)
-			entry = RB_LEFT(entry, entry);
-		else if (cmp > 0)
-			entry = RB_RIGHT(entry, entry);
-		else
-			break;
+	STAILQ_FOREACH(qid, head, entry) {
+		if (got_object_id_cmp(qid->id, id) == 0)
+			return qid;
 	}
 
-	return entry;
+	return NULL;
 }
 
 void *
 got_object_idset_get(struct got_object_idset *set, struct got_object_id *id)
 {
-	struct got_object_idset_element *entry = find_element(set, id);
-	return entry ? entry->data : NULL;
+	struct got_object_qid *qid = find_element(set, id);
+	return qid ? qid->data : NULL;
 }
 
 const struct got_error *
 got_object_idset_remove(void **data, struct got_object_idset *set,
     struct got_object_id *id)
 {
-	struct got_object_idset_element *entry;
+	uint64_t idx;
+	struct got_object_id_queue *head;
+	struct got_object_qid *qid;
 
 	if (data)
 		*data = NULL;
@@ -150,20 +226,30 @@ got_object_idset_remove(void **data, struct got_object
 		return got_error(GOT_ERR_NO_OBJ);
 
 	if (id == NULL) {
-		entry = RB_ROOT(&set->entries);
-		if (entry == NULL)
-			return got_error(GOT_ERR_NO_OBJ);
+		/* Remove a "random" element. */
+		for (idx = 0; idx < set->nbuckets; idx++) {
+			head = &set->ids[idx];
+			qid = STAILQ_FIRST(head);
+			if (qid)
+				break;
+		}
 	} else {
-		entry = find_element(set, id);
-		if (entry == NULL)
+		idx = idset_hash(set, id) % set->nbuckets;
+		head = &set->ids[idx];
+		STAILQ_FOREACH(qid, head, entry) {
+			if (got_object_id_cmp(qid->id, id) == 0)
+				break;
+		}
+		if (qid == NULL)
 			return got_error_no_obj(id);
 	}
 
-	RB_REMOVE(got_object_idset_tree, &set->entries, entry);
 	if (data)
-		*data = entry->data;
-	free(entry);
+		*data = qid->data;
+	STAILQ_REMOVE(head, qid, got_object_qid, entry);
+	got_object_qid_free(qid);
 	set->totelem--;
+
 	return NULL;
 }
 
@@ -171,8 +257,8 @@ int
 got_object_idset_contains(struct got_object_idset *set,
     struct got_object_id *id)
 {
-	struct got_object_idset_element *entry = find_element(set, id);
-	return entry ? 1 : 0;
+	struct got_object_qid *qid = find_element(set, id);
+	return qid ? 1 : 0;
 }
 
 const struct got_error *
@@ -180,15 +266,22 @@ got_object_idset_for_each(struct got_object_idset *set
     const struct got_error *(*cb)(struct got_object_id *, void *, void *),
     void *arg)
 {
-	const struct got_error *err;
-	struct got_object_idset_element *entry, *tmp;
+	const struct got_error *err = NULL;
+	struct got_object_id_queue *head;
+	struct got_object_qid *qid, *tmp;
+	size_t i;
 
-	RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) {
-		err = (*cb)(&entry->id, entry->data, arg);
-		if (err)
-			return err;
+	set->flags |= GOT_OBJECT_IDSET_F_TRAVERSAL;
+	for (i = 0; i < set->nbuckets; i++) { head = &set->ids[i];
+		STAILQ_FOREACH_SAFE(qid, head, entry, tmp) {
+			err = (*cb)(qid->id, qid->data, arg);
+			if (err)
+				goto done;
+		}
 	}
-	return NULL;
+done:
+	set->flags &= ~GOT_OBJECT_IDSET_F_TRAVERSAL;
+	return err;
 }
 
 int
@@ -196,43 +289,3 @@ got_object_idset_num_elements(struct got_object_idset 
 {
 	return set->totelem;
 }
-
-struct got_object_idset_element *
-got_object_idset_get_element(struct got_object_idset *set, struct got_object_id *id)
-{
-	return find_element(set, id);
-}
-
-void *
-got_object_idset_get_element_data(struct got_object_idset_element *entry)
-{
-	return entry->data;
-}
-
-const struct got_error *
-got_object_idset_for_each_element(struct got_object_idset *set,
-    const struct got_error *(*cb)(struct got_object_idset_element *, void *),
-    void *arg)
-{
-	const struct got_error *err;
-	struct got_object_idset_element *entry, *tmp;
-
-	RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) {
-		err = (*cb)(entry, arg);
-		if (err)
-			return err;
-	}
-	return NULL;
-}
-
-void
-got_object_idset_remove_element(struct got_object_idset *set,
-    struct got_object_idset_element *entry)
-{
-	RB_REMOVE(got_object_idset_tree, &set->entries, entry);
-	free(entry);
-	set->totelem--;
-}
-
-RB_GENERATE(got_object_idset_tree, got_object_idset_element, entry,
-    cmp_elements);
blob - fa7ea77d02fc1d6284318e12d67fc4c0742fde70
blob + 81518d57e3cdb40c14904ec873d086182fea8b4c
--- lib/pack_create.c
+++ lib/pack_create.c
@@ -1777,25 +1777,24 @@ done:
 }
 
 static const struct got_error *
-remove_unused_object(struct got_object_idset_element *entry, void *arg)
+remove_unused_object(struct got_object_id *id, void *data, void *arg)
 {
 	struct got_object_idset *idset = arg;
 
-	if (got_object_idset_get_element_data(entry) == NULL)
-		got_object_idset_remove_element(idset, entry);
+	if (data == NULL)
+		got_object_idset_remove(NULL, idset, id);
 
 	return NULL;
 }
 
 static const struct got_error *
-remove_reused_object(struct got_object_idset_element *entry, void *arg)
+remove_reused_object(struct got_object_id *id, void *data, void *arg)
 {
 	struct got_object_idset *idset = arg;
-	struct got_pack_meta *m;
+	struct got_pack_meta *m = data;
 
-	m = got_object_idset_get_element_data(entry);
 	if (m->have_reused_delta)
-		got_object_idset_remove_element(idset, entry);
+		got_object_idset_remove(NULL, idset, id);
 
 	return NULL;
 }
@@ -1840,8 +1839,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile,
 	if (err)
 		return err;
 
-	err = got_object_idset_for_each_element(idset,
-	    remove_unused_object, idset);
+	err = got_object_idset_for_each(idset, remove_unused_object, idset);
 	if (err)
 		goto done;
 
@@ -1877,7 +1875,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile,
 	if (err)
 		goto done;
 	if (reuse.nmeta > 0) {
-		err = got_object_idset_for_each_element(idset,
+		err = got_object_idset_for_each(idset,
 		    remove_reused_object, idset);
 		if (err)
 			goto done;