"GOT", but the "O" is a cute, smiling sun Index | Thread

From:
Stefan Sperling <stsp@stsp.name>
Subject:
rework object-id-set to use arrays instead of lists
To:
gameoftrees@openbsd.org
Date:
Tue, 18 Oct 2022 02:42:17 +0200

Download raw body.

This patch reworks the object-ID set to use an array of arrays as
backing store, instead of using an array of lists.

This makes memory allocation on insertion unnecessary, expect of
course when an array needs to grow, or when the entire hash table
needs to grow.

On removal, we can remove any element in O(1) time by swapping in the
element at the end of the array (thanks to jrick for this trick).
Whereas removing elements from an SLIST is O(n).

I haven't yet seen any performance difference while testing with
gotadmin pack -a on got.git. Which might not be a useful test case.
The ID set is a hash table, so differences between lists and arrays
should only matter when there are many hash collisions. My tests so
far did not trigger relevant edge cases.

One downside is that traversal callbacks need to be more careful
about removing elements, but this seems fine for existing use cases.

Any preferences to keep the old implementation or switch to this
new one?

diff aabb25f81b1f8f68a03af422f9ae14ea5c3ae1fd refs/heads/id-array
commit - aabb25f81b1f8f68a03af422f9ae14ea5c3ae1fd
commit + aeb386809e780d5e142f1730a300a1e30fd48797
blob - 652bddb959b1bd1a528daf84da62d9a2f52d5d51
blob + 95601a4fb0c01d0884348f5fd5f30d49ec755a21
--- got/Makefile
+++ got/Makefile
@@ -15,7 +15,8 @@ SRCS=		got.c blame.c commit_graph.c delta.c diff.c \
 		diff_patience.c send.c deltify.c pack_create.c dial.c \
 		bloom.c murmurhash2.c ratelimit.c patch.c sigs.c date.c \
 		object_open_privsep.c read_gitconfig_privsep.c \
-		read_gotconfig_privsep.c pack_create_privsep.c
+		read_gotconfig_privsep.c pack_create_privsep.c \
+		object_id_array.c
 
 MAN =		${PROG}.1 got-worktree.5 git-repository.5 got.conf.5
 
blob - 00e5e77e780621968aa2a18606882840fb9f4d2c
blob + c4902dde541b439d591bf793087dd00ed64e7f95
--- gotadmin/Makefile
+++ gotadmin/Makefile
@@ -11,7 +11,7 @@ SRCS=		gotadmin.c \
 		worktree_open.c sha1.c bloom.c murmurhash2.c ratelimit.c \
 		sigs.c buf.c date.c object_open_privsep.c \
 		read_gitconfig_privsep.c read_gotconfig_privsep.c \
-		pack_create_privsep.c
+		pack_create_privsep.c object_id_array.c
 MAN =		${PROG}.1
 
 CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib
blob - 2ed7fd6f24b77b5c61137092e927591fc46d903d
blob + 02f0b69812b72cf9a65f0324a44571c8a3862183
--- lib/got_lib_object.h
+++ lib/got_lib_object.h
@@ -31,6 +31,25 @@ struct got_raw_object {
 	int refcnt;		/* > 0 if open and/or cached */
 };
 
+struct got_object_aid {
+	struct got_object_id id;
+	void *data;
+};
+
+struct got_object_id_array {
+	struct got_object_aid		*ids;
+	size_t				 alloc_chunk_size;
+	size_t				 nalloc;
+	size_t				 nids;
+};
+
+void got_object_id_array_init(struct got_object_id_array *, size_t);
+const struct got_error *got_object_id_array_add(struct got_object_id_array *,
+    struct got_object_id *, void *);
+const struct got_error *got_object_id_array_remove(void **,
+    struct got_object_id_array *, int);
+void got_object_id_array_free(struct got_object_id_array *);
+
 struct got_raw_object {
 	FILE *f;		/* NULL if data buffer is being used */
 	int fd;			/* -1 unless data buffer is memory-mapped */
blob - f58d1e7dc1d81be2989c4b8114725ca657c9ea3a
blob + 40919b10148e8e56ca3af88f99e7a82aeb46ac57
--- lib/object_idset.c
+++ lib/object_idset.c
@@ -37,9 +37,10 @@
 #include "got_lib_object_parse.h"
 
 #define GOT_OBJECT_IDSET_MIN_BUCKETS	64
+#define GOT_OBJECT_IDSET_ALLOC_CHUNK_SIZE	16
 
 struct got_object_idset {
-	struct got_object_id_queue *ids;
+	struct got_object_id_array *ids;
 	size_t nbuckets;
 	unsigned int totelem;
 	unsigned int flags;
@@ -63,9 +64,12 @@ got_object_idset_alloc(void)
 		free(set);
 		return NULL;
 	}
-	for (i = 0; i < GOT_OBJECT_IDSET_MIN_BUCKETS; i++)
-		STAILQ_INIT(&set->ids[i]);
 
+	for (i = 0; i < GOT_OBJECT_IDSET_MIN_BUCKETS; i++) {
+		got_object_id_array_init(&set->ids[i],
+		    GOT_OBJECT_IDSET_ALLOC_CHUNK_SIZE);
+	}
+
 	set->totelem = 0;
 	set->nbuckets = GOT_OBJECT_IDSET_MIN_BUCKETS;
 	set->flags = 0;
@@ -77,34 +81,31 @@ got_object_idset_free(struct got_object_idset *set)
 got_object_idset_free(struct got_object_idset *set)
 {
 	size_t i;
-	struct got_object_qid *qid;
 
-	for (i = 0; i < set->nbuckets; i++) {
-		while (!STAILQ_EMPTY(&set->ids[i])) {
-			qid = STAILQ_FIRST(&set->ids[i]);
-			STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry);
-			got_object_qid_free(qid);
-		}
-	}
+	for (i = 0; i < set->nbuckets; i++)
+		got_object_id_array_free(&set->ids[i]);
+
 	/* User data should be freed by caller. */
 	free(set->ids);
 	free(set);
 }
 
 static uint64_t
-idset_hash(struct got_object_idset *set, struct got_object_id *id)
+idset_hash(SIPHASH_KEY *key, struct got_object_id *id)
 {
-	return SipHash24(&set->key, id->sha1, sizeof(id->sha1));
+	return SipHash24(key, id->sha1, sizeof(id->sha1));
 }
 
 static const struct got_error *
 idset_resize(struct got_object_idset *set, size_t nbuckets)
 {
-	struct got_object_id_queue *ids;
-	size_t i;
+	const struct got_error *err = NULL;
+	struct got_object_id_array *new_ids;
+	size_t i, j;
+	SIPHASH_KEY new_key;
 
-	ids = calloc(nbuckets, sizeof(ids[0]));
-	if (ids == NULL) {
+	new_ids = calloc(nbuckets, sizeof(new_ids[0]));
+	if (new_ids == NULL) {
 		if (errno != ENOMEM)
 			return got_error_from_errno("calloc");
 		/* Proceed with our current amount of hash buckets. */
@@ -112,26 +113,41 @@ idset_resize(struct got_object_idset *set, size_t nbuc
 		return NULL;
 	}
 
-	for (i = 0; i < nbuckets; i++)
-		STAILQ_INIT(&ids[i]);
+	for (i = 0; i < nbuckets; i++) {
+		got_object_id_array_init(&new_ids[i],
+		    GOT_OBJECT_IDSET_ALLOC_CHUNK_SIZE);
+	}
 
-	arc4random_buf(&set->key, sizeof(set->key));
+	arc4random_buf(&new_key, sizeof(new_key));
 
 	for (i = 0; i < set->nbuckets; i++) {
-		while (!STAILQ_EMPTY(&set->ids[i])) {
-			struct got_object_qid *qid;
+		struct got_object_id_array *array = &set->ids[i];
+		for (j = 0; j < array->nids; j++) {
+			struct got_object_aid *aid = &array->ids[j];
 			uint64_t idx;
-			qid = STAILQ_FIRST(&set->ids[i]);
-			STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry);
-			idx = idset_hash(set, &qid->id) % nbuckets;
-			STAILQ_INSERT_HEAD(&ids[idx], qid, entry);
+			idx = idset_hash(&new_key, &aid->id) % nbuckets;
+			err = got_object_id_array_add(&new_ids[idx],
+			    &aid->id, aid->data);
+			if (err)
+				goto done;
 		}
 	}
+done:
+	if (err) {
+		if (err->code == GOT_ERR_ERRNO && errno == ENOMEM)
+			set->flags |= GOT_OBJECT_IDSET_F_NOMEM;
+		for (i = 0; i < nbuckets; i++)
+			got_object_id_array_free(&new_ids[i]);
+		free(new_ids);
+	} else {
+		free(set->ids);
+		set->ids = new_ids;
+		set->nbuckets = nbuckets;
+		memcpy(&set->key, &new_key, sizeof(set->key));
+	}
 
-	free(set->ids);
-	set->ids = ids;
-	set->nbuckets = nbuckets;
-	return NULL;
+	explicit_bzero(&new_key, sizeof(new_key));
+	return err;
 }
 
 static const struct got_error *
@@ -155,9 +171,8 @@ got_object_idset_add(struct got_object_idset *set, str
     void *data)
 {
 	const struct got_error *err;
-	struct got_object_qid *qid;
 	uint64_t idx;
-	struct got_object_id_queue *head;
+	struct got_object_id_array *head;
 
 	/* This function may resize the set. */
 	if (set->flags & GOT_OBJECT_IDSET_F_TRAVERSAL)
@@ -167,15 +182,11 @@ got_object_idset_add(struct got_object_idset *set, str
 	if (set->totelem == UINT_MAX)
 		return got_error(GOT_ERR_NO_SPACE);
 
-	err = got_object_qid_alloc_partial(&qid);
-	if (err)
-		return err;
-	memcpy(&qid->id, id, sizeof(qid->id));
-	qid->data = data;
-
-	idx = idset_hash(set, id) % set->nbuckets;
+	idx = idset_hash(&set->key, id) % set->nbuckets;
 	head = &set->ids[idx];
-	STAILQ_INSERT_HEAD(head, qid, entry);
+	err = got_object_id_array_add(head, id, data);
+	if (err)
+		return err;
 	set->totelem++;
 
 	if (set->nbuckets < set->totelem)
@@ -184,16 +195,17 @@ static struct got_object_qid *
 	return err;
 }
 
-static struct got_object_qid *
+static struct got_object_aid *
 find_element(struct got_object_idset *set, struct got_object_id *id)
 {
-	uint64_t idx = idset_hash(set, id) % set->nbuckets;
-	struct got_object_id_queue *head = &set->ids[idx];
-	struct got_object_qid *qid;
+	uint64_t idx = idset_hash(&set->key, id) % set->nbuckets;
+	struct got_object_id_array *head = &set->ids[idx];
+	size_t i;
 
-	STAILQ_FOREACH(qid, head, entry) {
-		if (got_object_id_cmp(&qid->id, id) == 0)
-			return qid;
+	for (i = 0; i < head->nids; i++) {
+		struct got_object_aid *aid = &head->ids[i];
+		if (got_object_id_cmp(&aid->id, id) == 0)
+			return aid;
 	}
 
 	return NULL;
@@ -202,17 +214,19 @@ got_object_idset_get(struct got_object_idset *set, str
 void *
 got_object_idset_get(struct got_object_idset *set, struct got_object_id *id)
 {
-	struct got_object_qid *qid = find_element(set, id);
-	return qid ? qid->data : NULL;
+	struct got_object_aid *aid = find_element(set, id);
+	return aid ? aid->data : NULL;
 }
 
 const struct got_error *
 got_object_idset_remove(void **data, struct got_object_idset *set,
     struct got_object_id *id)
 {
+	const struct got_error *err = NULL;
 	uint64_t idx;
-	struct got_object_id_queue *head;
-	struct got_object_qid *qid;
+	struct got_object_id_array *head;
+	struct got_object_aid *aid = NULL;
+	size_t i;
 
 	if (data)
 		*data = NULL;
@@ -224,27 +238,31 @@ got_object_idset_remove(void **data, struct got_object
 		/* Remove a "random" element. */
 		for (idx = 0; idx < set->nbuckets; idx++) {
 			head = &set->ids[idx];
-			qid = STAILQ_FIRST(head);
-			if (qid)
+			if (head->nids > 0) {
+				i = head->nids - 1;
+				aid = &head->ids[i];
 				break;
+			}
 		}
+		if (aid == NULL)
+			return got_error(GOT_ERR_NO_OBJ);
 	} else {
-		idx = idset_hash(set, id) % set->nbuckets;
+		idx = idset_hash(&set->key, id) % set->nbuckets;
 		head = &set->ids[idx];
-		STAILQ_FOREACH(qid, head, entry) {
-			if (got_object_id_cmp(&qid->id, id) == 0)
+		for (i = 0; i < head->nids; i++) {
+			aid = &head->ids[i];
+			if (got_object_id_cmp(&aid->id, id) == 0)
 				break;
 		}
-		if (qid == NULL)
+		if (aid == NULL)
 			return got_error_no_obj(id);
 	}
 
-	if (data)
-		*data = qid->data;
-	STAILQ_REMOVE(head, qid, got_object_qid, entry);
-	got_object_qid_free(qid);
+	err = got_object_id_array_remove(data, head, i);
+	if (err)
+		return err;
+
 	set->totelem--;
-
 	return NULL;
 }
 
@@ -252,8 +270,8 @@ got_object_idset_contains(struct got_object_idset *set
 got_object_idset_contains(struct got_object_idset *set,
     struct got_object_id *id)
 {
-	struct got_object_qid *qid = find_element(set, id);
-	return qid ? 1 : 0;
+	struct got_object_aid *aid = find_element(set, id);
+	return aid ? 1 : 0;
 }
 
 const struct got_error *
@@ -262,17 +280,31 @@ got_object_idset_for_each(struct got_object_idset *set
     void *arg)
 {
 	const struct got_error *err = NULL;
-	struct got_object_id_queue *head;
-	struct got_object_qid *qid, *tmp;
-	size_t i;
+	struct got_object_id_array *head;
+	struct got_object_aid *aid;
+	size_t i, j, nids, totelem;
 
 	set->flags |= GOT_OBJECT_IDSET_F_TRAVERSAL;
 	for (i = 0; i < set->nbuckets; i++) {
 		head = &set->ids[i];
-		STAILQ_FOREACH_SAFE(qid, head, entry, tmp) {
-			err = (*cb)(&qid->id, qid->data, arg);
+		j = 0;
+		while (j < head->nids) {
+			aid = &head->ids[j];
+			nids = head->nids;
+			totelem = set->totelem;
+			err = (*cb)(&aid->id, aid->data, arg);
 			if (err)
 				goto done;
+			if (set->totelem >= totelem && head->nids >= nids) {
+				j++;
+			} else if (head->nids != nids - 1 ||
+			    set->totelem != totelem - 1) {
+				err = got_error_msg(GOT_ERR_NOT_IMPL,
+				    "removing multiple set elements per "
+				    "traversal-callback invocation is not "
+				    "supported");
+				goto done;
+			}
 		}
 	}
 done:
blob - /dev/null
blob + ac6746f322d9cb47e6675230686870ced44a4569 (mode 644)
--- /dev/null
+++ lib/object_id_array.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2022 Stefan Sperling <stsp@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/types.h>
+#include <sys/queue.h>
+
+#include <sha1.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#include "got_error.h"
+#include "got_object.h"
+
+#include "got_lib_delta.h"
+#include "got_lib_object.h"
+
+void
+got_object_id_array_init(struct got_object_id_array *array,
+    size_t alloc_chunk_size)
+{
+	array->ids = NULL;
+	array->alloc_chunk_size = alloc_chunk_size;
+	array->nalloc = 0;
+	array->nids = 0;
+}
+
+const struct got_error *
+got_object_id_array_add(struct got_object_id_array *array,
+    struct got_object_id *id, void *data)
+{
+	struct got_object_aid *aid;
+
+	if (array->alloc_chunk_size == 0) {
+		return got_error_msg(GOT_ERR_NOT_IMPL,
+		    "alloc_chunk_size not set");
+	}
+
+	if (array->ids == NULL) {
+		array->ids = reallocarray(NULL, array->alloc_chunk_size,
+		    sizeof(*array->ids));
+		if (array->ids == NULL)
+			return got_error_from_errno("reallocarray");
+		array->nalloc = array->alloc_chunk_size;
+		array->nids = 0;
+	} else if (array->nalloc <= array->nids) {
+		struct got_object_aid *new;
+		new = recallocarray(array->ids, array->nalloc,
+		    array->nalloc + array->alloc_chunk_size, sizeof(*new));
+		if (new == NULL)
+			return got_error_from_errno("recallocarray");
+		array->ids = new;
+		array->nalloc += array->alloc_chunk_size;
+	}
+
+	aid = &array->ids[array->nids];
+	memcpy(&aid->id, id, sizeof(aid->id));
+	aid->data = data;
+
+	array->nids++;
+	return NULL;
+}
+
+const struct got_error *
+got_object_id_array_remove(void **data,
+    struct got_object_id_array *array, int i)
+{
+	if (array->nids == 0)
+		return got_error(GOT_ERR_NO_OBJ);
+	if (i >= array->nids)
+		return got_error_msg(GOT_ERR_RANGE, "index out of range");
+
+	if (data)
+		*data = array->ids[i].data;
+
+	if (i < array->nids - 1) {
+		memcpy(&array->ids[i], &array->ids[array->nids - 1],
+		    sizeof(array->ids[0]));
+	}
+
+	array->nids--;
+	return NULL;
+}
+
+void
+got_object_id_array_free(struct got_object_id_array *array)
+{
+	free(array->ids);
+	array->ids = NULL;
+	array->nalloc = 0;
+	array->nids = 0;
+}
blob - 3af9d7f87c01965205fc69a98015a1887233eb15
blob + c37cba346134465f847d43dedb56976fa6b879b5
--- libexec/got-index-pack/Makefile
+++ libexec/got-index-pack/Makefile
@@ -4,7 +4,8 @@ SRCS=		got-index-pack.c error.c inflate.c object_parse
 
 PROG=		got-index-pack
 SRCS=		got-index-pack.c error.c inflate.c object_parse.c object_idset.c \
-		delta_cache.c delta.c pack.c path.c privsep.c sha1.c ratelimit.c
+		delta_cache.c delta.c pack.c path.c privsep.c sha1.c ratelimit.c \
+		object_id_array.c
 
 CPPFLAGS = -I${.CURDIR}/../../include -I${.CURDIR}/../../lib
 
blob - ceb42b9503fa151a6a92e2037cea299350be312c
blob + 72ae2a2b30048d924ddb3153af8da543c8a9a65d
--- libexec/got-read-pack/Makefile
+++ libexec/got-read-pack/Makefile
@@ -5,7 +5,7 @@ SRCS=		got-read-pack.c delta.c error.c inflate.c objec
 PROG=		got-read-pack
 SRCS=		got-read-pack.c delta.c error.c inflate.c object_cache.c \
 		object_idset.c object_parse.c opentemp.c pack.c path.c \
-		privsep.c sha1.c delta_cache.c
+		privsep.c sha1.c delta_cache.c object_id_array.c
 
 CPPFLAGS = -I${.CURDIR}/../../include -I${.CURDIR}/../../lib
 
blob - f52c80673381058fa605428772575e17f2276fd9
blob + 6647028d3aa65b3422f5a098e68ff901b1848ebd
--- regress/fetch/Makefile
+++ regress/fetch/Makefile
@@ -6,7 +6,7 @@ SRCS = error.c privsep.c reference.c sha1.c object.c o
 	deflate.c delta.c delta_cache.c object_idset.c object_create.c \
 	fetch.c gotconfig.c dial.c fetch_test.c bloom.c murmurhash2.c sigs.c \
 	buf.c date.c object_open_privsep.c read_gitconfig_privsep.c \
-	read_gotconfig_privsep.c
+	read_gotconfig_privsep.c object_id_array.c
 
 CPPFLAGS = -I${.CURDIR}/../../include -I${.CURDIR}/../../lib
 LDADD = -lutil -lz -lm
blob - 6dc6aa8d5dacb2eb47651672d28391cfb1ebd270
blob + 5dd33fd59b18ed0221a8a143594fa3cb541fa1df
--- regress/idset/Makefile
+++ regress/idset/Makefile
@@ -2,7 +2,7 @@ SRCS = error.c sha1.c object_idset.c inflate.c path.c 
 
 PROG = idset_test
 SRCS = error.c sha1.c object_idset.c inflate.c path.c object_parse.c \
-	idset_test.c
+	idset_test.c object_id_array.c
 
 CPPFLAGS = -I${.CURDIR}/../../include -I${.CURDIR}/../../lib
 LDADD = -lutil -lz
blob - 77026c9204de82a647b51112fe7f475d83c9e480
blob + 9e59312ec2b06010febeec91e8f4430944bc0687
--- tog/Makefile
+++ tog/Makefile
@@ -14,7 +14,7 @@ SRCS=		tog.c blame.c commit_graph.c delta.c diff.c \
 		diff_output_unidiff.c diff_output_edscript.c \
 		diff_patience.c bloom.c murmurhash2.c sigs.c date.c \
 		object_open_privsep.c read_gitconfig_privsep.c \
-		read_gotconfig_privsep.c
+		read_gotconfig_privsep.c object_id_array.c
 MAN =		${PROG}.1
 
 CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib