From: Stefan Sperling Subject: rework object-id-set to use arrays instead of lists To: gameoftrees@openbsd.org Date: Tue, 18 Oct 2022 02:42:17 +0200 This patch reworks the object-ID set to use an array of arrays as backing store, instead of using an array of lists. This makes memory allocation on insertion unnecessary, expect of course when an array needs to grow, or when the entire hash table needs to grow. On removal, we can remove any element in O(1) time by swapping in the element at the end of the array (thanks to jrick for this trick). Whereas removing elements from an SLIST is O(n). I haven't yet seen any performance difference while testing with gotadmin pack -a on got.git. Which might not be a useful test case. The ID set is a hash table, so differences between lists and arrays should only matter when there are many hash collisions. My tests so far did not trigger relevant edge cases. One downside is that traversal callbacks need to be more careful about removing elements, but this seems fine for existing use cases. Any preferences to keep the old implementation or switch to this new one? diff aabb25f81b1f8f68a03af422f9ae14ea5c3ae1fd refs/heads/id-array commit - aabb25f81b1f8f68a03af422f9ae14ea5c3ae1fd commit + aeb386809e780d5e142f1730a300a1e30fd48797 blob - 652bddb959b1bd1a528daf84da62d9a2f52d5d51 blob + 95601a4fb0c01d0884348f5fd5f30d49ec755a21 --- got/Makefile +++ got/Makefile @@ -15,7 +15,8 @@ SRCS= got.c blame.c commit_graph.c delta.c diff.c \ diff_patience.c send.c deltify.c pack_create.c dial.c \ bloom.c murmurhash2.c ratelimit.c patch.c sigs.c date.c \ object_open_privsep.c read_gitconfig_privsep.c \ - read_gotconfig_privsep.c pack_create_privsep.c + read_gotconfig_privsep.c pack_create_privsep.c \ + object_id_array.c MAN = ${PROG}.1 got-worktree.5 git-repository.5 got.conf.5 blob - 00e5e77e780621968aa2a18606882840fb9f4d2c blob + c4902dde541b439d591bf793087dd00ed64e7f95 --- gotadmin/Makefile +++ gotadmin/Makefile @@ -11,7 +11,7 @@ SRCS= gotadmin.c \ worktree_open.c sha1.c bloom.c murmurhash2.c ratelimit.c \ sigs.c buf.c date.c object_open_privsep.c \ read_gitconfig_privsep.c read_gotconfig_privsep.c \ - pack_create_privsep.c + pack_create_privsep.c object_id_array.c MAN = ${PROG}.1 CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib blob - 2ed7fd6f24b77b5c61137092e927591fc46d903d blob + 02f0b69812b72cf9a65f0324a44571c8a3862183 --- lib/got_lib_object.h +++ lib/got_lib_object.h @@ -31,6 +31,25 @@ struct got_raw_object { int refcnt; /* > 0 if open and/or cached */ }; +struct got_object_aid { + struct got_object_id id; + void *data; +}; + +struct got_object_id_array { + struct got_object_aid *ids; + size_t alloc_chunk_size; + size_t nalloc; + size_t nids; +}; + +void got_object_id_array_init(struct got_object_id_array *, size_t); +const struct got_error *got_object_id_array_add(struct got_object_id_array *, + struct got_object_id *, void *); +const struct got_error *got_object_id_array_remove(void **, + struct got_object_id_array *, int); +void got_object_id_array_free(struct got_object_id_array *); + struct got_raw_object { FILE *f; /* NULL if data buffer is being used */ int fd; /* -1 unless data buffer is memory-mapped */ blob - f58d1e7dc1d81be2989c4b8114725ca657c9ea3a blob + 40919b10148e8e56ca3af88f99e7a82aeb46ac57 --- lib/object_idset.c +++ lib/object_idset.c @@ -37,9 +37,10 @@ #include "got_lib_object_parse.h" #define GOT_OBJECT_IDSET_MIN_BUCKETS 64 +#define GOT_OBJECT_IDSET_ALLOC_CHUNK_SIZE 16 struct got_object_idset { - struct got_object_id_queue *ids; + struct got_object_id_array *ids; size_t nbuckets; unsigned int totelem; unsigned int flags; @@ -63,9 +64,12 @@ got_object_idset_alloc(void) free(set); return NULL; } - for (i = 0; i < GOT_OBJECT_IDSET_MIN_BUCKETS; i++) - STAILQ_INIT(&set->ids[i]); + for (i = 0; i < GOT_OBJECT_IDSET_MIN_BUCKETS; i++) { + got_object_id_array_init(&set->ids[i], + GOT_OBJECT_IDSET_ALLOC_CHUNK_SIZE); + } + set->totelem = 0; set->nbuckets = GOT_OBJECT_IDSET_MIN_BUCKETS; set->flags = 0; @@ -77,34 +81,31 @@ got_object_idset_free(struct got_object_idset *set) got_object_idset_free(struct got_object_idset *set) { size_t i; - struct got_object_qid *qid; - for (i = 0; i < set->nbuckets; i++) { - while (!STAILQ_EMPTY(&set->ids[i])) { - qid = STAILQ_FIRST(&set->ids[i]); - STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry); - got_object_qid_free(qid); - } - } + for (i = 0; i < set->nbuckets; i++) + got_object_id_array_free(&set->ids[i]); + /* User data should be freed by caller. */ free(set->ids); free(set); } static uint64_t -idset_hash(struct got_object_idset *set, struct got_object_id *id) +idset_hash(SIPHASH_KEY *key, struct got_object_id *id) { - return SipHash24(&set->key, id->sha1, sizeof(id->sha1)); + return SipHash24(key, id->sha1, sizeof(id->sha1)); } static const struct got_error * idset_resize(struct got_object_idset *set, size_t nbuckets) { - struct got_object_id_queue *ids; - size_t i; + const struct got_error *err = NULL; + struct got_object_id_array *new_ids; + size_t i, j; + SIPHASH_KEY new_key; - ids = calloc(nbuckets, sizeof(ids[0])); - if (ids == NULL) { + new_ids = calloc(nbuckets, sizeof(new_ids[0])); + if (new_ids == NULL) { if (errno != ENOMEM) return got_error_from_errno("calloc"); /* Proceed with our current amount of hash buckets. */ @@ -112,26 +113,41 @@ idset_resize(struct got_object_idset *set, size_t nbuc return NULL; } - for (i = 0; i < nbuckets; i++) - STAILQ_INIT(&ids[i]); + for (i = 0; i < nbuckets; i++) { + got_object_id_array_init(&new_ids[i], + GOT_OBJECT_IDSET_ALLOC_CHUNK_SIZE); + } - arc4random_buf(&set->key, sizeof(set->key)); + arc4random_buf(&new_key, sizeof(new_key)); for (i = 0; i < set->nbuckets; i++) { - while (!STAILQ_EMPTY(&set->ids[i])) { - struct got_object_qid *qid; + struct got_object_id_array *array = &set->ids[i]; + for (j = 0; j < array->nids; j++) { + struct got_object_aid *aid = &array->ids[j]; uint64_t idx; - qid = STAILQ_FIRST(&set->ids[i]); - STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry); - idx = idset_hash(set, &qid->id) % nbuckets; - STAILQ_INSERT_HEAD(&ids[idx], qid, entry); + idx = idset_hash(&new_key, &aid->id) % nbuckets; + err = got_object_id_array_add(&new_ids[idx], + &aid->id, aid->data); + if (err) + goto done; } } +done: + if (err) { + if (err->code == GOT_ERR_ERRNO && errno == ENOMEM) + set->flags |= GOT_OBJECT_IDSET_F_NOMEM; + for (i = 0; i < nbuckets; i++) + got_object_id_array_free(&new_ids[i]); + free(new_ids); + } else { + free(set->ids); + set->ids = new_ids; + set->nbuckets = nbuckets; + memcpy(&set->key, &new_key, sizeof(set->key)); + } - free(set->ids); - set->ids = ids; - set->nbuckets = nbuckets; - return NULL; + explicit_bzero(&new_key, sizeof(new_key)); + return err; } static const struct got_error * @@ -155,9 +171,8 @@ got_object_idset_add(struct got_object_idset *set, str void *data) { const struct got_error *err; - struct got_object_qid *qid; uint64_t idx; - struct got_object_id_queue *head; + struct got_object_id_array *head; /* This function may resize the set. */ if (set->flags & GOT_OBJECT_IDSET_F_TRAVERSAL) @@ -167,15 +182,11 @@ got_object_idset_add(struct got_object_idset *set, str if (set->totelem == UINT_MAX) return got_error(GOT_ERR_NO_SPACE); - err = got_object_qid_alloc_partial(&qid); - if (err) - return err; - memcpy(&qid->id, id, sizeof(qid->id)); - qid->data = data; - - idx = idset_hash(set, id) % set->nbuckets; + idx = idset_hash(&set->key, id) % set->nbuckets; head = &set->ids[idx]; - STAILQ_INSERT_HEAD(head, qid, entry); + err = got_object_id_array_add(head, id, data); + if (err) + return err; set->totelem++; if (set->nbuckets < set->totelem) @@ -184,16 +195,17 @@ static struct got_object_qid * return err; } -static struct got_object_qid * +static struct got_object_aid * find_element(struct got_object_idset *set, struct got_object_id *id) { - uint64_t idx = idset_hash(set, id) % set->nbuckets; - struct got_object_id_queue *head = &set->ids[idx]; - struct got_object_qid *qid; + uint64_t idx = idset_hash(&set->key, id) % set->nbuckets; + struct got_object_id_array *head = &set->ids[idx]; + size_t i; - STAILQ_FOREACH(qid, head, entry) { - if (got_object_id_cmp(&qid->id, id) == 0) - return qid; + for (i = 0; i < head->nids; i++) { + struct got_object_aid *aid = &head->ids[i]; + if (got_object_id_cmp(&aid->id, id) == 0) + return aid; } return NULL; @@ -202,17 +214,19 @@ got_object_idset_get(struct got_object_idset *set, str void * got_object_idset_get(struct got_object_idset *set, struct got_object_id *id) { - struct got_object_qid *qid = find_element(set, id); - return qid ? qid->data : NULL; + struct got_object_aid *aid = find_element(set, id); + return aid ? aid->data : NULL; } const struct got_error * got_object_idset_remove(void **data, struct got_object_idset *set, struct got_object_id *id) { + const struct got_error *err = NULL; uint64_t idx; - struct got_object_id_queue *head; - struct got_object_qid *qid; + struct got_object_id_array *head; + struct got_object_aid *aid = NULL; + size_t i; if (data) *data = NULL; @@ -224,27 +238,31 @@ got_object_idset_remove(void **data, struct got_object /* Remove a "random" element. */ for (idx = 0; idx < set->nbuckets; idx++) { head = &set->ids[idx]; - qid = STAILQ_FIRST(head); - if (qid) + if (head->nids > 0) { + i = head->nids - 1; + aid = &head->ids[i]; break; + } } + if (aid == NULL) + return got_error(GOT_ERR_NO_OBJ); } else { - idx = idset_hash(set, id) % set->nbuckets; + idx = idset_hash(&set->key, id) % set->nbuckets; head = &set->ids[idx]; - STAILQ_FOREACH(qid, head, entry) { - if (got_object_id_cmp(&qid->id, id) == 0) + for (i = 0; i < head->nids; i++) { + aid = &head->ids[i]; + if (got_object_id_cmp(&aid->id, id) == 0) break; } - if (qid == NULL) + if (aid == NULL) return got_error_no_obj(id); } - if (data) - *data = qid->data; - STAILQ_REMOVE(head, qid, got_object_qid, entry); - got_object_qid_free(qid); + err = got_object_id_array_remove(data, head, i); + if (err) + return err; + set->totelem--; - return NULL; } @@ -252,8 +270,8 @@ got_object_idset_contains(struct got_object_idset *set got_object_idset_contains(struct got_object_idset *set, struct got_object_id *id) { - struct got_object_qid *qid = find_element(set, id); - return qid ? 1 : 0; + struct got_object_aid *aid = find_element(set, id); + return aid ? 1 : 0; } const struct got_error * @@ -262,17 +280,31 @@ got_object_idset_for_each(struct got_object_idset *set void *arg) { const struct got_error *err = NULL; - struct got_object_id_queue *head; - struct got_object_qid *qid, *tmp; - size_t i; + struct got_object_id_array *head; + struct got_object_aid *aid; + size_t i, j, nids, totelem; set->flags |= GOT_OBJECT_IDSET_F_TRAVERSAL; for (i = 0; i < set->nbuckets; i++) { head = &set->ids[i]; - STAILQ_FOREACH_SAFE(qid, head, entry, tmp) { - err = (*cb)(&qid->id, qid->data, arg); + j = 0; + while (j < head->nids) { + aid = &head->ids[j]; + nids = head->nids; + totelem = set->totelem; + err = (*cb)(&aid->id, aid->data, arg); if (err) goto done; + if (set->totelem >= totelem && head->nids >= nids) { + j++; + } else if (head->nids != nids - 1 || + set->totelem != totelem - 1) { + err = got_error_msg(GOT_ERR_NOT_IMPL, + "removing multiple set elements per " + "traversal-callback invocation is not " + "supported"); + goto done; + } } } done: blob - /dev/null blob + ac6746f322d9cb47e6675230686870ced44a4569 (mode 644) --- /dev/null +++ lib/object_id_array.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2022 Stefan Sperling + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "got_error.h" +#include "got_object.h" + +#include "got_lib_delta.h" +#include "got_lib_object.h" + +void +got_object_id_array_init(struct got_object_id_array *array, + size_t alloc_chunk_size) +{ + array->ids = NULL; + array->alloc_chunk_size = alloc_chunk_size; + array->nalloc = 0; + array->nids = 0; +} + +const struct got_error * +got_object_id_array_add(struct got_object_id_array *array, + struct got_object_id *id, void *data) +{ + struct got_object_aid *aid; + + if (array->alloc_chunk_size == 0) { + return got_error_msg(GOT_ERR_NOT_IMPL, + "alloc_chunk_size not set"); + } + + if (array->ids == NULL) { + array->ids = reallocarray(NULL, array->alloc_chunk_size, + sizeof(*array->ids)); + if (array->ids == NULL) + return got_error_from_errno("reallocarray"); + array->nalloc = array->alloc_chunk_size; + array->nids = 0; + } else if (array->nalloc <= array->nids) { + struct got_object_aid *new; + new = recallocarray(array->ids, array->nalloc, + array->nalloc + array->alloc_chunk_size, sizeof(*new)); + if (new == NULL) + return got_error_from_errno("recallocarray"); + array->ids = new; + array->nalloc += array->alloc_chunk_size; + } + + aid = &array->ids[array->nids]; + memcpy(&aid->id, id, sizeof(aid->id)); + aid->data = data; + + array->nids++; + return NULL; +} + +const struct got_error * +got_object_id_array_remove(void **data, + struct got_object_id_array *array, int i) +{ + if (array->nids == 0) + return got_error(GOT_ERR_NO_OBJ); + if (i >= array->nids) + return got_error_msg(GOT_ERR_RANGE, "index out of range"); + + if (data) + *data = array->ids[i].data; + + if (i < array->nids - 1) { + memcpy(&array->ids[i], &array->ids[array->nids - 1], + sizeof(array->ids[0])); + } + + array->nids--; + return NULL; +} + +void +got_object_id_array_free(struct got_object_id_array *array) +{ + free(array->ids); + array->ids = NULL; + array->nalloc = 0; + array->nids = 0; +} blob - 3af9d7f87c01965205fc69a98015a1887233eb15 blob + c37cba346134465f847d43dedb56976fa6b879b5 --- libexec/got-index-pack/Makefile +++ libexec/got-index-pack/Makefile @@ -4,7 +4,8 @@ SRCS= got-index-pack.c error.c inflate.c object_parse PROG= got-index-pack SRCS= got-index-pack.c error.c inflate.c object_parse.c object_idset.c \ - delta_cache.c delta.c pack.c path.c privsep.c sha1.c ratelimit.c + delta_cache.c delta.c pack.c path.c privsep.c sha1.c ratelimit.c \ + object_id_array.c CPPFLAGS = -I${.CURDIR}/../../include -I${.CURDIR}/../../lib blob - ceb42b9503fa151a6a92e2037cea299350be312c blob + 72ae2a2b30048d924ddb3153af8da543c8a9a65d --- libexec/got-read-pack/Makefile +++ libexec/got-read-pack/Makefile @@ -5,7 +5,7 @@ SRCS= got-read-pack.c delta.c error.c inflate.c objec PROG= got-read-pack SRCS= got-read-pack.c delta.c error.c inflate.c object_cache.c \ object_idset.c object_parse.c opentemp.c pack.c path.c \ - privsep.c sha1.c delta_cache.c + privsep.c sha1.c delta_cache.c object_id_array.c CPPFLAGS = -I${.CURDIR}/../../include -I${.CURDIR}/../../lib blob - f52c80673381058fa605428772575e17f2276fd9 blob + 6647028d3aa65b3422f5a098e68ff901b1848ebd --- regress/fetch/Makefile +++ regress/fetch/Makefile @@ -6,7 +6,7 @@ SRCS = error.c privsep.c reference.c sha1.c object.c o deflate.c delta.c delta_cache.c object_idset.c object_create.c \ fetch.c gotconfig.c dial.c fetch_test.c bloom.c murmurhash2.c sigs.c \ buf.c date.c object_open_privsep.c read_gitconfig_privsep.c \ - read_gotconfig_privsep.c + read_gotconfig_privsep.c object_id_array.c CPPFLAGS = -I${.CURDIR}/../../include -I${.CURDIR}/../../lib LDADD = -lutil -lz -lm blob - 6dc6aa8d5dacb2eb47651672d28391cfb1ebd270 blob + 5dd33fd59b18ed0221a8a143594fa3cb541fa1df --- regress/idset/Makefile +++ regress/idset/Makefile @@ -2,7 +2,7 @@ SRCS = error.c sha1.c object_idset.c inflate.c path.c PROG = idset_test SRCS = error.c sha1.c object_idset.c inflate.c path.c object_parse.c \ - idset_test.c + idset_test.c object_id_array.c CPPFLAGS = -I${.CURDIR}/../../include -I${.CURDIR}/../../lib LDADD = -lutil -lz blob - 77026c9204de82a647b51112fe7f475d83c9e480 blob + 9e59312ec2b06010febeec91e8f4430944bc0687 --- tog/Makefile +++ tog/Makefile @@ -14,7 +14,7 @@ SRCS= tog.c blame.c commit_graph.c delta.c diff.c \ diff_output_unidiff.c diff_output_edscript.c \ diff_patience.c bloom.c murmurhash2.c sigs.c date.c \ object_open_privsep.c read_gitconfig_privsep.c \ - read_gotconfig_privsep.c + read_gotconfig_privsep.c object_id_array.c MAN = ${PROG}.1 CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib