"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
convert delta cache to hash table
To:
gameoftrees@openbsd.org
Date:
Tue, 31 May 2022 14:37:53 +0200

Download raw body.

Thread
This patch converts the delta cache from a linked list to a hash table.

got-read-pack and got-index-pack are now faster because the cache is
more efficient and less time is spent decompressing small deltas over
and over again.

The cache will now store a lot more deltas. To limit memory consumption
only relatively small deltas are stored, up to 2048 bytes in size.
This threshold seems to work well. The average delta size in OpenBSD
src.git is about 1600 bytes (determined roughly by listing the largest
pack file with 'gotadmin ls', adding up all the size values, and dividing
this total size by the number of objects in the pack file).

We could tweak delta cache parameters in the future if we find the
current settings to be suboptimal. Tweaking the global constants at
the top of the file should be good enough for experimentation.
The important effects to look for are time spent creating and/or
indexing a pack file, and memory usage.

ok?

diff 83bd17334fecebef56255e68a393bcd22320124a 7da6f95cb933db2a62164f07dfe889da775f6ab7
blob - a8493a5806aaa56fb238ea9b39b593a778931da1
blob + 3d1986bb2cda3891bde5245bde2461c960e932e1
--- lib/delta_cache.c
+++ lib/delta_cache.c
@@ -23,6 +23,8 @@
 #include <zlib.h>
 #include <limits.h>
 #include <time.h>
+#include <errno.h>
+#include <siphash.h>
 
 #include "got_object.h"
 #include "got_error.h"
@@ -36,80 +38,152 @@
 #define nitems(_a) (sizeof(_a) / sizeof((_a)[0]))
 #endif
 
-struct got_delta_cache_element {
-	TAILQ_ENTRY(got_delta_cache_element) entry;
-	off_t delta_data_offset;
-	uint8_t *delta_data;
-	size_t delta_len;
+#define GOT_DELTA_CACHE_MIN_BUCKETS	64
+#define GOT_DELTA_CACHE_MAX_BUCKETS	16384
+#define GOT_DELTA_CACHE_MAX_CHAIN	2
+#define GOT_DELTA_CACHE_MAX_DELTA_SIZE	2048
+
+struct got_cached_delta {
+	off_t offset;
+	uint8_t *data;
+	size_t len;
 };
 
-TAILQ_HEAD(got_delta_cache_head, got_delta_cache_element);
+struct got_delta_cache_head {
+	struct got_cached_delta entries[GOT_DELTA_CACHE_MAX_CHAIN];
+	unsigned int nchain;
+};
 
 struct got_delta_cache {
-	struct got_delta_cache_head entries;
-	int nelem;
-	int maxelem;
-	size_t maxelemsize;
+	struct got_delta_cache_head *buckets;
+	unsigned int nbuckets;
+	unsigned int totelem;
 	int cache_search;
 	int cache_hit;
 	int cache_miss;
 	int cache_evict;
 	int cache_toolarge;
+	unsigned int flags;
+#define GOT_DELTA_CACHE_F_NOMEM	0x01
+	SIPHASH_KEY key;
 };
 
-struct got_delta_cache *
-got_delta_cache_alloc(int maxelem, size_t maxelemsize)
+const struct got_error *
+got_delta_cache_alloc(struct got_delta_cache **new)
 {
+	const struct got_error *err;
 	struct got_delta_cache *cache;
 
+	*new = NULL;
+
 	cache = calloc(1, sizeof(*cache));
 	if (cache == NULL)
-		return NULL;
+		return got_error_from_errno("calloc");
+	
+	cache->buckets = calloc(GOT_DELTA_CACHE_MIN_BUCKETS,
+	    sizeof(cache->buckets[0]));
+	if (cache->buckets == NULL) {
+		err = got_error_from_errno("calloc");
+		free(cache);
+		return err;
+	}
+	cache->nbuckets = GOT_DELTA_CACHE_MIN_BUCKETS;
 
-	TAILQ_INIT(&cache->entries);
-	cache->maxelem = maxelem;
-	cache->maxelemsize = maxelemsize;
-	return cache;
+	arc4random_buf(&cache->key, sizeof(cache->key));
+	*new = cache;
+	return NULL;
 }
 
 void
 got_delta_cache_free(struct got_delta_cache *cache)
 {
-	struct got_delta_cache_element *entry;
+	struct got_cached_delta *delta;
+	unsigned int i;
 
 #ifdef GOT_OBJ_CACHE_DEBUG
-	fprintf(stderr, "%s: delta cache: %d elements, %d searches, %d hits, "
-	    "%d missed, %d evicted, %d too large\n", getprogname(), cache->nelem,
-	    cache->cache_search, cache->cache_hit, cache->cache_miss,
-	    cache->cache_evict, cache->cache_toolarge);
+	fprintf(stderr, "%s: delta cache: %u elements, %d searches, %d hits, "
+	    "%d missed, %d evicted, %d too large\n", getprogname(),
+	    cache->totelem, cache->cache_search, cache->cache_hit,
+	    cache->cache_miss, cache->cache_evict, cache->cache_toolarge);
 #endif
-	while (!TAILQ_EMPTY(&cache->entries)) {
-		entry = TAILQ_FIRST(&cache->entries);
-		TAILQ_REMOVE(&cache->entries, entry, entry);
-		free(entry->delta_data);
-		free(entry);
+	for (i = 0; i < cache->nbuckets; i++) {
+		struct got_delta_cache_head *head;
+		int j;
+		head = &cache->buckets[i];
+		for (j = 0; j < head->nchain; j++) {
+			delta = &head->entries[j];
+			free(delta->data);
+		}
 	}
+	free(cache->buckets);
 	free(cache);
 }
 
-#ifndef GOT_NO_OBJ_CACHE
-static void
-remove_least_used_element(struct got_delta_cache *cache)
+static uint64_t
+delta_cache_hash(struct got_delta_cache *cache, off_t delta_offset)
 {
-	struct got_delta_cache_element *entry;
+	return SipHash24(&cache->key, &delta_offset, sizeof(delta_offset));
+}
 
-	if (cache->nelem == 0)
-		return;
+static const struct got_error *
+delta_cache_resize(struct got_delta_cache *cache, unsigned int nbuckets)
+{
+	struct got_delta_cache_head *buckets;
+	size_t i;
 
-	entry = TAILQ_LAST(&cache->entries, got_delta_cache_head);
-	TAILQ_REMOVE(&cache->entries, entry, entry);
-	free(entry->delta_data);
-	free(entry);
-	cache->nelem--;
-	cache->cache_evict++;
+	buckets = calloc(nbuckets, sizeof(buckets[0]));
+	if (buckets == NULL) {
+		if (errno != ENOMEM)
+			return got_error_from_errno("calloc");
+		/* Proceed with our current amount of hash buckets. */
+		cache->flags |= GOT_DELTA_CACHE_F_NOMEM;
+		return NULL;
+	}
+
+	arc4random_buf(&cache->key, sizeof(cache->key));
+
+	for (i = 0; i < cache->nbuckets; i++) {
+		unsigned int j;
+		for (j = 0; j < cache->buckets[i].nchain; j++) {
+			struct got_delta_cache_head *head;
+			struct got_cached_delta *delta;
+			uint64_t idx;
+			delta = &cache->buckets[i].entries[j];
+			idx = delta_cache_hash(cache, delta->offset) % nbuckets;
+			head = &buckets[idx];
+			if (head->nchain < nitems(head->entries)) {
+				struct got_cached_delta *new_delta;
+				new_delta = &head->entries[head->nchain];
+				memcpy(new_delta, delta, sizeof(*new_delta));
+				head->nchain++;
+			} else
+				free(delta->data);
+		}
+	}
+
+	free(cache->buckets);
+	cache->buckets = buckets;
+	cache->nbuckets = nbuckets;
+	return NULL;
 }
-#endif
 
+static const struct got_error *
+delta_cache_grow(struct got_delta_cache *cache)
+{
+	unsigned int nbuckets;
+
+	if ((cache->flags & GOT_DELTA_CACHE_F_NOMEM) ||
+	    cache->nbuckets == GOT_DELTA_CACHE_MAX_BUCKETS)
+		return NULL;
+
+	if (cache->nbuckets >= GOT_DELTA_CACHE_MAX_BUCKETS / 2)
+		nbuckets = GOT_DELTA_CACHE_MAX_BUCKETS;
+	else
+		nbuckets = cache->nbuckets * 2;
+
+	return delta_cache_resize(cache, nbuckets);
+}
+
 const struct got_error *
 got_delta_cache_add(struct got_delta_cache *cache,
     off_t delta_data_offset, uint8_t *delta_data, size_t delta_len)
@@ -117,26 +191,38 @@ got_delta_cache_add(struct got_delta_cache *cache,
 #ifdef GOT_NO_OBJ_CACHE
 	return got_error(GOT_ERR_NO_SPACE);
 #else
-	struct got_delta_cache_element *entry;
+	const struct got_error *err = NULL;
+	struct got_cached_delta *delta;
+	struct got_delta_cache_head *head;
+	uint64_t idx;
 
-	if (delta_len > cache->maxelemsize) {
+	if (delta_len > GOT_DELTA_RESULT_SIZE_CACHED_MAX) {
 		cache->cache_toolarge++;
 		return got_error(GOT_ERR_NO_SPACE);
 	}
 
-	if (cache->nelem >= cache->maxelem)
-		remove_least_used_element(cache);
+	if (cache->nbuckets * 3 < cache->totelem * 4) {
+		err = delta_cache_grow(cache);
+		if (err)
+			return err;
+	}
 
-	entry = calloc(1, sizeof(*entry));
-	if (entry == NULL)
-		return got_error_from_errno("calloc");
+	idx = delta_cache_hash(cache, delta_data_offset) % cache->nbuckets;
+	head = &cache->buckets[idx];
+	if (head->nchain >= nitems(head->entries)) {
+		delta = &head->entries[head->nchain - 1];
+		free(delta->data);
+		memset(delta, 0, sizeof(*delta));
+		head->nchain--;
+	}
 
-	entry->delta_data_offset = delta_data_offset;
-	entry->delta_data = delta_data;
-	entry->delta_len = delta_len;
+	delta = &head->entries[head->nchain];
+	delta->offset = delta_data_offset;
+	delta->data = delta_data;
+	delta->len = delta_len;
+	head->nchain++;
+	cache->totelem++;
 
-	TAILQ_INSERT_HEAD(&cache->entries, entry, entry);
-	cache->nelem++;
 	return NULL;
 #endif
 }
@@ -145,21 +231,33 @@ void
 got_delta_cache_get(uint8_t **delta_data, size_t *delta_len,
     struct got_delta_cache *cache, off_t delta_data_offset)
 {
-	struct got_delta_cache_element *entry;
+	uint64_t idx;
+	struct got_delta_cache_head *head;
+	struct got_cached_delta *delta;
+	int i;
 
+	idx = delta_cache_hash(cache, delta_data_offset) % cache->nbuckets;
+	head = &cache->buckets[idx];
+
 	cache->cache_search++;
 	*delta_data = NULL;
 	*delta_len = 0;
-	TAILQ_FOREACH(entry, &cache->entries, entry) {
-		if (entry->delta_data_offset != delta_data_offset)
+	for (i = 0; i < head->nchain; i++) {
+		delta = &head->entries[i];
+		if (delta->offset != delta_data_offset)
 			continue;
 		cache->cache_hit++;
-		if (entry != TAILQ_FIRST(&cache->entries)) {
-			TAILQ_REMOVE(&cache->entries, entry, entry);
-			TAILQ_INSERT_HEAD(&cache->entries, entry, entry);
+		if (i > 0) {
+			struct got_cached_delta tmp;
+			memcpy(&tmp, &head->entries[0], sizeof(tmp));
+			memcpy(&head->entries[0], &head->entries[i],
+			    sizeof(head->entries[0]));
+			memcpy(&head->entries[i], &tmp,
+			    sizeof(head->entries[i]));
+			delta = &head->entries[0];
 		}
-		*delta_data = entry->delta_data;
-		*delta_len = entry->delta_len;
+		*delta_data = delta->data;
+		*delta_len = delta->len;
 		return;
 	}
 
blob - 39cb23dc018390d1de373fde1dc990faa76c21e2
blob + 7da86957201fd53ae69d5deb71b64d3bf6216599
--- lib/got_lib_delta_cache.h
+++ lib/got_lib_delta_cache.h
@@ -16,7 +16,7 @@
 
 struct got_delta_cache;
 
-struct got_delta_cache *got_delta_cache_alloc(int, size_t);
+const struct got_error *got_delta_cache_alloc(struct got_delta_cache **);
 void got_delta_cache_free(struct got_delta_cache *);
 
 const struct got_error *got_delta_cache_add(struct got_delta_cache *, off_t,
blob - 8c98a684210fa971a6971574e89365cac98a43db
blob + a6b975ef7bcdec20b95cc40a92b212aba60abcf5
--- libexec/got-index-pack/got-index-pack.c
+++ libexec/got-index-pack/got-index-pack.c
@@ -1009,12 +1009,9 @@ main(int argc, char **argv)
 
 	memset(&pack, 0, sizeof(pack));
 	pack.fd = -1;
-	pack.delta_cache = got_delta_cache_alloc(500,
-	    GOT_DELTA_RESULT_SIZE_CACHED_MAX);
-	if (pack.delta_cache == NULL) {
-		err = got_error_from_errno("got_delta_cache_alloc");
+	err = got_delta_cache_alloc(&pack.delta_cache);
+	if (err)
 		goto done;
-	}
 
 	imsg_init(&ibuf, GOT_IMSG_FD_CHILD);
 #ifndef PROFILE
blob - ea9a7e564cb840af0d6a61c8c8e0cf5ed3d93147
blob + 63bd0d3fe2ed9ec49659d7fcd00aa9f1412aacab
--- libexec/got-read-pack/got-read-pack.c
+++ libexec/got-read-pack/got-read-pack.c
@@ -1185,12 +1185,9 @@ receive_pack(struct got_pack **packp, struct imsgbuf *
 		goto done;
 	}
 
-	pack->delta_cache = got_delta_cache_alloc(100,
-	    GOT_DELTA_RESULT_SIZE_CACHED_MAX);
-	if (pack->delta_cache == NULL) {
-		err = got_error_from_errno("got_delta_cache_alloc");
+	err = got_delta_cache_alloc(&pack->delta_cache);
+	if (err)
 		goto done;
-	}
 
 #ifndef GOT_PACK_NO_MMAP
 	pack->map = mmap(NULL, pack->filesize, PROT_READ, MAP_PRIVATE,