From: Stefan Sperling Subject: convert delta cache to hash table To: gameoftrees@openbsd.org Date: Tue, 31 May 2022 14:37:53 +0200 This patch converts the delta cache from a linked list to a hash table. got-read-pack and got-index-pack are now faster because the cache is more efficient and less time is spent decompressing small deltas over and over again. The cache will now store a lot more deltas. To limit memory consumption only relatively small deltas are stored, up to 2048 bytes in size. This threshold seems to work well. The average delta size in OpenBSD src.git is about 1600 bytes (determined roughly by listing the largest pack file with 'gotadmin ls', adding up all the size values, and dividing this total size by the number of objects in the pack file). We could tweak delta cache parameters in the future if we find the current settings to be suboptimal. Tweaking the global constants at the top of the file should be good enough for experimentation. The important effects to look for are time spent creating and/or indexing a pack file, and memory usage. ok? diff 83bd17334fecebef56255e68a393bcd22320124a 7da6f95cb933db2a62164f07dfe889da775f6ab7 blob - a8493a5806aaa56fb238ea9b39b593a778931da1 blob + 3d1986bb2cda3891bde5245bde2461c960e932e1 --- lib/delta_cache.c +++ lib/delta_cache.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include "got_object.h" #include "got_error.h" @@ -36,80 +38,152 @@ #define nitems(_a) (sizeof(_a) / sizeof((_a)[0])) #endif -struct got_delta_cache_element { - TAILQ_ENTRY(got_delta_cache_element) entry; - off_t delta_data_offset; - uint8_t *delta_data; - size_t delta_len; +#define GOT_DELTA_CACHE_MIN_BUCKETS 64 +#define GOT_DELTA_CACHE_MAX_BUCKETS 16384 +#define GOT_DELTA_CACHE_MAX_CHAIN 2 +#define GOT_DELTA_CACHE_MAX_DELTA_SIZE 2048 + +struct got_cached_delta { + off_t offset; + uint8_t *data; + size_t len; }; -TAILQ_HEAD(got_delta_cache_head, got_delta_cache_element); +struct got_delta_cache_head { + struct got_cached_delta entries[GOT_DELTA_CACHE_MAX_CHAIN]; + unsigned int nchain; +}; struct got_delta_cache { - struct got_delta_cache_head entries; - int nelem; - int maxelem; - size_t maxelemsize; + struct got_delta_cache_head *buckets; + unsigned int nbuckets; + unsigned int totelem; int cache_search; int cache_hit; int cache_miss; int cache_evict; int cache_toolarge; + unsigned int flags; +#define GOT_DELTA_CACHE_F_NOMEM 0x01 + SIPHASH_KEY key; }; -struct got_delta_cache * -got_delta_cache_alloc(int maxelem, size_t maxelemsize) +const struct got_error * +got_delta_cache_alloc(struct got_delta_cache **new) { + const struct got_error *err; struct got_delta_cache *cache; + *new = NULL; + cache = calloc(1, sizeof(*cache)); if (cache == NULL) - return NULL; + return got_error_from_errno("calloc"); + + cache->buckets = calloc(GOT_DELTA_CACHE_MIN_BUCKETS, + sizeof(cache->buckets[0])); + if (cache->buckets == NULL) { + err = got_error_from_errno("calloc"); + free(cache); + return err; + } + cache->nbuckets = GOT_DELTA_CACHE_MIN_BUCKETS; - TAILQ_INIT(&cache->entries); - cache->maxelem = maxelem; - cache->maxelemsize = maxelemsize; - return cache; + arc4random_buf(&cache->key, sizeof(cache->key)); + *new = cache; + return NULL; } void got_delta_cache_free(struct got_delta_cache *cache) { - struct got_delta_cache_element *entry; + struct got_cached_delta *delta; + unsigned int i; #ifdef GOT_OBJ_CACHE_DEBUG - fprintf(stderr, "%s: delta cache: %d elements, %d searches, %d hits, " - "%d missed, %d evicted, %d too large\n", getprogname(), cache->nelem, - cache->cache_search, cache->cache_hit, cache->cache_miss, - cache->cache_evict, cache->cache_toolarge); + fprintf(stderr, "%s: delta cache: %u elements, %d searches, %d hits, " + "%d missed, %d evicted, %d too large\n", getprogname(), + cache->totelem, cache->cache_search, cache->cache_hit, + cache->cache_miss, cache->cache_evict, cache->cache_toolarge); #endif - while (!TAILQ_EMPTY(&cache->entries)) { - entry = TAILQ_FIRST(&cache->entries); - TAILQ_REMOVE(&cache->entries, entry, entry); - free(entry->delta_data); - free(entry); + for (i = 0; i < cache->nbuckets; i++) { + struct got_delta_cache_head *head; + int j; + head = &cache->buckets[i]; + for (j = 0; j < head->nchain; j++) { + delta = &head->entries[j]; + free(delta->data); + } } + free(cache->buckets); free(cache); } -#ifndef GOT_NO_OBJ_CACHE -static void -remove_least_used_element(struct got_delta_cache *cache) +static uint64_t +delta_cache_hash(struct got_delta_cache *cache, off_t delta_offset) { - struct got_delta_cache_element *entry; + return SipHash24(&cache->key, &delta_offset, sizeof(delta_offset)); +} - if (cache->nelem == 0) - return; +static const struct got_error * +delta_cache_resize(struct got_delta_cache *cache, unsigned int nbuckets) +{ + struct got_delta_cache_head *buckets; + size_t i; - entry = TAILQ_LAST(&cache->entries, got_delta_cache_head); - TAILQ_REMOVE(&cache->entries, entry, entry); - free(entry->delta_data); - free(entry); - cache->nelem--; - cache->cache_evict++; + buckets = calloc(nbuckets, sizeof(buckets[0])); + if (buckets == NULL) { + if (errno != ENOMEM) + return got_error_from_errno("calloc"); + /* Proceed with our current amount of hash buckets. */ + cache->flags |= GOT_DELTA_CACHE_F_NOMEM; + return NULL; + } + + arc4random_buf(&cache->key, sizeof(cache->key)); + + for (i = 0; i < cache->nbuckets; i++) { + unsigned int j; + for (j = 0; j < cache->buckets[i].nchain; j++) { + struct got_delta_cache_head *head; + struct got_cached_delta *delta; + uint64_t idx; + delta = &cache->buckets[i].entries[j]; + idx = delta_cache_hash(cache, delta->offset) % nbuckets; + head = &buckets[idx]; + if (head->nchain < nitems(head->entries)) { + struct got_cached_delta *new_delta; + new_delta = &head->entries[head->nchain]; + memcpy(new_delta, delta, sizeof(*new_delta)); + head->nchain++; + } else + free(delta->data); + } + } + + free(cache->buckets); + cache->buckets = buckets; + cache->nbuckets = nbuckets; + return NULL; } -#endif +static const struct got_error * +delta_cache_grow(struct got_delta_cache *cache) +{ + unsigned int nbuckets; + + if ((cache->flags & GOT_DELTA_CACHE_F_NOMEM) || + cache->nbuckets == GOT_DELTA_CACHE_MAX_BUCKETS) + return NULL; + + if (cache->nbuckets >= GOT_DELTA_CACHE_MAX_BUCKETS / 2) + nbuckets = GOT_DELTA_CACHE_MAX_BUCKETS; + else + nbuckets = cache->nbuckets * 2; + + return delta_cache_resize(cache, nbuckets); +} + const struct got_error * got_delta_cache_add(struct got_delta_cache *cache, off_t delta_data_offset, uint8_t *delta_data, size_t delta_len) @@ -117,26 +191,38 @@ got_delta_cache_add(struct got_delta_cache *cache, #ifdef GOT_NO_OBJ_CACHE return got_error(GOT_ERR_NO_SPACE); #else - struct got_delta_cache_element *entry; + const struct got_error *err = NULL; + struct got_cached_delta *delta; + struct got_delta_cache_head *head; + uint64_t idx; - if (delta_len > cache->maxelemsize) { + if (delta_len > GOT_DELTA_RESULT_SIZE_CACHED_MAX) { cache->cache_toolarge++; return got_error(GOT_ERR_NO_SPACE); } - if (cache->nelem >= cache->maxelem) - remove_least_used_element(cache); + if (cache->nbuckets * 3 < cache->totelem * 4) { + err = delta_cache_grow(cache); + if (err) + return err; + } - entry = calloc(1, sizeof(*entry)); - if (entry == NULL) - return got_error_from_errno("calloc"); + idx = delta_cache_hash(cache, delta_data_offset) % cache->nbuckets; + head = &cache->buckets[idx]; + if (head->nchain >= nitems(head->entries)) { + delta = &head->entries[head->nchain - 1]; + free(delta->data); + memset(delta, 0, sizeof(*delta)); + head->nchain--; + } - entry->delta_data_offset = delta_data_offset; - entry->delta_data = delta_data; - entry->delta_len = delta_len; + delta = &head->entries[head->nchain]; + delta->offset = delta_data_offset; + delta->data = delta_data; + delta->len = delta_len; + head->nchain++; + cache->totelem++; - TAILQ_INSERT_HEAD(&cache->entries, entry, entry); - cache->nelem++; return NULL; #endif } @@ -145,21 +231,33 @@ void got_delta_cache_get(uint8_t **delta_data, size_t *delta_len, struct got_delta_cache *cache, off_t delta_data_offset) { - struct got_delta_cache_element *entry; + uint64_t idx; + struct got_delta_cache_head *head; + struct got_cached_delta *delta; + int i; + idx = delta_cache_hash(cache, delta_data_offset) % cache->nbuckets; + head = &cache->buckets[idx]; + cache->cache_search++; *delta_data = NULL; *delta_len = 0; - TAILQ_FOREACH(entry, &cache->entries, entry) { - if (entry->delta_data_offset != delta_data_offset) + for (i = 0; i < head->nchain; i++) { + delta = &head->entries[i]; + if (delta->offset != delta_data_offset) continue; cache->cache_hit++; - if (entry != TAILQ_FIRST(&cache->entries)) { - TAILQ_REMOVE(&cache->entries, entry, entry); - TAILQ_INSERT_HEAD(&cache->entries, entry, entry); + if (i > 0) { + struct got_cached_delta tmp; + memcpy(&tmp, &head->entries[0], sizeof(tmp)); + memcpy(&head->entries[0], &head->entries[i], + sizeof(head->entries[0])); + memcpy(&head->entries[i], &tmp, + sizeof(head->entries[i])); + delta = &head->entries[0]; } - *delta_data = entry->delta_data; - *delta_len = entry->delta_len; + *delta_data = delta->data; + *delta_len = delta->len; return; } blob - 39cb23dc018390d1de373fde1dc990faa76c21e2 blob + 7da86957201fd53ae69d5deb71b64d3bf6216599 --- lib/got_lib_delta_cache.h +++ lib/got_lib_delta_cache.h @@ -16,7 +16,7 @@ struct got_delta_cache; -struct got_delta_cache *got_delta_cache_alloc(int, size_t); +const struct got_error *got_delta_cache_alloc(struct got_delta_cache **); void got_delta_cache_free(struct got_delta_cache *); const struct got_error *got_delta_cache_add(struct got_delta_cache *, off_t, blob - 8c98a684210fa971a6971574e89365cac98a43db blob + a6b975ef7bcdec20b95cc40a92b212aba60abcf5 --- libexec/got-index-pack/got-index-pack.c +++ libexec/got-index-pack/got-index-pack.c @@ -1009,12 +1009,9 @@ main(int argc, char **argv) memset(&pack, 0, sizeof(pack)); pack.fd = -1; - pack.delta_cache = got_delta_cache_alloc(500, - GOT_DELTA_RESULT_SIZE_CACHED_MAX); - if (pack.delta_cache == NULL) { - err = got_error_from_errno("got_delta_cache_alloc"); + err = got_delta_cache_alloc(&pack.delta_cache); + if (err) goto done; - } imsg_init(&ibuf, GOT_IMSG_FD_CHILD); #ifndef PROFILE blob - ea9a7e564cb840af0d6a61c8c8e0cf5ed3d93147 blob + 63bd0d3fe2ed9ec49659d7fcd00aa9f1412aacab --- libexec/got-read-pack/got-read-pack.c +++ libexec/got-read-pack/got-read-pack.c @@ -1185,12 +1185,9 @@ receive_pack(struct got_pack **packp, struct imsgbuf * goto done; } - pack->delta_cache = got_delta_cache_alloc(100, - GOT_DELTA_RESULT_SIZE_CACHED_MAX); - if (pack->delta_cache == NULL) { - err = got_error_from_errno("got_delta_cache_alloc"); + err = got_delta_cache_alloc(&pack->delta_cache); + if (err) goto done; - } #ifndef GOT_PACK_NO_MMAP pack->map = mmap(NULL, pack->filesize, PROT_READ, MAP_PRIVATE,