Download raw body.
On 23-04-23 11:40PM, Stefan Sperling wrote:
> On Wed, Apr 19, 2023 at 02:14:36PM +0200, Stefan Sperling wrote:
> > The patch below implements fulltext caching in the delta cache.
> > This is a proof of concept, not ready for commit mostly because we would
> > need to carefully evaluate the various constants that control memory usage.
> > For now, I've sized the cache such that all fulltexts in your test case
> > will fit.
> >
> > This shaves off about 10% of time spent inflating data.
> > But got-read-pack uses about 300MB of memory now, instead of about 150MB.
>
> Here is a new version of this diff with more conservative memory usage.
>
> Together with another diff I will send out to optimize the search for
> changed paths this helps make 'got log path' about 30%-50% faster with
> one single large pack file in freebsd-ports.git.
Very nice! These make a marked improvement.
A couple tiny style nits below if you'd like to polish now or later,
either way is ok for me
> -----------------------------------------------
> cache fulltext data in delta cache to improve speed with long delta chains
>
> diff 91db220264b6d9be6d44223dd76ae8eb9bea3641 24e3f99b1f2ea548f27d6d72325797c8b1fe3b08
> commit - 91db220264b6d9be6d44223dd76ae8eb9bea3641
> commit + 24e3f99b1f2ea548f27d6d72325797c8b1fe3b08
> blob - a88ea5659f6ff7be69d846ef2825a6e32b5963a7
> blob + aaa03170319bd65b941e76c4152b6d5720bdcd5a
> --- lib/delta_cache.c
> +++ lib/delta_cache.c
> @@ -40,15 +40,19 @@
> #define nitems(_a) (sizeof(_a) / sizeof((_a)[0]))
> #endif
>
> -#define GOT_DELTA_CACHE_MIN_BUCKETS 64
> -#define GOT_DELTA_CACHE_MAX_BUCKETS 2048
> -#define GOT_DELTA_CACHE_MAX_CHAIN 2
> -#define GOT_DELTA_CACHE_MAX_DELTA_SIZE 1024
> +#define GOT_DELTA_CACHE_MIN_BUCKETS 64
> +#define GOT_DELTA_CACHE_MAX_BUCKETS 2048
> +#define GOT_DELTA_CACHE_MAX_CHAIN 2
> +#define GOT_DELTA_CACHE_MAX_DELTA_SIZE 1024
> +#define GOT_DELTA_CACHE_MAX_FULLTEXT_SIZE 524288
>
> +
> struct got_cached_delta {
> off_t offset;
> uint8_t *data;
> size_t len;
> + uint8_t *fulltext;
> + size_t fulltext_len;
> };
>
> struct got_delta_cache_head {
> @@ -62,10 +66,13 @@ struct got_delta_cache {
> unsigned int totelem;
> int cache_search;
> int cache_hit;
> + int cache_hit_fulltext;
> int cache_miss;
> int cache_evict;
> int cache_toolarge;
> + int cache_toolarge_fulltext;
> int cache_maxtoolarge;
> + int cache_maxtoolarge_fulltext;
> unsigned int flags;
> #define GOT_DELTA_CACHE_F_NOMEM 0x01
> SIPHASH_KEY key;
> @@ -105,10 +112,14 @@ got_delta_cache_free(struct got_delta_cache *cache)
>
> #ifdef GOT_DELTA_CACHE_DEBUG
> fprintf(stderr, "%s: delta cache: %u elements, %d searches, %d hits, "
> - "%d missed, %d evicted, %d too large (max %d)\n", getprogname(),
> - cache->totelem, cache->cache_search, cache->cache_hit,
> + "%d fulltext hits, %d missed, %d evicted, %d too large (max %d), "
> + "%d too large fulltext (max %d)\n",
> + getprogname(), cache->totelem, cache->cache_search,
> + cache->cache_hit, cache->cache_hit_fulltext,
> cache->cache_miss, cache->cache_evict, cache->cache_toolarge,
> - cache->cache_maxtoolarge);
> + cache->cache_maxtoolarge,
> + cache->cache_toolarge_fulltext,
> + cache->cache_maxtoolarge_fulltext);
> #endif
> for (i = 0; i < cache->nbuckets; i++) {
> struct got_delta_cache_head *head;
> @@ -222,6 +233,7 @@ got_delta_cache_add(struct got_delta_cache *cache,
> if (head->nchain >= nitems(head->entries)) {
> delta = &head->entries[head->nchain - 1];
> free(delta->data);
> + free(delta->fulltext);
> memset(delta, 0, sizeof(*delta));
> head->nchain--;
> cache->totelem--;
> @@ -232,6 +244,8 @@ got_delta_cache_add(struct got_delta_cache *cache,
> delta->offset = delta_data_offset;
> delta->data = delta_data;
> delta->len = delta_len;
> + delta->fulltext = NULL;
> + delta->fulltext_len = 0;
> head->nchain++;
> cache->totelem++;
>
> @@ -239,8 +253,56 @@ void
> #endif
> }
>
> +const struct got_error *
> +got_delta_cache_add_fulltext(struct got_delta_cache *cache,
> + off_t delta_data_offset, uint8_t *fulltext, size_t fulltext_len)
> +{
> +#ifdef GOT_NO_DELTA_CACHE
> + return got_error(GOT_ERR_NO_SPACE);
> +#else
> + struct got_cached_delta *delta;
> + struct got_delta_cache_head *head;
> + uint64_t idx;
> + int i;
> +
> + if (fulltext_len > GOT_DELTA_CACHE_MAX_FULLTEXT_SIZE) {
> + cache->cache_toolarge_fulltext++;
> + if (fulltext_len > cache->cache_maxtoolarge)
> + cache->cache_maxtoolarge_fulltext = fulltext_len;
> + return got_error(GOT_ERR_NO_SPACE);
> + }
> +
> + idx = delta_cache_hash(cache, delta_data_offset) % cache->nbuckets;
> + head = &cache->buckets[idx];
> +
> + for (i = 0; i < head->nchain; i++) {
> + delta = &head->entries[i];
> + if (delta->offset != delta_data_offset)
> + continue;
> + if (i > 0) {
> + struct got_cached_delta tmp;
please insert blank line following declaration
> + memcpy(&tmp, &head->entries[0], sizeof(tmp));
> + memcpy(&head->entries[0], &head->entries[i],
> + sizeof(head->entries[0]));
> + memcpy(&head->entries[i], &tmp,
> + sizeof(head->entries[i]));
> + delta = &head->entries[0];
> + }
> + delta->fulltext = malloc(fulltext_len);
> + if (delta->fulltext == NULL)
> + return got_error_from_errno("malloc");
> + memcpy(delta->fulltext, fulltext, fulltext_len);
> + delta->fulltext_len = fulltext_len;
> + break;
> + }
> +
> + return NULL;
> +#endif
> +}
> +
> void
> got_delta_cache_get(uint8_t **delta_data, size_t *delta_len,
> + uint8_t **fulltext, size_t *fulltext_len,
> struct got_delta_cache *cache, off_t delta_data_offset)
> {
> uint64_t idx;
> @@ -254,6 +316,10 @@ got_delta_cache_get(uint8_t **delta_data, size_t *delt
> cache->cache_search++;
> *delta_data = NULL;
> *delta_len = 0;
> + if (fulltext)
> + *fulltext = NULL;
> + if (fulltext_len)
> + *fulltext_len = 0;
> for (i = 0; i < head->nchain; i++) {
> delta = &head->entries[i];
> if (delta->offset != delta_data_offset)
> @@ -270,6 +336,13 @@ got_delta_cache_get(uint8_t **delta_data, size_t *delt
> }
> *delta_data = delta->data;
> *delta_len = delta->len;
> + if (fulltext && fulltext_len &&
> + delta->fulltext && delta->fulltext_len) {
> + *fulltext = delta->fulltext;
> + *fulltext_len = delta->fulltext_len;
> + cache->cache_hit_fulltext++;
> + }
> +
> return;
> }
>
> blob - 7da86957201fd53ae69d5deb71b64d3bf6216599
> blob + 03111a87e65fe8d7de030236276b4a0961ab8d05
> --- lib/got_lib_delta_cache.h
> +++ lib/got_lib_delta_cache.h
> @@ -21,4 +21,7 @@ void got_delta_cache_get(uint8_t **, size_t *, struct
>
> const struct got_error *got_delta_cache_add(struct got_delta_cache *, off_t,
> uint8_t *, size_t);
> -void got_delta_cache_get(uint8_t **, size_t *, struct got_delta_cache *, off_t);
> +const struct got_error *got_delta_cache_add_fulltext(struct got_delta_cache *,
> + off_t , uint8_t *, size_t);
> +void got_delta_cache_get(uint8_t **, size_t *, uint8_t **, size_t *,
> + struct got_delta_cache *, off_t);
> blob - a9b71c809585eb6a270c9f49e602be196d76f3ad
> blob + 4bef4fa0fe94d62da5c1003246d35e196a4e04dc
> --- lib/pack.c
> +++ lib/pack.c
> @@ -1316,7 +1316,8 @@ got_pack_get_delta_chain_max_size(uint64_t *max_size,
>
> if (pack->delta_cache) {
> got_delta_cache_get(&delta_buf, &delta_len,
> - pack->delta_cache, delta->data_offset);
> + NULL, NULL, pack->delta_cache,
> + delta->data_offset);
> }
> if (delta_buf == NULL) {
> cached = 0;
> @@ -1370,7 +1371,7 @@ got_pack_dump_delta_chain_to_file(size_t *result_size,
> const struct got_error *err = NULL;
> struct got_delta *delta;
> uint8_t *base_buf = NULL, *accum_buf = NULL;
> - size_t base_bufsz = 0, accum_bufsz = 0, accum_size = 0, delta_len;
> + size_t base_bufsz = 0, accum_bufsz = 0, accum_size = 0;
> /* We process small enough files entirely in memory for speed. */
> const size_t max_bufsize = GOT_DELTA_RESULT_SIZE_CACHED_MAX;
> uint64_t max_size = 0;
> @@ -1381,6 +1382,23 @@ got_pack_dump_delta_chain_to_file(size_t *result_size,
> if (STAILQ_EMPTY(&deltas->entries))
> return got_error(GOT_ERR_BAD_DELTA_CHAIN);
>
> + if (pack->delta_cache) {
> + uint8_t *delta_buf = NULL, *fulltext = NULL;
> + size_t delta_len, fulltext_len;
> +
> + delta = STAILQ_LAST(&deltas->entries, got_delta, entry);
> + got_delta_cache_get(&delta_buf, &delta_len,
> + &fulltext, &fulltext_len,
> + pack->delta_cache, delta->data_offset);
> + if (fulltext) {
> + size_t w = fwrite(fulltext, 1, fulltext_len, outfile);
please don't use function calls in initialisers, and as above leave
a blank line after:
if (fulltext) {
size_t w;
w = fwrite(fulltext, 1, fulltext_len, outfile);
if (w != fulltext_len)
return got_ferror(outfile, GOT_ERR_IO);
...
> + if (w != fulltext_len)
> + return got_ferror(outfile, GOT_ERR_IO);
> + *result_size = fulltext_len;
> + return NULL;
> + }
> + }
> +
> if (fseeko(base_file, 0L, SEEK_SET) == -1)
> return got_error_from_errno("fseeko");
> if (fseeko(accum_file, 0L, SEEK_SET) == -1)
> @@ -1388,7 +1406,8 @@ got_pack_dump_delta_chain_to_file(size_t *result_size,
>
> /* Deltas are ordered in ascending order. */
> STAILQ_FOREACH(delta, &deltas->entries, entry) {
> - uint8_t *delta_buf = NULL;
> + uint8_t *delta_buf = NULL, *fulltext = NULL;
> + size_t delta_len, fulltext_len;
> uint64_t base_size, result_size = 0;
> int cached = 1;
> if (n == 0) {
> @@ -1475,6 +1494,7 @@ got_pack_dump_delta_chain_to_file(size_t *result_size,
>
> if (pack->delta_cache) {
> got_delta_cache_get(&delta_buf, &delta_len,
> + &fulltext, &fulltext_len,
> pack->delta_cache, delta->data_offset);
> }
> if (delta_buf == NULL) {
> @@ -1506,6 +1526,8 @@ got_pack_dump_delta_chain_to_file(size_t *result_size,
> max_size = base_size;
> if (result_size > max_size)
> max_size = result_size;
> + if (fulltext_len > max_size)
> + max_size = fulltext_len;
>
> if (base_buf && max_size > max_bufsize) {
> /* Switch from buffers to temporary files. */
> @@ -1548,21 +1570,41 @@ got_pack_dump_delta_chain_to_file(size_t *result_size,
> }
>
> if (base_buf) {
> - err = got_delta_apply_in_mem(base_buf, base_bufsz,
> - delta_buf, delta_len, accum_buf,
> - &accum_size, max_size);
> + if (fulltext) {
> + memcpy(accum_buf, fulltext, fulltext_len);
> + accum_size = fulltext_len;
> + err = NULL;
> + } else {
> + err = got_delta_apply_in_mem(base_buf, base_bufsz,
please wrap before 80 columns:
err = got_delta_apply_in_mem(base_buf,
base_bufsz, delta_buf, delta_len,
accum_buf, &accum_size, max_size);
> + delta_buf, delta_len, accum_buf,
> + &accum_size, max_size);
> + }
> n++;
> + if (!cached)
> + free(delta_buf);
> + if (err)
> + goto done;
> + if (fulltext == NULL) {
> + err = got_delta_cache_add_fulltext(
> + pack->delta_cache, delta->data_offset,
> + accum_buf, accum_size);
> + if (err) {
> + if (err->code != GOT_ERR_NO_SPACE)
> + goto done;
> + err = NULL;
> + }
> + }
> } else {
> err = got_delta_apply(base_file, delta_buf,
> delta_len,
> /* Final delta application writes to output file. */
> ++n < deltas->nentries ? accum_file : outfile,
> &accum_size);
> + if (!cached)
> + free(delta_buf);
> + if (err)
> + goto done;
> }
> - if (!cached)
> - free(delta_buf);
> - if (err)
> - goto done;
>
> if (n < deltas->nentries) {
> /* Accumulated delta becomes the new base. */
> @@ -1604,7 +1646,7 @@ got_pack_dump_delta_chain_to_mem(uint8_t **outbuf, siz
> const struct got_error *err = NULL;
> struct got_delta *delta;
> uint8_t *base_buf = NULL, *accum_buf = NULL;
> - size_t base_bufsz = 0, accum_bufsz = 0, accum_size = 0, delta_len;
> + size_t base_bufsz = 0, accum_bufsz = 0, accum_size = 0;
> uint64_t max_size = 0;
> int n = 0;
>
> @@ -1614,9 +1656,28 @@ got_pack_dump_delta_chain_to_mem(uint8_t **outbuf, siz
> if (STAILQ_EMPTY(&deltas->entries))
> return got_error(GOT_ERR_BAD_DELTA_CHAIN);
>
> + if (pack->delta_cache) {
> + uint8_t *delta_buf = NULL, *fulltext = NULL;
> + size_t delta_len, fulltext_len;
> +
> + delta = STAILQ_LAST(&deltas->entries, got_delta, entry);
> + got_delta_cache_get(&delta_buf, &delta_len,
> + &fulltext, &fulltext_len,
> + pack->delta_cache, delta->data_offset);
> + if (fulltext) {
> + *outbuf = malloc(fulltext_len);
> + if (*outbuf == NULL)
> + return got_error_from_errno("malloc");
> + memcpy(*outbuf, fulltext, fulltext_len);
> + *outlen = fulltext_len;
> + return NULL;
> + }
> + }
> +
> /* Deltas are ordered in ascending order. */
> STAILQ_FOREACH(delta, &deltas->entries, entry) {
> - uint8_t *delta_buf = NULL;
> + uint8_t *delta_buf = NULL, *fulltext = NULL;
> + size_t delta_len, fulltext_len = 0;
> uint64_t base_size, result_size = 0;
> int cached = 1;
> if (n == 0) {
> @@ -1637,10 +1698,26 @@ got_pack_dump_delta_chain_to_mem(uint8_t **outbuf, siz
> goto done;
> }
>
> + if (pack->delta_cache) {
> + got_delta_cache_get(&delta_buf, &delta_len,
> + &fulltext, &fulltext_len,
> + pack->delta_cache, delta_data_offset);
> + }
> +
> if (delta->size > max_size)
> max_size = delta->size;
> + if (delta->size > fulltext_len)
> + max_size = fulltext_len;
>
> - if (pack->map) {
> + if (fulltext) {
> + base_buf = malloc(fulltext_len);
> + if (base_buf == NULL) {
> + err = got_error_from_errno("malloc");
> + goto done;
> + }
> + memcpy(base_buf, fulltext, fulltext_len);
> + base_bufsz = fulltext_len;
> + } else if (pack->map) {
> size_t mapoff;
>
> if (delta_data_offset > SIZE_MAX) {
> @@ -1667,11 +1744,30 @@ got_pack_dump_delta_chain_to_mem(uint8_t **outbuf, siz
> if (err)
> goto done;
> n++;
> +
> + if (pack->delta_cache && fulltext == NULL) {
> + err = got_delta_cache_add(pack->delta_cache,
> + delta_data_offset, NULL, 0);
> + if (err) {
> + if (err->code != GOT_ERR_NO_SPACE)
> + goto done;
> + err = NULL;
> + } else {
> + err = got_delta_cache_add_fulltext(
> + pack->delta_cache,
> + delta_data_offset,
> + fulltext, fulltext_len);
> + if (err && err->code != GOT_ERR_NO_SPACE)
> + goto done;
likewise, please wrap before 80:
if (err &&
err->code != GOT_ERR_NO_SPACE)
goto done;
> + err = NULL;
> + }
> + }
> continue;
> }
>
> if (pack->delta_cache) {
> got_delta_cache_get(&delta_buf, &delta_len,
> + &fulltext, &fulltext_len,
> pack->delta_cache, delta->data_offset);
> }
> if (delta_buf == NULL) {
> @@ -1703,6 +1799,8 @@ got_pack_dump_delta_chain_to_mem(uint8_t **outbuf, siz
> max_size = base_size;
> if (result_size > max_size)
> max_size = result_size;
> + if (fulltext_len > max_size)
> + max_size = fulltext_len;
>
> if (max_size > base_bufsz) {
> uint8_t *p = realloc(base_buf, max_size);
> @@ -1728,15 +1826,31 @@ got_pack_dump_delta_chain_to_mem(uint8_t **outbuf, siz
> accum_bufsz = max_size;
> }
>
> - err = got_delta_apply_in_mem(base_buf, base_bufsz,
> - delta_buf, delta_len, accum_buf,
> - &accum_size, max_size);
> + if (fulltext) {
> + memcpy(accum_buf, fulltext, fulltext_len);
> + accum_size = fulltext_len;
> + err = NULL;
> + } else {
> + err = got_delta_apply_in_mem(base_buf, base_bufsz,
> + delta_buf, delta_len, accum_buf,
> + &accum_size, max_size);
> + }
> if (!cached)
> free(delta_buf);
> n++;
> if (err)
> goto done;
>
> + if (fulltext == NULL) {
> + err = got_delta_cache_add_fulltext(pack->delta_cache,
> + delta->data_offset, accum_buf, accum_size);
> + if (err) {
> + if (err->code != GOT_ERR_NO_SPACE)
> + goto done;
> + err = NULL;
> + }
> + }
> +
> if (n < deltas->nentries) {
> /* Accumulated delta becomes the new base. */
> uint8_t *tmp = accum_buf;
>
--
Mark Jamsek <fnc.bsdbox.org|got.bsdbox.org>
GPG: F2FF 13DE 6A06 C471 CA80 E6E2 2930 DC66 86EE CF68