"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
Re: attempt at speeding up the deltification
To:
Omar Polo <op@omarpolo.com>
Cc:
gameoftrees@openbsd.org
Date:
Mon, 26 Feb 2024 12:28:01 +0100

Download raw body.

Thread
On Mon, Feb 26, 2024 at 11:51:26AM +0100, Omar Polo wrote:
> Here's the updated diff for got.
> 
> I did some benchmarking on 'gotadmin pack -a' with hyperfine, on the
> repo that once prompted me to bump the resize step from 64 to 256.  In
> practice there I can some small improvements (around 0.5s), which I'm
> not sure warrants the added code.
> 
> I also did some profiling on a dummy repository with some files off
> /dev/random in there -- 1m, 10m, 20m and 50m -- and the performance gain
> is much noticeable, in the x10 range, but I'm not sure it's a good idea
> to optimize for an edge cases.

I guess the tradeoff is worth it.  ok stsp@

Large binaries are an edge case in Git for most people.
But when we run into one and take minutes for processing that will
always be very annoying. In some cases excising such files from
history is very difficult (e.g. when related systems like issue
trackers or CI depend on the hashes not changing).

Did you already try with large zip/tar.gz files?
 
> diffstat /home/op/w/got
>  M  lib/deltify.c          |  150+  133-
>  M  lib/got_lib_deltify.h  |    8+    0-
> 
> 2 files changed, 158 insertions(+), 133 deletions(-)
> 
> diff /home/op/w/got
> commit - 7a86002db34d49472a7d75c1802ee99c2120ef3c
> path + /home/op/w/got
> blob - 99b5d418b8cec83d130d52be4e5022d0310f8c31
> file + lib/deltify.c
> --- lib/deltify.c
> +++ lib/deltify.c
> @@ -87,6 +87,9 @@ static const uint32_t geartab[256] = {
>      0xf1f6e72c, 0x5551128a, 0x83af87e2, 0x6f0342ba,
>  };
>  
> +static const struct got_error *addblk(struct got_delta_table *, FILE *,
> +    uint8_t *, off_t, off_t, off_t, uint32_t);
> +
>  static uint32_t
>  hashblk(const unsigned char *p, off_t n, uint32_t seed)
>  {
> @@ -94,145 +97,150 @@ hashblk(const unsigned char *p, off_t n, uint32_t seed
>  }
>  
>  static const struct got_error *
> -addblk(struct got_delta_table *dt, FILE *f, off_t file_offset0, off_t len,
> -    off_t offset, uint32_t h)
> +lookup(int *n, struct got_delta_table *dt, FILE *f, off_t file_offset0,
> +    off_t len, off_t offset, uint32_t h)
>  {
> -	const struct got_error *err = NULL;
> -	int i;
> +	struct got_delta_block *block;
>  	uint8_t buf[GOT_DELTIFY_MAXCHUNK];
>  	uint8_t buf2[GOT_DELTIFY_MAXCHUNK];
>  	size_t r = 0;
> +	int i;
>  
> -	if (len == 0)
> -		return NULL;
> +	for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) {
> +		block = &dt->blocks[dt->offs[i] - 1];
> +		if (block->len != len || block->hash != h)
> +			continue;
>  
> -	i = h % dt->nalloc;
> -	while (dt->blocks[i].len != 0) {
> -		/*
> -		 * Avoid adding duplicate blocks.
> -		 * NB: A matching hash is insufficient for detecting equality.
> -		 * The hash can only detect inequality.
> -		 */
> -		if (len == dt->blocks[i].len && h == dt->blocks[i].hash) {
> -			if (r == 0) {
> -				if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1)
> -					return got_error_from_errno("fseeko");
> -				r = fread(buf, 1, len, f);
> -				if (r != len) {
> -					if (!ferror(f))
> -						return NULL;
> -					return got_ferror(f, GOT_ERR_IO);
> -				}
> -			}
> -			if (fseeko(f, file_offset0 + dt->blocks[i].offset,
> -			    SEEK_SET) == -1)
> +		if (r == 0) {
> +			if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1)
>  				return got_error_from_errno("fseeko");
> -			if (fread(buf2, 1, len, f) != len)
> +			r = fread(buf, 1, len, f);
> +			if (r != len) {
> +				if (!ferror(f))
> +					break;		/* why? */
>  				return got_ferror(f, GOT_ERR_IO);
> -			if (memcmp(buf, buf2, len) == 0)
> -				return NULL;
> +			}
>  		}
>  
> -		i = (i + 1) % dt->nalloc;
> +		if (fseeko(f, file_offset0 + block->offset, SEEK_SET) == -1)
> +			return got_error_from_errno("fseeko");
> +		if (fread(buf2, 1, len, f) != len)
> +			return got_ferror(f, GOT_ERR_IO);
> +		if (memcmp(buf, buf2, len) == 0)
> +			break;
>  	}
> -	assert(dt->blocks[i].len == 0);
> -	dt->blocks[i].len = len;
> -	dt->blocks[i].offset = offset;
> -	dt->blocks[i].hash = h;
> -	dt->nblocks++;
> -	if (dt->nalloc < dt->nblocks + 256) {
> -		struct got_delta_block *db;
> -		size_t old_size = dt->nalloc;
> -		db = dt->blocks;
> -		dt->blocks = calloc(dt->nalloc + 256,
> -		    sizeof(struct got_delta_block));
> -		if (dt->blocks == NULL) {
> -			err = got_error_from_errno("calloc");
> -			dt->blocks = db;
> -			return err;
> -		}
> -		dt->nalloc += 256;
> -		/*
> -		 * Recompute all block positions. Hash-based indices of blocks
> -		 * in the array depend on the allocated length of the array.
> -		 */
> -		dt->nblocks = 0;
> -		for (i = 0; i < old_size; i++) {
> -			if (db[i].len == 0)
> -				continue;
> -			err = addblk(dt, f, file_offset0, db[i].len,
> -			    db[i].offset, db[i].hash);
> -			if (err)
> +
> +	*n = i;
> +	return NULL;
> +}
> +
> +static const struct got_error *
> +lookup_mem(int *n, struct got_delta_table *dt, uint8_t *basedata,
> +    off_t basefile_offset0, uint8_t *p, off_t len, uint32_t h)
> +{
> +	struct got_delta_block *block;
> +	uint8_t *d;
> +	int i;
> +
> +	for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) {
> +		block = &dt->blocks[dt->offs[i] - 1];
> +		if (len == block->len && h == block->hash) {
> +			d = basedata + basefile_offset0 + block->offset;
> +			if (memcmp(d, p, len) == 0)
>  				break;
>  		}
> -		free(db);
>  	}
>  
> -	return err;
> +	*n = i;
> +	return NULL;
>  }
>  
>  static const struct got_error *
> -addblk_mem(struct got_delta_table *dt, uint8_t *data, off_t file_offset0,
> +resizeht(struct got_delta_table *dt, FILE *f, off_t offset0, uint8_t *data,
> +    off_t len)
> +{
> +	const struct got_error *err;
> +	struct got_delta_block *b;
> +	size_t newsize, oldsize;
> +	int i, n;
> +
> +	if (dt->nblocks == dt->nalloc) {
> +		newsize = dt->nalloc + 256;
> +		b = reallocarray(dt->blocks, newsize, sizeof(*dt->blocks));
> +		if (b == NULL)
> +			return got_error_from_errno("reallocarray");
> +		dt->blocks = b;
> +		dt->nalloc = newsize;
> +	}
> +
> +	if (dt->blocks == NULL || dt->len * 100 / dt->size > 70) {
> +		oldsize = dt->size;
> +		dt->size *= 2;
> +		free(dt->offs);
> +		dt->offs = calloc(dt->size, sizeof(*dt->offs));
> +		if (dt->offs == NULL) {
> +			dt->size = oldsize;
> +			return got_error_from_errno("calloc");
> +		}
> +
> +		for (i = 0; i < dt->nblocks; ++i) {
> +			b = &dt->blocks[i];
> +			if (f)
> +				err = lookup(&n, dt, f, offset0, b->len,
> +				    b->offset, b->hash);
> +			else
> +				err = lookup_mem(&n, dt, data, offset0,
> +				    data + b->offset, b->len, b->hash);
> +			if (err) {
> +				free(dt->offs);
> +				dt->offs = NULL;
> +				dt->size = oldsize;
> +				return err;
> +			}
> +			assert(dt->offs[n] == 0);
> +			dt->offs[n] = i + 1;
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +static const struct got_error *
> +addblk(struct got_delta_table *dt, FILE *f, uint8_t *data, off_t file_offset0,
>      off_t len, off_t offset, uint32_t h)
>  {
> -	const struct got_error *err = NULL;
> +	const struct got_error *err;
> +	struct got_delta_block *block;
>  	int i;
> -	uint8_t *block1;
> -	uint8_t *block2;
>  
>  	if (len == 0)
>  		return NULL;
>  
> -	i = h % dt->nalloc;
> -	while (dt->blocks[i].len != 0) {
> -		/*
> -		 * Avoid adding duplicate blocks.
> -		 * NB: A matching hash is insufficient for detecting equality.
> -		 * The hash can only detect inequality.
> -		 */
> -		if (len == dt->blocks[i].len && h == dt->blocks[i].hash) {
> -			block1 = data + file_offset0 + dt->blocks[i].offset;
> -			block2 = data + file_offset0 + offset;
> -			if (memcmp(block1, block2, len) == 0)
> -				return NULL;
> -		}
> +	err = resizeht(dt, f, file_offset0, data, len);
> +	if (err)
> +		return err;
>  
> -		i = (i + 1) % dt->nalloc;
> -	}
> -	assert(dt->blocks[i].len == 0);
> -	dt->blocks[i].len = len;
> -	dt->blocks[i].offset = offset;
> -	dt->blocks[i].hash = h;
> +	if (f)
> +		err = lookup(&i, dt, f, file_offset0, len, offset, h);
> +	else
> +		err = lookup_mem(&i, dt, data, file_offset0, data + offset,
> +		    len, h);
> +	if (err)
> +		return err;
> +
> +	if (dt->offs[i] != 0)
> +		return NULL;	/* found */
> +
> +	dt->offs[i] = dt->nblocks + 1;
> +	block = &dt->blocks[dt->nblocks];
> +	block->len = len;
> +	block->offset = offset;
> +	block->hash = h;
> +
>  	dt->nblocks++;
> -	if (dt->nalloc < dt->nblocks + 256) {
> -		struct got_delta_block *db;
> -		size_t old_size = dt->nalloc;
> -		db = dt->blocks;
> -		dt->blocks = calloc(dt->nalloc + 256,
> -		    sizeof(struct got_delta_block));
> -		if (dt->blocks == NULL) {
> -			err = got_error_from_errno("calloc");
> -			dt->blocks = db;
> -			return err;
> -		}
> -		dt->nalloc += 256;
> -		/*
> -		 * Recompute all block positions. Hash-based indices of blocks
> -		 * in the array depend on the allocated length of the array.
> -		 */
> -		dt->nblocks = 0;
> -		for (i = 0; i < old_size; i++) {
> -			if (db[i].len == 0)
> -				continue;
> -			err = addblk_mem(dt, data, file_offset0, db[i].len,
> -			    db[i].offset, db[i].hash);
> -			if (err)
> -				break;
> -		}
> -		free(db);
> -	}
> +	dt->len++;
>  
> -	return err;
> +	return NULL;
>  }
>  
>  static const struct got_error *
> @@ -243,24 +251,25 @@ lookupblk(struct got_delta_block **block, struct got_d
>  	int i;
>  	uint32_t h;
>  	uint8_t buf[GOT_DELTIFY_MAXCHUNK];
> +	struct got_delta_block *b;
>  	size_t r;
>  
>  	*block = NULL;
>  
>  	h = hashblk(p, len, seed);
> -	for (i = h % dt->nalloc; dt->blocks[i].len != 0;
> -	     i = (i + 1) % dt->nalloc) {
> -		if (dt->blocks[i].hash != h ||
> -		    dt->blocks[i].len != len)
> +	
> +	for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) {
> +		b = &dt->blocks[dt->offs[i] - 1];
> +		if (b->hash != h || b->len != len)
>  			continue;
> -		if (fseeko(basefile, basefile_offset0 + dt->blocks[i].offset,
> +		if (fseeko(basefile, basefile_offset0 + b->offset,
>  		    SEEK_SET) == -1)
>  			return got_error_from_errno("fseeko");
>  		r = fread(buf, 1, len, basefile);
>  		if (r != len)
>  			return got_ferror(basefile, GOT_ERR_IO);
>  		if (memcmp(p, buf, len) == 0) {
> -			*block = &dt->blocks[i];
> +			*block = b;
>  			break;
>  		}
>  	}
> @@ -272,24 +281,17 @@ lookupblk_mem(struct got_delta_block **block, struct g
>      unsigned char *p, off_t len, uint32_t seed, uint8_t *basedata,
>      off_t basefile_offset0)
>  {
> -	int i;
> +	const struct got_error *err;
> +	int n;
>  	uint32_t h;
> -	uint8_t *b;
>  
>  	*block = NULL;
> -
>  	h = hashblk(p, len, seed);
> -	for (i = h % dt->nalloc; dt->blocks[i].len != 0;
> -	     i = (i + 1) % dt->nalloc) {
> -		if (dt->blocks[i].hash != h ||
> -		    dt->blocks[i].len != len)
> -			continue;
> -		b = basedata + basefile_offset0 + dt->blocks[i].offset;
> -		if (memcmp(p, b, len) == 0) {
> -			*block = &dt->blocks[i];
> -			break;
> -		}
> -	}
> +	err = lookup_mem(&n, dt, basedata, basefile_offset0, p, len, h);
> +	if (err)
> +		return err;
> +	if (dt->offs[n] != 0)
> +		*block = &dt->blocks[dt->offs[n] - 1];
>  	return NULL;
>  }
>  
> @@ -370,6 +372,13 @@ got_deltify_init(struct got_delta_table **dt, FILE *f,
>  		goto done;
>  	}
>  
> +	(*dt)->size = 128;
> +	(*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs));
> +	if ((*dt)->offs == NULL) {
> +		err = got_error_from_errno("calloc");
> +		goto done;
> +	}
> +
>  	if (fseeko(f, fileoffset, SEEK_SET) == -1)
>  		return got_error_from_errno("fseeko");
>  
> @@ -382,7 +391,7 @@ got_deltify_init(struct got_delta_table **dt, FILE *f,
>  		if (blocklen == 0)
>  			break;
>  		h = hashblk(buf, blocklen, seed);
> -		err = addblk(*dt, f, offset0, blocklen,
> +		err = addblk(*dt, f, NULL, offset0, blocklen,
>  		    fileoffset - offset0, h);
>  		if (err)
>  			goto done;
> @@ -420,6 +429,13 @@ got_deltify_init_mem(struct got_delta_table **dt, uint
>  		goto done;
>  	}
>  
> +	(*dt)->size = 128;
> +	(*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs));
> +	if ((*dt)->offs == NULL) {
> +		err = got_error_from_errno("calloc");
> +		goto done;
> +	}
> +
>  	while (fileoffset < filesize) {
>  		off_t blocklen;
>  		err = nextblk_mem(&blocklen, data, fileoffset, filesize);
> @@ -428,7 +444,7 @@ got_deltify_init_mem(struct got_delta_table **dt, uint
>  		if (blocklen == 0)
>  			break;
>  		h = hashblk(data + fileoffset, blocklen, seed);
> -		err = addblk_mem(*dt, data, offset0, blocklen,
> +		err = addblk(*dt, NULL, data, offset0, blocklen,
>  		    fileoffset - offset0, h);
>  		if (err)
>  			goto done;
> @@ -450,6 +466,7 @@ got_deltify_free(struct got_delta_table *dt)
>  	if (dt == NULL)
>  		return;
>  	free(dt->blocks);
> +	free(dt->offs);
>  	free(dt);
>  }
>  
> blob - 23d51d665edd5927c4b9ca18e2ebebf3473b1795
> file + lib/got_lib_deltify.h
> --- lib/got_lib_deltify.h
> +++ lib/got_lib_deltify.h
> @@ -25,6 +25,14 @@ struct got_delta_table {
>  	struct got_delta_block	*blocks;
>  	int			nblocks;
>  	int			nalloc;
> +
> +	/*
> +	 * Index for blocks.  offs[n] is zero when the slot is free,
> +	 * otherwise it points to blocks[offs[n] - 1].
> +	 */
> +	uint32_t		*offs;
> +	int			 len;
> +	int			 size;
>  };
>  
>  struct got_delta_instruction {
>